You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by rh...@apache.org on 2013/11/18 20:29:27 UTC
svn commit: r1543120 [1/16] - in /hive/trunk:
common/src/java/org/apache/hadoop/hive/conf/ data/files/
ql/src/java/org/apache/hadoop/hive/ql/
ql/src/java/org/apache/hadoop/hive/ql/exec/
ql/src/java/org/apache/hadoop/hive/ql/optimizer/ ql/src/java/org/a...
Author: rhbutani
Date: Mon Nov 18 19:29:24 2013
New Revision: 1543120
URL: http://svn.apache.org/r1543120
Log:
HIVE-5369 Annotate hive operator tree with statistics from metastore (Prasanth Jayachandran via Harish Butani)
Added:
hive/trunk/data/files/alltypes.txt
hive/trunk/data/files/dept.txt
hive/trunk/data/files/emp.txt
hive/trunk/data/files/loc.txt
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q
hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q
hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q
hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q
hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q
hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q
hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q
hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q
hive/trunk/ql/src/test/results/clientpositive/annotate_stats_filter.q.out
hive/trunk/ql/src/test/results/clientpositive/annotate_stats_groupby.q.out
hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out
hive/trunk/ql/src/test/results/clientpositive/annotate_stats_limit.q.out
hive/trunk/ql/src/test/results/clientpositive/annotate_stats_part.q.out
hive/trunk/ql/src/test/results/clientpositive/annotate_stats_select.q.out
hive/trunk/ql/src/test/results/clientpositive/annotate_stats_table.q.out
hive/trunk/ql/src/test/results/clientpositive/annotate_stats_union.q.out
Modified:
hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/AbstractOperatorDesc.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/OperatorDesc.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/util/JavaDataModel.java
hive/trunk/ql/src/test/results/clientpositive/alter_partition_coltype.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_join_reordering_values.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_sortmerge_join_1.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_sortmerge_join_11.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_sortmerge_join_12.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_sortmerge_join_2.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_sortmerge_join_3.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_sortmerge_join_4.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_sortmerge_join_5.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_sortmerge_join_7.q.out
hive/trunk/ql/src/test/results/clientpositive/auto_sortmerge_join_8.q.out
hive/trunk/ql/src/test/results/clientpositive/binary_output_format.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket1.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket2.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket3.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket4.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket5.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket_map_join_1.q.out
hive/trunk/ql/src/test/results/clientpositive/bucket_map_join_2.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketcontext_1.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketcontext_2.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketcontext_3.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketcontext_4.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketcontext_5.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketcontext_6.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketcontext_7.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketcontext_8.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin1.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin10.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin11.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin12.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin13.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin2.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin3.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin4.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin5.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin7.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin8.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin9.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin_negative.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin_negative2.q.out
hive/trunk/ql/src/test/results/clientpositive/bucketmapjoin_negative3.q.out
hive/trunk/ql/src/test/results/clientpositive/combine2_hadoop20.q.out
hive/trunk/ql/src/test/results/clientpositive/ctas_hadoop20.q.out
hive/trunk/ql/src/test/results/clientpositive/disable_merge_for_bucketing.q.out
hive/trunk/ql/src/test/results/clientpositive/dynamic_partition_skip_default.q.out
hive/trunk/ql/src/test/results/clientpositive/filter_join_breaktask.q.out
hive/trunk/ql/src/test/results/clientpositive/groupby_map_ppr.q.out
hive/trunk/ql/src/test/results/clientpositive/groupby_map_ppr_multi_distinct.q.out
hive/trunk/ql/src/test/results/clientpositive/groupby_ppr.q.out
hive/trunk/ql/src/test/results/clientpositive/groupby_ppr_multi_distinct.q.out
hive/trunk/ql/src/test/results/clientpositive/groupby_sort_1.q.out
hive/trunk/ql/src/test/results/clientpositive/groupby_sort_6.q.out
hive/trunk/ql/src/test/results/clientpositive/groupby_sort_skew_1.q.out
hive/trunk/ql/src/test/results/clientpositive/input23.q.out
hive/trunk/ql/src/test/results/clientpositive/input42.q.out
hive/trunk/ql/src/test/results/clientpositive/input_part1.q.out
hive/trunk/ql/src/test/results/clientpositive/input_part2.q.out
hive/trunk/ql/src/test/results/clientpositive/input_part7.q.out
hive/trunk/ql/src/test/results/clientpositive/input_part9.q.out
hive/trunk/ql/src/test/results/clientpositive/join17.q.out
hive/trunk/ql/src/test/results/clientpositive/join26.q.out
hive/trunk/ql/src/test/results/clientpositive/join32.q.out
hive/trunk/ql/src/test/results/clientpositive/join32_lessSize.q.out
hive/trunk/ql/src/test/results/clientpositive/join33.q.out
hive/trunk/ql/src/test/results/clientpositive/join34.q.out
hive/trunk/ql/src/test/results/clientpositive/join35.q.out
hive/trunk/ql/src/test/results/clientpositive/join9.q.out
hive/trunk/ql/src/test/results/clientpositive/join_filters_overlap.q.out
hive/trunk/ql/src/test/results/clientpositive/join_map_ppr.q.out
hive/trunk/ql/src/test/results/clientpositive/list_bucket_dml_10.q.out
hive/trunk/ql/src/test/results/clientpositive/load_dyn_part8.q.out
hive/trunk/ql/src/test/results/clientpositive/louter_join_ppr.q.out
hive/trunk/ql/src/test/results/clientpositive/macro.q.out
hive/trunk/ql/src/test/results/clientpositive/merge3.q.out
hive/trunk/ql/src/test/results/clientpositive/metadataonly1.q.out
hive/trunk/ql/src/test/results/clientpositive/outer_join_ppr.q.out
hive/trunk/ql/src/test/results/clientpositive/pcr.q.out
hive/trunk/ql/src/test/results/clientpositive/ppd_join_filter.q.out
hive/trunk/ql/src/test/results/clientpositive/ppd_union_view.q.out
hive/trunk/ql/src/test/results/clientpositive/ppd_vc.q.out
hive/trunk/ql/src/test/results/clientpositive/ppr_allchildsarenull.q.out
hive/trunk/ql/src/test/results/clientpositive/push_or.q.out
hive/trunk/ql/src/test/results/clientpositive/rand_partitionpruner1.q.out
hive/trunk/ql/src/test/results/clientpositive/rand_partitionpruner2.q.out
hive/trunk/ql/src/test/results/clientpositive/rand_partitionpruner3.q.out
hive/trunk/ql/src/test/results/clientpositive/reduce_deduplicate.q.out
hive/trunk/ql/src/test/results/clientpositive/regexp_extract.q.out
hive/trunk/ql/src/test/results/clientpositive/router_join_ppr.q.out
hive/trunk/ql/src/test/results/clientpositive/sample1.q.out
hive/trunk/ql/src/test/results/clientpositive/sample10.q.out
hive/trunk/ql/src/test/results/clientpositive/sample2.q.out
hive/trunk/ql/src/test/results/clientpositive/sample4.q.out
hive/trunk/ql/src/test/results/clientpositive/sample5.q.out
hive/trunk/ql/src/test/results/clientpositive/sample6.q.out
hive/trunk/ql/src/test/results/clientpositive/sample7.q.out
hive/trunk/ql/src/test/results/clientpositive/sample8.q.out
hive/trunk/ql/src/test/results/clientpositive/sample9.q.out
hive/trunk/ql/src/test/results/clientpositive/serde_user_properties.q.out
hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin9.q.out
hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_11.q.out
hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_12.q.out
hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_13.q.out
hive/trunk/ql/src/test/results/clientpositive/smb_mapjoin_15.q.out
hive/trunk/ql/src/test/results/clientpositive/sort_merge_join_desc_5.q.out
hive/trunk/ql/src/test/results/clientpositive/sort_merge_join_desc_6.q.out
hive/trunk/ql/src/test/results/clientpositive/sort_merge_join_desc_7.q.out
hive/trunk/ql/src/test/results/clientpositive/stats0.q.out
hive/trunk/ql/src/test/results/clientpositive/stats11.q.out
hive/trunk/ql/src/test/results/clientpositive/stats12.q.out
hive/trunk/ql/src/test/results/clientpositive/stats13.q.out
hive/trunk/ql/src/test/results/clientpositive/transform_ppr1.q.out
hive/trunk/ql/src/test/results/clientpositive/transform_ppr2.q.out
hive/trunk/ql/src/test/results/clientpositive/udf_explode.q.out
hive/trunk/ql/src/test/results/clientpositive/udf_java_method.q.out
hive/trunk/ql/src/test/results/clientpositive/udf_reflect.q.out
hive/trunk/ql/src/test/results/clientpositive/udf_reflect2.q.out
hive/trunk/ql/src/test/results/clientpositive/udtf_explode.q.out
hive/trunk/ql/src/test/results/clientpositive/union22.q.out
hive/trunk/ql/src/test/results/clientpositive/union24.q.out
hive/trunk/ql/src/test/results/clientpositive/union_ppr.q.out
Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Mon Nov 18 19:29:24 2013
@@ -635,6 +635,14 @@ public class HiveConf extends Configurat
HIVE_STATS_NDV_ERROR("hive.stats.ndv.error", (float)20.0),
HIVE_STATS_KEY_PREFIX_MAX_LENGTH("hive.stats.key.prefix.max.length", 150),
HIVE_STATS_KEY_PREFIX("hive.stats.key.prefix", ""), // internal usage only
+ // if length of variable length data type cannot be determined this length will be used.
+ HIVE_STATS_MAX_VARIABLE_LENGTH("hive.stats.max.variable.length", 100),
+ // if number of elements in list cannot be determined, this value will be used
+ HIVE_STATS_LIST_NUM_ENTRIES("hive.stats.list.num.entries", 10),
+ // if number of elements in map cannot be determined, this value will be used
+ HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10),
+ // to accurately compute statistics for GROUPBY map side parallelism needs to be known
+ HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1),
// Concurrency
HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false),
Added: hive/trunk/data/files/alltypes.txt
URL: http://svn.apache.org/viewvc/hive/trunk/data/files/alltypes.txt?rev=1543120&view=auto
==============================================================================
--- hive/trunk/data/files/alltypes.txt (added)
+++ hive/trunk/data/files/alltypes.txt Mon Nov 18 19:29:24 2013
@@ -0,0 +1,2 @@
+true|10|100|1000|10000|4.0|20.0|2.2222|1969-12-31 15:59:58.174|1970-01-01 00:00:00|hello|k1:v1,k2:v2|100,200|{10, "foo"}
+true|20|200|2000|20000|8.0|40.0|4.2222|1970-12-31 15:59:58.174|1971-01-01 00:00:00||k3:v3,k4:v4|200,300|{20, "bar"}
Added: hive/trunk/data/files/dept.txt
URL: http://svn.apache.org/viewvc/hive/trunk/data/files/dept.txt?rev=1543120&view=auto
==============================================================================
--- hive/trunk/data/files/dept.txt (added)
+++ hive/trunk/data/files/dept.txt Mon Nov 18 19:29:24 2013
@@ -0,0 +1,4 @@
+31|sales
+33|engineering
+34|clerical
+35|marketing
Added: hive/trunk/data/files/emp.txt
URL: http://svn.apache.org/viewvc/hive/trunk/data/files/emp.txt?rev=1543120&view=auto
==============================================================================
--- hive/trunk/data/files/emp.txt (added)
+++ hive/trunk/data/files/emp.txt Mon Nov 18 19:29:24 2013
@@ -0,0 +1,6 @@
+Rafferty|31
+Jones|33
+Steinberg|33
+Robinson|34
+Smith|34
+John|
Added: hive/trunk/data/files/loc.txt
URL: http://svn.apache.org/viewvc/hive/trunk/data/files/loc.txt?rev=1543120&view=auto
==============================================================================
--- hive/trunk/data/files/loc.txt (added)
+++ hive/trunk/data/files/loc.txt Mon Nov 18 19:29:24 2013
@@ -0,0 +1,8 @@
+OH|31|43201|2001
+IO|32|43202|2001
+CA|35|43809|2001
+FL|33|54342|2001
+UT|35||2001
+CA|35|43809|2001
+|34|40000|
+FL|33|54342|2001
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/ErrorMsg.java Mon Nov 18 19:29:24 2013
@@ -408,6 +408,8 @@ public enum ErrorMsg {
DROP_COMMAND_NOT_ALLOWED_FOR_PARTITION(30011, "Partition protected from being dropped"),
COLUMNSTATSCOLLECTOR_INVALID_COLUMN(30012, "Column statistics are not supported "
+ "for partition columns"),
+
+ STATISTICS_CLONING_FAILED(30013, "Cloning of statistics failed"),
;
private int errorCode;
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java Mon Nov 18 19:29:24 2013
@@ -39,6 +39,7 @@ import org.apache.hadoop.hive.ql.parse.S
import org.apache.hadoop.hive.ql.plan.Explain;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
@@ -1594,4 +1595,17 @@ public abstract class Operator<T extends
}
return false;
}
+
+ public Statistics getStatistics() {
+ if (conf != null) {
+ return conf.getStatistics();
+ }
+ return null;
+ }
+
+ public void setStatistics(Statistics stats) {
+ if (conf != null) {
+ conf.setStatistics(stats);
+ }
+ }
}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java Mon Nov 18 19:29:24 2013
@@ -29,6 +29,7 @@ import org.apache.hadoop.hive.ql.optimiz
import org.apache.hadoop.hive.ql.optimizer.listbucketingpruner.ListBucketingPruner;
import org.apache.hadoop.hive.ql.optimizer.pcr.PartitionConditionRemover;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
+import org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics;
import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcessor;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
@@ -114,6 +115,9 @@ public class Optimizer {
if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTIMIZEMETADATAQUERIES)) {
transformations.add(new StatsOptimizer());
}
+ if (pctx.getContext().getExplain()) {
+ transformations.add(new AnnotateWithStatistics());
+ }
transformations.add(new SimpleFetchOptimizer()); // must be called last
if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEFETCHTASKAGGR)) {
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateStatsProcCtx.java Mon Nov 18 19:29:24 2013
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.plan.Statistics;
+
+public class AnnotateStatsProcCtx implements NodeProcessorCtx {
+
+ private ParseContext pctx;
+ private HiveConf conf;
+ private Statistics andExprStats = null;
+
+ public AnnotateStatsProcCtx(ParseContext pctx) {
+ this.setParseContext(pctx);
+ if(pctx != null) {
+ this.setConf(pctx.getConf());
+ } else {
+ this.setConf(null);
+ }
+ }
+
+ public HiveConf getConf() {
+ return conf;
+ }
+
+ public void setConf(HiveConf conf) {
+ this.conf = conf;
+ }
+
+ public ParseContext getParseContext() {
+ return pctx;
+ }
+
+ public void setParseContext(ParseContext pctx) {
+ this.pctx = pctx;
+ }
+
+ public Statistics getAndExprStats() {
+ return andExprStats;
+ }
+
+ public void setAndExprStats(Statistics andExprStats) {
+ this.andExprStats = andExprStats;
+ }
+
+}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java Mon Nov 18 19:29:24 2013
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
+import org.apache.hadoop.hive.ql.exec.DemuxOperator;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.LimitOperator;
+import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.PreOrderWalker;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.optimizer.Transform;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+public class AnnotateWithStatistics implements Transform {
+
+ @Override
+ public ParseContext transform(ParseContext pctx) throws SemanticException {
+ AnnotateStatsProcCtx aspCtx = new AnnotateStatsProcCtx(pctx);
+
+ // create a walker which walks the tree in a DFS manner while maintaining the
+ // operator stack. The dispatcher generates the plan from the operator tree
+ Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+ opRules.put(new RuleRegExp("TS", TableScanOperator.getOperatorName() + "%"),
+ StatsRulesProcFactory.getTableScanRule());
+ opRules.put(new RuleRegExp("SEL", SelectOperator.getOperatorName() + "%"),
+ StatsRulesProcFactory.getSelectRule());
+ opRules.put(new RuleRegExp("FIL", FilterOperator.getOperatorName() + "%"),
+ StatsRulesProcFactory.getFilterRule());
+ opRules.put(new RuleRegExp("GBY", GroupByOperator.getOperatorName() + "%"),
+ StatsRulesProcFactory.getGroupByRule());
+ opRules.put(new RuleRegExp("JOIN", CommonJoinOperator.getOperatorName() + "%|"
+ + MapJoinOperator.getOperatorName() + "%"), StatsRulesProcFactory.getJoinRule());
+ opRules.put(new RuleRegExp("LIM", LimitOperator.getOperatorName() + "%"),
+ StatsRulesProcFactory.getLimitRule());
+
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp = new DefaultRuleDispatcher(StatsRulesProcFactory.getDefaultRule(), opRules,
+ aspCtx);
+ GraphWalker ogw = new PreOrderWalker(disp);
+
+ // Create a list of topop nodes
+ ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.addAll(pctx.getTopOps().values());
+ ogw.startWalking(topNodes, null);
+
+ return pctx;
+ }
+}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java Mon Nov 18 19:29:24 2013
@@ -0,0 +1,1004 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.ql.ErrorMsg;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
+import org.apache.hadoop.hive.ql.exec.DemuxOperator;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.LimitOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.JoinDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.Statistics;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNot;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
+import org.apache.hadoop.hive.serde.serdeConstants;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+public class StatsRulesProcFactory {
+
+ /**
+ * Collect basic statistics like number of rows, data size and column level
+ * statistics from the table. Also sets the state of the available statistics.
+ * Basic and column statistics can have one of the following states
+ * COMPLETE, PARTIAL, NONE. In case of partitioned table, the basic and column
+ * stats are aggregated together to table level statistics.
+ *
+ */
+ public static class TableScanStatsRule extends DefaultStatsRule implements NodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ TableScanOperator tsop = (TableScanOperator) nd;
+ AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
+ PrunedPartitionList partList = null;
+ try {
+ partList = aspCtx.getParseContext().getPrunedPartitions(tsop.getName(), tsop);
+ } catch (HiveException e1) {
+ throw new SemanticException(e1);
+ }
+ Table table = aspCtx.getParseContext().getTopToTable().get(tsop);
+
+ // gather statistics for the first time and the attach it to table scan operator
+ Statistics stats = StatsUtils.collectStatistics(aspCtx.getConf(), partList, table, tsop);
+ try {
+ tsop.setStatistics(stats.clone());
+ } catch (CloneNotSupportedException e) {
+ throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg());
+ }
+ return null;
+ }
+ }
+
+ /**
+ * SELECT operator doesn't change the number of rows emitted from the parent
+ * operator. It changes the size of each tuple emitted. In a typical case,
+ * where only subset of columns are selected the average row size will
+ * reduce as some of the columns will be pruned. In order to accurately
+ * compute the average row size, column level statistics is required.
+ * Column level statistics stores average size of values in column which
+ * can be used to more reliably estimate the reduction in size of each
+ * tuple. In the absence of column level statistics, size of columns will be
+ * based on data type. For primitive data types size from
+ * {@link org.apache.hadoop.hive.ql.util.JavaDataModel} will be
+ * used and for variable length data types worst case will be assumed.
+ *
+ * <p>
+ * <i>For more information, refer 'Estimating The Cost Of Operations' chapter in
+ * "Database Systems: The Complete Book" by Garcia-Molina et. al.</i>
+ * </p>
+ *
+ */
+ public static class SelectStatsRule extends DefaultStatsRule implements NodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+
+ SelectOperator sop = (SelectOperator) nd;
+ Operator<? extends OperatorDesc> parent = sop.getParentOperators().get(0);
+ Statistics parentStats = parent.getStatistics();
+ AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
+ HiveConf conf = aspCtx.getConf();
+
+ // SELECT (*) does not change the statistics. Just pass on the parent statistics
+ if (sop.getConf().isSelectStar()) {
+ try {
+ if (parentStats != null) {
+ sop.setStatistics(parentStats.clone());
+ }
+ } catch (CloneNotSupportedException e) {
+ throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg());
+ }
+ return null;
+ }
+
+ try {
+ if (satisfyPrecondition(parentStats)) {
+ Statistics stats = parentStats.clone();
+ List<ColStatistics> colStats = StatsUtils.getColStatisticsFromExprMap(conf, parentStats,
+ sop.getColumnExprMap(), sop.getSchema());
+ long dataSize = StatsUtils.getDataSizeFromColumnStats(stats.getNumRows(), colStats);
+ stats.setColumnStats(colStats);
+ stats.setDataSize(dataSize);
+ sop.setStatistics(stats);
+ } else {
+ if (parentStats != null) {
+ sop.setStatistics(parentStats.clone());
+ }
+ }
+ } catch (CloneNotSupportedException e) {
+ throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg());
+ }
+ return null;
+ }
+
+ }
+
+ /**
+ * FILTER operator does not change the average row size but it does change
+ * the number of rows emitted. The reduction in the number of rows emitted
+ * is dependent on the filter expression.
+ *
+ * <ul>
+ * <i>Notations:</i>
+ * <li>T(S) - Number of tuples in relations S</li>
+ * <li>V(S,A) - Number of distinct values of attribute A in relation S</li>
+ * </ul>
+ *
+ * <ul>
+ * <i>Rules:</i> <b>
+ * <li>Column equals a constant</li></b> T(S) = T(R) / V(R,A)
+ * <p>
+ * <b>
+ * <li>Inequality conditions</li></b> T(S) = T(R) / 3
+ * <p>
+ * <b>
+ * <li>Not equals comparison</li></b> - Simple formula T(S) = T(R)
+ * <p>
+ * - Alternate formula T(S) = T(R) (V(R,A) - 1) / V(R,A)
+ * <p>
+ * <b>
+ * <li>NOT condition</li></b> T(S) = 1 - T(S'), where T(S') is the satisfying condition
+ * <p>
+ * <b>
+ * <li>Multiple AND conditions</li></b> Cascadingly apply the rules 1 to 3 (order doesn't matter)
+ * <p>
+ * <b>
+ * <li>Multiple OR conditions</li></b> - Simple formula is to evaluate conditions independently
+ * and sum the results T(S) = m1 + m2
+ * <p>
+ *
+ * - Alternate formula T(S) = T(R) * ( 1 - ( 1 - m1/T(R) ) * ( 1 - m2/T(R) ))
+ * <p>
+ * where, m1 is the number of tuples that satisfy condition1 and m2 is the number of tuples that
+ * satisfy condition2
+ * </ul>
+ * <p>
+ * <i>For more information, refer 'Estimating The Cost Of Operations' chapter in
+ * "Database Systems: The Complete Book" by Garcia-Molina et. al.</i>
+ * </p>
+ *
+ */
+ public static class FilterStatsRule extends DefaultStatsRule implements NodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
+ FilterOperator fop = (FilterOperator) nd;
+ Operator<? extends OperatorDesc> parent = fop.getParentOperators().get(0);
+ Statistics parentStats = parent.getStatistics();
+
+ try {
+ if (satisfyPrecondition(parentStats)) {
+ ExprNodeDesc pred = fop.getConf().getPredicate();
+
+ // evaluate filter expression and update statistics
+ long newNumRows = evaluateExpression(parentStats, pred, aspCtx);
+ Statistics st = parentStats.clone();
+ updateStats(st, newNumRows);
+ fop.setStatistics(st);
+ } else {
+ if (parentStats != null) {
+ fop.setStatistics(parentStats.clone());
+ }
+ }
+
+ aspCtx.setAndExprStats(null);
+ } catch (CloneNotSupportedException e) {
+ throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg());
+ }
+ return null;
+ }
+
+ private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
+ AnnotateStatsProcCtx aspCtx) throws CloneNotSupportedException {
+ long newNumRows = 0;
+ Statistics andStats = null;
+ if (pred instanceof ExprNodeGenericFuncDesc) {
+ ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred;
+ GenericUDF udf = genFunc.getGenericUDF();
+
+ // for AND condition cascadingly update stats
+ if (udf instanceof GenericUDFOPAnd) {
+ andStats = stats.clone();
+ aspCtx.setAndExprStats(andStats);
+
+ // evaluate children
+ for (ExprNodeDesc child : genFunc.getChildren()) {
+ newNumRows = evaluateChildExpr(aspCtx.getAndExprStats(), child, aspCtx);
+ updateStats(aspCtx.getAndExprStats(), newNumRows);
+ }
+ } else {
+
+ // for OR condition independently compute and update stats
+ if (udf instanceof GenericUDFOPOr) {
+ for (ExprNodeDesc child : genFunc.getChildren()) {
+ newNumRows += evaluateChildExpr(stats, child, aspCtx);
+ }
+ } else if (udf instanceof GenericUDFOPNot) {
+ newNumRows = evaluateNotExpr(stats, pred, aspCtx);
+ } else if (udf instanceof GenericUDFOPNotNull) {
+ newNumRows = evaluateColEqualsNullExpr(stats, pred, aspCtx);
+ newNumRows = stats.getNumRows() - newNumRows;
+ } else if (udf instanceof GenericUDFOPNull) {
+ newNumRows = evaluateColEqualsNullExpr(stats, pred, aspCtx);
+ } else {
+
+ // single predicate condition
+ newNumRows = evaluateChildExpr(stats, pred, aspCtx);
+ }
+ }
+ } else if (pred instanceof ExprNodeColumnDesc) {
+
+ // can be boolean column in which case return true count
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc) pred;
+ String colName = encd.getColumn();
+ String tabAlias = encd.getTabAlias();
+ String colType = encd.getTypeString();
+ if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
+ ColStatistics cs = stats.getColumnStatisticsForColumn(tabAlias, colName);
+ return cs.getNumTrues();
+ } else {
+
+ // if not boolean column return half the number of rows
+ return stats.getNumRows() / 2;
+ }
+ }
+
+ return newNumRows;
+ }
+
+ private long evaluateNotExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsProcCtx aspCtx)
+ throws CloneNotSupportedException {
+
+ long numRows = stats.getNumRows();
+
+ // if the evaluate yields true then pass all rows else pass 0 rows
+ if (pred instanceof ExprNodeGenericFuncDesc) {
+ ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred;
+ for (ExprNodeDesc leaf : genFunc.getChildren()) {
+ if (leaf instanceof ExprNodeGenericFuncDesc) {
+
+ // GenericUDF
+ long newNumRows = 0;
+ for (ExprNodeDesc child : ((ExprNodeGenericFuncDesc) pred).getChildren()) {
+ newNumRows = evaluateChildExpr(stats, child, aspCtx);
+ }
+ return numRows - newNumRows;
+ } else if (leaf instanceof ExprNodeConstantDesc) {
+ ExprNodeConstantDesc encd = (ExprNodeConstantDesc) leaf;
+ if (encd.getValue().equals(true)) {
+ return 0;
+ } else {
+ return numRows;
+ }
+ } else if (leaf instanceof ExprNodeColumnDesc) {
+
+ // NOT on boolean columns is possible. in which case return false count.
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc) leaf;
+ String colName = encd.getColumn();
+ String tabAlias = encd.getTabAlias();
+ String colType = encd.getTypeString();
+ if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
+ ColStatistics cs = stats.getColumnStatisticsForColumn(tabAlias, colName);
+ return cs.getNumFalses();
+ } else {
+
+ // if not boolean column return half the number of rows
+ return numRows / 2;
+ }
+ }
+ }
+ }
+
+ // worst case
+ return numRows;
+ }
+
+ private long evaluateColEqualsNullExpr(Statistics stats, ExprNodeDesc pred,
+ AnnotateStatsProcCtx aspCtx) {
+
+ long numRows = stats.getNumRows();
+
+ // evaluate similar to "col = constant" expr
+ if (pred instanceof ExprNodeGenericFuncDesc) {
+
+ ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred;
+ for (ExprNodeDesc leaf : genFunc.getChildren()) {
+
+ if (leaf instanceof ExprNodeColumnDesc) {
+ ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) leaf;
+ String colName = colDesc.getColumn();
+ String tabAlias = colDesc.getTabAlias();
+ ColStatistics cs = stats.getColumnStatisticsForColumn(tabAlias, colName);
+ if (cs != null) {
+ long dvs = cs.getCountDistint();
+ // if NULLs exists, add 1 to distinct count
+ if (cs.getNumNulls() > 0) {
+ dvs += 1;
+ }
+ if (dvs != 0) {
+ return numRows / dvs;
+ } else {
+ return numRows;
+ }
+ }
+ }
+ }
+ }
+
+ // worst case
+ return numRows;
+ }
+
+ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child, AnnotateStatsProcCtx aspCtx)
+ throws CloneNotSupportedException {
+
+ long numRows = stats.getNumRows();
+
+ if (child instanceof ExprNodeGenericFuncDesc) {
+
+ ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) child;
+ GenericUDF udf = genFunc.getGenericUDF();
+
+ if (udf instanceof GenericUDFOPEqual || udf instanceof GenericUDFOPEqualNS) {
+ String colName = null;
+ String tabAlias = null;
+ boolean isConst = false;
+
+ for (ExprNodeDesc leaf : genFunc.getChildren()) {
+ if (leaf instanceof ExprNodeConstantDesc) {
+
+ // if the first argument is const then just set the flag and continue
+ if (colName == null) {
+ isConst = true;
+ continue;
+ }
+ ColStatistics cs = stats.getColumnStatisticsForColumn(tabAlias, colName);
+ if (cs != null) {
+ long dvs = cs.getCountDistint();
+ // if NULLs exists, add 1 to distinct count
+ if (cs.getNumNulls() > 0) {
+ dvs += 1;
+ }
+
+ if (dvs != 0) {
+ return numRows / dvs;
+ } else {
+ return numRows;
+ }
+ }
+ } else if (leaf instanceof ExprNodeColumnDesc) {
+ ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) leaf;
+ colName = colDesc.getColumn();
+ tabAlias = colDesc.getTabAlias();
+
+ // if const is first argument then evaluate the result
+ if (isConst) {
+ ColStatistics cs = stats.getColumnStatisticsForColumn(tabAlias, colName);
+ if (cs != null) {
+ long dvs = cs.getCountDistint();
+ // if NULLs exists, add 1 to distinct count
+ if (cs.getNumNulls() > 0) {
+ dvs += 1;
+ }
+
+ if (dvs != 0) {
+ return numRows / dvs;
+ } else {
+ return numRows;
+ }
+ }
+ }
+ }
+ }
+ } else if (udf instanceof GenericUDFOPNotEqual) {
+ return numRows;
+ } else if (udf instanceof GenericUDFOPEqualOrGreaterThan ||
+ udf instanceof GenericUDFOPEqualOrLessThan ||
+ udf instanceof GenericUDFOPGreaterThan ||
+ udf instanceof GenericUDFOPLessThan) {
+ return numRows / 3;
+ } else {
+ return evaluateExpression(stats, genFunc, aspCtx);
+ }
+ }
+
+ // worst case
+ return numRows;
+ }
+
+ }
+
+ /**
+ * GROUPBY operator changes the number of rows. The number of rows emitted
+ * by GBY operator will be atleast 1 or utmost T(R) (number of rows in relation T)
+ * based on the aggregation. A better estimate can be found if we have column statistics
+ * on the columns that we are grouping on.
+ * <p>
+ * Suppose if we are grouping by attributes A,B,C and if statistics for columns A,B,C are
+ * available then a better estimate can be found by taking the smaller of product of V(R,[A,B,C])
+ * (product of distinct cardinalities of A,B,C) and T(R)/2.
+ * <p>
+ * T(R) = min (T(R)/2 , V(R,[A,B,C]) ---> [1]
+ *
+ * <p>
+ * In the presence of grouping sets, map-side GBY will emit more rows depending on the size of
+ * grouping set (input rows * size of grouping set). These rows will get reduced because of
+ * map-side hash aggregation. Hash aggregation is an optimization in hive to reduce the number of
+ * rows shuffled between map and reduce stage. This optimization will be disabled if the memory
+ * used for hash aggregation exceeds 90% of max available memory for hash aggregation. The number
+ * of rows emitted from map-side will vary if hash aggregation is enabled throughout execution or
+ * disabled. In the presence of grouping sets, following rules will be applied
+ * <p>
+ * If <b>hash-aggregation is enabled</b>, for query SELECT * FROM table GROUP BY (A,B) WITH CUBE
+ * <p>
+ * T(R) = min(T(R)/2, T(R, GBY(A,B)) + T(R, GBY(A)) + T(R, GBY(B)) + 1))
+ * <p>
+ * where, GBY(A,B), GBY(B), GBY(B) are the GBY rules mentioned above [1]
+ *
+ * <p>
+ * If <b>hash-aggregation is disabled</b>, apply the GBY rule [1] and then multiply the result by
+ * number of elements in grouping set T(R) = T(R) * length_of_grouping_set. Since we do not know
+ * if hash-aggregation is enabled or disabled during compile time, we will assume worst-case i.e,
+ * hash-aggregation is disabled
+ *
+ * <p>
+ * NOTE: The number of rows from map-side GBY operator is dependent on map-side parallelism i.e,
+ * number of mappers. The map-side parallelism is expected from hive config
+ * "hive.stats.map.parallelism". If the config is not set then default parallelism of 1 will be
+ * assumed.
+ *
+ * <p>
+ * <i>For more information, refer 'Estimating The Cost Of Operations' chapter in
+ * "Database Systems: The Complete Book" by Garcia-Molina et. al.</i>
+ * </p>
+ *
+ */
+ public static class GroupByStatsRule extends DefaultStatsRule implements NodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ GroupByOperator gop = (GroupByOperator) nd;
+ Operator<? extends OperatorDesc> parent = gop.getParentOperators().get(0);
+ Statistics parentStats = parent.getStatistics();
+ AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
+ HiveConf conf = aspCtx.getConf();
+ int mapSideParallelism = HiveConf.getIntVar(conf,
+ HiveConf.ConfVars.HIVE_STATS_MAP_SIDE_PARALLELISM);
+
+ try {
+ if (satisfyPrecondition(parentStats)) {
+ Statistics stats = parentStats.clone();
+ RowSchema rs = gop.getSchema();
+ List<AggregationDesc> aggDesc = gop.getConf().getAggregators();
+ Map<String, ExprNodeDesc> colExprMap = gop.getColumnExprMap();
+ List<ColStatistics> colStats = StatsUtils.getColStatisticsFromExprMap(conf, parentStats,
+ colExprMap, rs);
+ stats.setColumnStats(colStats);
+ long dvProd = 1;
+ long newNumRows = 0;
+
+ // compute product of distinct values of grouping columns
+ for (ColStatistics cs : colStats) {
+ if (cs != null) {
+ long dv = cs.getCountDistint();
+ if (cs.getNumNulls() > 0) {
+ dv += 1;
+ }
+ dvProd *= dv;
+ }
+ }
+
+ // map side
+ if (gop.getChildOperators().get(0) instanceof ReduceSinkOperator) {
+
+ // since we do not know if hash-aggregation will be enabled or disabled
+ // at runtime we will assume that map-side group by does not do any reduction.
+ // hence no group by rule will be applied
+
+ // map-side grouping set present. if grouping set is present then
+ // multiply the number of rows by number of elements in grouping set
+ if (gop.getConf().isGroupingSetsPresent()) {
+ int multiplier = gop.getConf().getListGroupingSets().size();
+
+ // take into account the map-side parallelism as well, default is 1
+ multiplier *= mapSideParallelism;
+ newNumRows = multiplier * stats.getNumRows();
+ long dataSize = multiplier * stats.getDataSize();
+ stats.setNumRows(newNumRows);
+ stats.setDataSize(dataSize);
+ for (ColStatistics cs : colStats) {
+ if (cs != null) {
+ long oldNumNulls = cs.getNumNulls();
+ long newNumNulls = multiplier * oldNumNulls;
+ cs.setNumNulls(newNumNulls);
+ }
+ }
+ } else {
+
+ // map side no grouping set
+ newNumRows = stats.getNumRows() * mapSideParallelism;
+ updateStats(stats, newNumRows);
+ }
+ } else {
+
+ // reduce side
+ newNumRows = applyGBYRule(stats.getNumRows(), dvProd);
+ updateStats(stats, newNumRows);
+ }
+
+ // if UDAFs are present, new columns needs to be added
+ if (!aggDesc.isEmpty()) {
+ List<ColStatistics> aggColStats = Lists.newArrayList();
+ for (ColumnInfo ci : rs.getSignature()) {
+
+ // if the columns in row schema is not contained in column
+ // expression map, then those are the aggregate columns that
+ // are added GBY operator. we will estimate the column statistics
+ // for those newly added columns
+ if (!colExprMap.containsKey(ci.getInternalName())) {
+ String colName = ci.getInternalName();
+ colName = StatsUtils.stripPrefixFromColumnName(colName);
+ String tabAlias = ci.getTabAlias();
+ String colType = ci.getTypeName();
+ ColStatistics cs = new ColStatistics(tabAlias, colName, colType);
+ cs.setCountDistint(stats.getNumRows());
+ cs.setNumNulls(0);
+ cs.setAvgColLen(StatsUtils.getAvgColLenOfFixedLengthTypes(colType));
+ aggColStats.add(cs);
+ }
+ }
+ stats.addToColumnStats(aggColStats);
+
+ // if UDAF present and if column expression map is empty then it must
+ // be full aggregation query like count(*) in which case number of rows will be 1
+ if (colExprMap.isEmpty()) {
+ stats.setNumRows(1);
+ updateStats(stats, 1);
+ }
+ }
+
+ gop.setStatistics(stats);
+ } else {
+ if (parentStats != null) {
+ gop.setStatistics(parentStats.clone());
+ }
+ }
+ } catch (CloneNotSupportedException e) {
+ throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg());
+ }
+ return null;
+ }
+
+ private long applyGBYRule(long numRows, long dvProd) {
+ long newNumRows = numRows;
+
+ // to avoid divide by 2 to become 0
+ if (numRows > 1) {
+ if (dvProd != 0) {
+ newNumRows = Math.min(numRows / 2, dvProd);
+ } else {
+ newNumRows = numRows / 2;
+ }
+ }
+ return newNumRows;
+ }
+ }
+
+ /**
+ * JOIN operator can yield any of the following three cases <li>The values of join keys are
+ * disjoint in both relations in which case T(RXS) = 0 (we need histograms for this)</li> <li>Join
+ * key is primary key on relation R and foreign key on relation S in which case every tuple in S
+ * will have a tuple in R T(RXS) = T(S) (we need histograms for this)</li> <li>Both R & S relation
+ * have same value for join-key. Ex: bool column with all true values T(RXS) = T(R) * T(S) (we
+ * need histograms for this. counDistinct = 1 and same value)</li>
+ *
+ * <p>
+ * In the absence of histograms, we can use the following general case
+ * <p>
+ * <b>Single attribute</b>
+ * <p>
+ * T(RXS) = (T(R)*T(S))/max(V(R,Y), V(S,Y)) where Y is the join attribute
+ * <p>
+ * <b>Multiple attributes</b>
+ * <p>
+ * T(RXS) = T(R)*T(S)/max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2)), where y1 and y2 are the join
+ * attributes
+ *
+ * <p>
+ * <i>For more information, refer 'Estimating The Cost Of Operations' chapter in
+ * "Database Systems: The Complete Book" by Garcia-Molina et. al.</i>
+ * </p>
+ */
+ public static class JoinStatsRule extends DefaultStatsRule implements NodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ CommonJoinOperator<? extends JoinDesc> jop = (CommonJoinOperator<? extends JoinDesc>) nd;
+ List<Operator<? extends OperatorDesc>> parents = jop.getParentOperators();
+ AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
+ HiveConf conf = aspCtx.getConf();
+ boolean allStatsAvail = true;
+ boolean allSatisfyPreCondition = true;
+
+ for (Operator<? extends OperatorDesc> op : parents) {
+ if (op.getStatistics() == null) {
+ allStatsAvail = false;
+ }
+ }
+
+ if (allStatsAvail) {
+
+ for (Operator<? extends OperatorDesc> op : parents) {
+ if (!satisfyPrecondition(op.getStatistics())) {
+ allSatisfyPreCondition = false;
+ }
+ }
+
+ if (allSatisfyPreCondition) {
+ // statistics object that is combination of statistics from all relations involved in JOIN
+ Statistics stats = new Statistics();
+ long prodRows = 1;
+ List<Long> distinctVals = Lists.newArrayList();
+ boolean multiAttr = false;
+
+
+ Map<String, ColStatistics> joinedColStats = Maps.newHashMap();
+ Map<Integer, List<String>> joinKeys = Maps.newHashMap();
+
+ // get the join keys from parent ReduceSink operators
+ for (int pos = 0; pos < parents.size(); pos++) {
+ ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
+
+ Statistics parentStats = parent.getStatistics();
+ prodRows *= parentStats.getNumRows();
+ List<ExprNodeDesc> keyExprs = parent.getConf().getKeyCols();
+
+ // multi-attribute join key
+ if (keyExprs.size() > 1) {
+ multiAttr = true;
+ }
+
+ // compute fully qualified join key column names. this name will be used to
+ // quickly look-up for column statistics of join key.
+ // TODO: expressions in join condition will be ignored. assign internal name
+ // for expressions and estimate column statistics for expression.
+ List<String> fqCols = StatsUtils.getFullQualifedColNameFromExprs(keyExprs,
+ parent.getColumnExprMap());
+ joinKeys.put(pos, fqCols);
+
+ Map<String, ExprNodeDesc> colExprMap = parent.getColumnExprMap();
+ RowSchema rs = parent.getSchema();
+
+ // get column statistics for all output columns
+ List<ColStatistics> cs = StatsUtils.getColStatisticsFromExprMap(conf, parentStats,
+ colExprMap, rs);
+ for (ColStatistics c : cs) {
+ if (c != null) {
+ joinedColStats.put(c.getFullyQualifiedColName(), c);
+ }
+ }
+
+ // since new statistics is derived from all relations involved in JOIN,
+ // we need to update the state information accordingly
+ stats.updateBasicStatsState(parentStats.getBasicStatsState());
+ stats.updateColumnStatsState(parentStats.getColumnStatsState());
+ }
+
+ // compute denominator i.e, max(V(R,Y), V(S,Y)) in case of single attribute join.
+ // else max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2)) in case of multi-attribute join
+ long denom = 1;
+ if (multiAttr) {
+ List<Long> perAttrDVs = Lists.newArrayList();
+ int numAttr = joinKeys.get(0).size();
+ for (int idx = 0; idx < numAttr; idx++) {
+ for (Integer i : joinKeys.keySet()) {
+ String col = joinKeys.get(i).get(idx);
+ ColStatistics cs = joinedColStats.get(col);
+ if (cs != null) {
+ perAttrDVs.add(cs.getCountDistint());
+ }
+ }
+ distinctVals.add(getDenominator(perAttrDVs));
+ perAttrDVs.clear();
+ }
+
+ for (Long l : distinctVals) {
+ denom *= l;
+ }
+ } else {
+ for (List<String> jkeys : joinKeys.values()) {
+ for (String jk : jkeys) {
+ ColStatistics cs = joinedColStats.get(jk);
+ if (cs != null) {
+ distinctVals.add(cs.getCountDistint());
+ }
+ }
+ }
+ denom = getDenominator(distinctVals);
+ }
+
+ // column statistics from different sources are put together and rename
+ // fully qualified column names based on output schema of join operator
+ Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
+ RowSchema rs = jop.getSchema();
+ List<ColStatistics> outColStats = Lists.newArrayList();
+ for (ColumnInfo ci : rs.getSignature()) {
+ String key = ci.getInternalName();
+ ExprNodeDesc end = colExprMap.get(key);
+ if (end instanceof ExprNodeColumnDesc) {
+ String colName = ((ExprNodeColumnDesc) end).getColumn();
+ colName = StatsUtils.stripPrefixFromColumnName(colName);
+ String tabAlias = ((ExprNodeColumnDesc) end).getTabAlias();
+ String fqColName = StatsUtils.getFullyQualifiedColumnName(tabAlias, colName);
+ ColStatistics cs = joinedColStats.get(fqColName);
+ String outColName = key;
+ String outTabAlias = ci.getTabAlias();
+ outColName = StatsUtils.stripPrefixFromColumnName(outColName);
+ if (cs != null) {
+ cs.setColumnName(outColName);
+ cs.setTableAlias(outTabAlias);
+ }
+ outColStats.add(cs);
+ }
+ }
+
+ // update join statistics
+ stats.setColumnStats(outColStats);
+ long newRowCount = prodRows / denom;
+ stats.setNumRows(newRowCount);
+ stats.setDataSize(StatsUtils.getDataSizeFromColumnStats(newRowCount, outColStats));
+ jop.setStatistics(stats);
+ }
+ }
+ return null;
+ }
+
+ private long getDenominator(List<Long> distinctVals) {
+
+ // simple join from 2 relations
+ // denom = max(v1, v2)
+ if (distinctVals.size() <= 2) {
+ return Collections.max(distinctVals);
+ } else {
+
+ // join from multiple relations
+ // denom = max(v1, v2) * max(v2, v3) * max(v3, v4)
+ long denom = 1;
+ for (int i = 0; i < distinctVals.size() - 1; i++) {
+ long v1 = distinctVals.get(i);
+ long v2 = distinctVals.get(i + 1);
+ if (v1 >= v2) {
+ denom *= v1;
+ } else {
+ denom *= v2;
+ }
+ }
+ return denom;
+ }
+ }
+
+ }
+
+ /**
+ * LIMIT operator changes the number of rows and thereby the data size.
+ *
+ */
+ public static class LimitStatsRule extends DefaultStatsRule implements NodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ LimitOperator lop = (LimitOperator) nd;
+ Operator<? extends OperatorDesc> parent = lop.getParentOperators().get(0);
+ Statistics parentStats = parent.getStatistics();
+
+ try {
+ if (satisfyPrecondition(parentStats)) {
+ Statistics stats = parentStats.clone();
+ long limit = -1;
+ limit = lop.getConf().getLimit();
+ if (limit == -1) {
+ limit = lop.getConf().getLeastRows();
+ }
+
+ // if limit is greate than available rows then do not update statistics
+ if (limit <= parentStats.getNumRows()) {
+ updateStats(stats, limit);
+ }
+ lop.setStatistics(stats);
+ } else {
+ if (parentStats != null) {
+ lop.setStatistics(parentStats.clone());
+ }
+ }
+ } catch (CloneNotSupportedException e) {
+ throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg());
+ }
+ return null;
+ }
+
+ }
+
+ /**
+ * Default rule is to aggregate the statistics from all its parent operators.
+ *
+ */
+ public static class DefaultStatsRule implements NodeProcessor {
+
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) nd;
+ OperatorDesc conf = op.getConf();
+
+ if (conf != null) {
+ Statistics stats = conf.getStatistics();
+ if (stats == null) {
+ if (op.getParentOperators() != null) {
+
+ // if parent statistics is null then that branch of the tree is not walked yet.
+ // don't update the stats until all branches are walked
+ if (isAllParentsContainStatistics(op)) {
+ stats = new Statistics();
+ for (Operator<? extends OperatorDesc> parent : op.getParentOperators()) {
+ if (parent.getStatistics() != null) {
+ Statistics parentStats = parent.getStatistics();
+ stats.addToNumRows(parentStats.getNumRows());
+ stats.addToDataSize(parentStats.getDataSize());
+ stats.updateBasicStatsState(parentStats.getBasicStatsState());
+ stats.updateColumnStatsState(parentStats.getColumnStatsState());
+ stats.addToColumnStats(parentStats.getColumnStats());
+ op.getConf().setStatistics(stats);
+ }
+ }
+ }
+ }
+ }
+ }
+ return null;
+ }
+
+ // check if all parent statistics are available
+ private boolean isAllParentsContainStatistics(Operator<? extends OperatorDesc> op) {
+ for (Operator<? extends OperatorDesc> parent : op.getParentOperators()) {
+ if (parent.getStatistics() == null) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ }
+
  /** @return processor that seeds statistics at TableScan operators */
  public static NodeProcessor getTableScanRule() {
    return new TableScanStatsRule();
  }

  /** @return processor that adjusts row width for SELECT projections */
  public static NodeProcessor getSelectRule() {
    return new SelectStatsRule();
  }

  /** @return processor that applies predicate selectivity at FILTER operators */
  public static NodeProcessor getFilterRule() {
    return new FilterStatsRule();
  }

  /** @return processor that estimates group count at GROUPBY operators */
  public static NodeProcessor getGroupByRule() {
    return new GroupByStatsRule();
  }

  /** @return processor that estimates join cardinality at JOIN/MAPJOIN operators */
  public static NodeProcessor getJoinRule() {
    return new JoinStatsRule();
  }

  /** @return processor that caps row count at LIMIT operators */
  public static NodeProcessor getLimitRule() {
    return new LimitStatsRule();
  }

  /** @return fallback processor that aggregates parent statistics for any other operator */
  public static NodeProcessor getDefaultRule() {
    return new DefaultStatsRule();
  }
+
+ /**
+ * Update the basic statistics of the statistics object based on the row number
+ *
+ * @param stats
+ * - statistics to be updated
+ * @param newNumRows
+ * - new number of rows
+ */
+ static void updateStats(Statistics stats, long newNumRows) {
+ long oldRowCount = stats.getNumRows();
+ double ratio = (double) newNumRows / (double) oldRowCount;
+ stats.setNumRows(newNumRows);
+
+ List<ColStatistics> colStats = stats.getColumnStats();
+ for (ColStatistics cs : colStats) {
+ long oldNumNulls = cs.getNumNulls();
+ long oldDV = cs.getCountDistint();
+ long newNumNulls = Math.round(ratio * oldNumNulls);
+ long newDV = oldDV;
+
+ // if ratio is greater than 1, then number of rows increases. This can happen
+ // when some operators like GROUPBY duplicates the input rows in which case
+ // number of distincts should not change. Update the distinct count only when
+ // the output number of rows is less than input number of rows.
+ if (ratio <= 1.0) {
+ newDV = Math.round(ratio * oldDV);
+ }
+ cs.setNumNulls(newNumNulls);
+ cs.setCountDistint(newDV);
+ }
+ stats.setColumnStats(colStats);
+ long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, colStats);
+ stats.setDataSize(newDataSize);
+ }
+
+ static boolean satisfyPrecondition(Statistics stats) {
+ return stats != null && stats.getBasicStatsState().equals(Statistics.State.COMPLETE)
+ && !stats.getColumnStatsState().equals(Statistics.State.NONE);
+ }
+}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/AbstractOperatorDesc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/AbstractOperatorDesc.java?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/AbstractOperatorDesc.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/AbstractOperatorDesc.java Mon Nov 18 19:29:24 2013
@@ -21,6 +21,18 @@ package org.apache.hadoop.hive.ql.plan;
public class AbstractOperatorDesc implements OperatorDesc {
private boolean vectorMode = false;
+ // estimated output statistics for this operator; transient because it is a
+ // derived, plan-time-only annotation and is not serialized with the plan
+ protected transient Statistics statistics;
+
+ /**
+ * Returns the estimated statistics annotated on this operator descriptor.
+ * Shown in EXPLAIN EXTENDED only (normalExplain = false).
+ */
+ @Override
+ @Explain(displayName = "Statistics", normalExplain = false)
+ public Statistics getStatistics() {
+ return statistics;
+ }
+
+ /**
+ * Annotates this operator descriptor with estimated statistics.
+ */
+ @Override
+ public void setStatistics(Statistics statistics) {
+ this.statistics = statistics;
+ }
@Override
public Object clone() throws CloneNotSupportedException {
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java Mon Nov 18 19:29:24 2013
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.plan;
+
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
+
+
+/**
+ * Per-column statistics used for plan-time estimation: distinct value count,
+ * null count, average column length, and true/false counts (for booleans).
+ * Columns are identified by a fully qualified name of the form
+ * "tableAlias.columnName". Mutable value holder; not thread-safe.
+ */
+public class ColStatistics {
+
+ private String tabAlias;
+ private String colName;
+ private String colType;
+ // cached fully qualified name; kept in sync by setColumnName/setTableAlias
+ private String fqColName;
+ // NOTE(review): "Distint" is a typo, but it is baked into the public accessor
+ // names (getCountDistint/setCountDistint); renaming would break callers.
+ private long countDistint;
+ private long numNulls;
+ private double avgColLen;
+ private long numTrues;
+ private long numFalses;
+
+ public ColStatistics(String tabAlias, String colName, String colType) {
+ this.setTableAlias(tabAlias);
+ this.setColumnName(colName);
+ this.setColumnType(colType);
+ this.setFullyQualifiedColName(StatsUtils.getFullyQualifiedColumnName(tabAlias, colName));
+ }
+
+ public ColStatistics() {
+ this(null, null, null);
+ }
+
+ public String getColumnName() {
+ return colName;
+ }
+
+ public void setColumnName(String colName) {
+ this.colName = colName;
+ // keep the cached fully qualified name consistent with the new column name
+ this.fqColName = StatsUtils.getFullyQualifiedColumnName(tabAlias, colName);
+ }
+
+ public String getColumnType() {
+ return colType;
+ }
+
+ public void setColumnType(String colType) {
+ this.colType = colType;
+ }
+
+ public long getCountDistint() {
+ return countDistint;
+ }
+
+ public void setCountDistint(long countDistint) {
+ this.countDistint = countDistint;
+ }
+
+ public long getNumNulls() {
+ return numNulls;
+ }
+
+ public void setNumNulls(long numNulls) {
+ this.numNulls = numNulls;
+ }
+
+ public double getAvgColLen() {
+ return avgColLen;
+ }
+
+ public void setAvgColLen(double avgColLen) {
+ this.avgColLen = avgColLen;
+ }
+
+ public String getFullyQualifiedColName() {
+ return fqColName;
+ }
+
+ public void setFullyQualifiedColName(String fqColName) {
+ this.fqColName = fqColName;
+ }
+
+ public String getTableAlias() {
+ return tabAlias;
+ }
+
+ public void setTableAlias(String tabName) {
+ this.tabAlias = tabName;
+ // keep the cached fully qualified name consistent with the new alias
+ this.fqColName = StatsUtils.getFullyQualifiedColumnName(tabName, colName);
+ }
+
+ public long getNumTrues() {
+ return numTrues;
+ }
+
+ public void setNumTrues(long numTrues) {
+ this.numTrues = numTrues;
+ }
+
+ public long getNumFalses() {
+ return numFalses;
+ }
+
+ public void setNumFalses(long numFalses) {
+ this.numFalses = numFalses;
+ }
+
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(" fqColName: ");
+ sb.append(fqColName);
+ sb.append(" colName: ");
+ sb.append(colName);
+ sb.append(" colType: ");
+ sb.append(colType);
+ sb.append(" countDistincts: ");
+ sb.append(countDistint);
+ sb.append(" numNulls: ");
+ sb.append(numNulls);
+ sb.append(" avgColLen: ");
+ sb.append(avgColLen);
+ sb.append(" numTrues: ");
+ sb.append(numTrues);
+ sb.append(" numFalses: ");
+ sb.append(numFalses);
+ return sb.toString();
+ }
+
+ /**
+ * Returns a field-by-field copy of this object.
+ * NOTE(review): this does not call super.clone() and the class does not
+ * implement Cloneable, so the declared CloneNotSupportedException is never
+ * actually thrown here; the copy-constructor style is safe only as long as
+ * no subclass adds state.
+ */
+ @Override
+ public ColStatistics clone() throws CloneNotSupportedException {
+ ColStatistics clone = new ColStatistics(tabAlias, colName, colType);
+ clone.setFullyQualifiedColName(fqColName);
+ clone.setAvgColLen(avgColLen);
+ clone.setCountDistint(countDistint);
+ clone.setNumNulls(numNulls);
+ clone.setNumTrues(numTrues);
+ clone.setNumFalses(numFalses);
+ return clone;
+ }
+
+}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/OperatorDesc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/OperatorDesc.java?rev=1543120&r1=1543119&r2=1543120&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/OperatorDesc.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/OperatorDesc.java Mon Nov 18 19:29:24 2013
@@ -22,4 +22,6 @@ import java.io.Serializable;
public interface OperatorDesc extends Serializable, Cloneable {
public Object clone() throws CloneNotSupportedException;
+ /** Returns the estimated output statistics annotated on this operator, or null if none. */
+ public Statistics getStatistics();
+ /** Annotates this operator with estimated output statistics. */
+ public void setStatistics(Statistics statistics);
}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java?rev=1543120&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java Mon Nov 18 19:29:24 2013
@@ -0,0 +1,252 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.plan;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * Statistics. Describes the output of an operator in terms of size, rows, etc
+ * based on estimates. Tracks two independent completeness states: one for the
+ * basic stats (row count / data size) and one for the per-column stats, each
+ * of which is COMPLETE, PARTIAL or NONE. Mutable; not thread-safe.
+ */
+@SuppressWarnings("serial")
+public class Statistics implements Serializable {
+
+ public enum State {
+ COMPLETE, PARTIAL, NONE
+ }
+
+ private long numRows;
+ private long dataSize;
+ private State basicStatsState;
+ // column stats keyed by fully qualified column name ("alias.column");
+ // null until column stats are first added
+ private Map<String, ColStatistics> columnStats;
+ private State columnStatsState;
+
+ public Statistics() {
+ this(0, 0);
+ }
+
+ public Statistics(long nr, long ds) {
+ this.numRows = nr;
+ this.dataSize = ds;
+ // both states start as NONE; updated via updateBasicStatsState /
+ // updateColumnStatsState as stats are collected
+ this.basicStatsState = State.NONE;
+ this.columnStats = null;
+ this.columnStatsState = State.NONE;
+ }
+
+ public long getNumRows() {
+ return numRows;
+ }
+
+ public void setNumRows(long numRows) {
+ this.numRows = numRows;
+ }
+
+ public long getDataSize() {
+ return dataSize;
+ }
+
+ public void setDataSize(long dataSize) {
+ this.dataSize = dataSize;
+ }
+
+ public State getBasicStatsState() {
+ return basicStatsState;
+ }
+
+ public void setBasicStatsState(State basicStatsState) {
+ this.basicStatsState = basicStatsState;
+ }
+
+ public State getColumnStatsState() {
+ return columnStatsState;
+ }
+
+ public void setColumnStatsState(State columnStatsState) {
+ this.columnStatsState = columnStatsState;
+ }
+
+ // empty displayName: the string itself is rendered inline in EXPLAIN output
+ @Override
+ @Explain(displayName = "")
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(" numRows: ");
+ sb.append(numRows);
+ sb.append(" dataSize: ");
+ sb.append(dataSize);
+ sb.append(" basicStatsState: ");
+ sb.append(basicStatsState);
+ sb.append(" colStatsState: ");
+ sb.append(columnStatsState);
+ return sb.toString();
+ }
+
+ /**
+ * Returns a deep copy: scalar fields and states are copied, and each
+ * ColStatistics entry is cloned into a new map.
+ * NOTE(review): does not call super.clone(); see ColStatistics.clone().
+ */
+ @Override
+ public Statistics clone() throws CloneNotSupportedException {
+ Statistics clone = new Statistics(numRows, dataSize);
+ clone.setBasicStatsState(basicStatsState);
+ clone.setColumnStatsState(columnStatsState);
+ if (columnStats != null) {
+ Map<String, ColStatistics> cloneColStats = Maps.newHashMap();
+ for (Map.Entry<String, ColStatistics> entry : columnStats.entrySet()) {
+ cloneColStats.put(entry.getKey(), entry.getValue().clone());
+ }
+ clone.setColumnStats(cloneColStats);
+ }
+ return clone;
+ }
+
+ public void addToNumRows(long nr) {
+ numRows += nr;
+ }
+
+ public void addToDataSize(long rds) {
+ dataSize += rds;
+ }
+
+ /** Replaces the column stats map wholesale (no merging). */
+ public void setColumnStats(Map<String, ColStatistics> colStats) {
+ this.columnStats = colStats;
+ }
+
+ /** Replaces the column stats with the given list, re-keying by fully qualified name. */
+ public void setColumnStats(List<ColStatistics> colStats) {
+ columnStats = Maps.newHashMap();
+ addToColumnStats(colStats);
+ }
+
+ /**
+ * Adds the given column stats, merging with any existing entry for the same
+ * fully qualified column name: avg length and distinct count take the max of
+ * the two, null counts are summed. Null list elements are skipped.
+ */
+ public void addToColumnStats(List<ColStatistics> colStats) {
+
+ if (columnStats == null) {
+ columnStats = Maps.newHashMap();
+ }
+
+ if (colStats != null) {
+ for (ColStatistics cs : colStats) {
+ ColStatistics updatedCS = null;
+ if (cs != null) {
+
+ String key = cs.getFullyQualifiedColName();
+ // if column statistics for a column is already found then merge the statistics
+ if (columnStats.containsKey(key) && columnStats.get(key) != null) {
+ updatedCS = columnStats.get(key);
+ updatedCS.setAvgColLen(Math.max(updatedCS.getAvgColLen(), cs.getAvgColLen()));
+ updatedCS.setNumNulls(updatedCS.getNumNulls() + cs.getNumNulls());
+ updatedCS.setCountDistint(Math.max(updatedCS.getCountDistint(), cs.getCountDistint()));
+ columnStats.put(key, updatedCS);
+ } else {
+ columnStats.put(key, cs);
+ }
+ }
+ }
+ }
+ }
+
+ // Merges an incoming state into the current basic-stats state per this table
+ // (PARTIAL is absorbing; mixing COMPLETE with NONE degrades to PARTIAL):
+ // newState
+ // -----------------------------------------
+ // basicStatsState | COMPLETE PARTIAL NONE |
+ // |________________________________________|
+ // COMPLETE | COMPLETE PARTIAL PARTIAL |
+ // PARTIAL | PARTIAL PARTIAL PARTIAL |
+ // NONE | COMPLETE PARTIAL NONE |
+ // -----------------------------------------
+ public void updateBasicStatsState(State newState) {
+ if (newState.equals(State.PARTIAL)) {
+ basicStatsState = State.PARTIAL;
+ }
+
+ if (newState.equals(State.NONE)) {
+ if (basicStatsState.equals(State.NONE)) {
+ basicStatsState = State.NONE;
+ } else {
+ basicStatsState = State.PARTIAL;
+ }
+ }
+
+ if (newState.equals(State.COMPLETE)) {
+ if (basicStatsState.equals(State.PARTIAL)) {
+ basicStatsState = State.PARTIAL;
+ } else {
+ basicStatsState = State.COMPLETE;
+ }
+ }
+ }
+
+ // similar to the table above for basic stats
+ public void updateColumnStatsState(State newState) {
+ if (newState.equals(State.PARTIAL)) {
+ columnStatsState = State.PARTIAL;
+ }
+
+ if (newState.equals(State.NONE)) {
+ if (columnStatsState.equals(State.NONE)) {
+ columnStatsState = State.NONE;
+ } else {
+ columnStatsState = State.PARTIAL;
+ }
+ }
+
+ if (newState.equals(State.COMPLETE)) {
+ if (columnStatsState.equals(State.PARTIAL)) {
+ columnStatsState = State.PARTIAL;
+ } else {
+ columnStatsState = State.COMPLETE;
+ }
+ }
+ }
+
+ /**
+ * Returns dataSize/numRows when basic stats are COMPLETE and numRows is
+ * non-zero; 0 otherwise (0 doubles as the "unknown" sentinel).
+ */
+ public long getAvgRowSize() {
+ if (basicStatsState.equals(State.COMPLETE) && numRows != 0) {
+ return dataSize / numRows;
+ }
+
+ return 0;
+ }
+
+ // NOTE(review): throws NPE if no column stats were ever added (columnStats
+ // is still null); presumably callers check satisfyPrecondition() first -- confirm.
+ public ColStatistics getColumnStatisticsFromFQColName(String fqColName) {
+ return columnStats.get(fqColName);
+ }
+
+ /** Case-insensitive linear search by bare column name (ignores table alias). */
+ public ColStatistics getColumnStatisticsFromColName(String colName) {
+ for (ColStatistics cs : columnStats.values()) {
+ if (cs.getColumnName().equalsIgnoreCase(colName)) {
+ return cs;
+ }
+ }
+
+ return null;
+ }
+
+ public ColStatistics getColumnStatisticsForColumn(String tabAlias, String colName) {
+ String fqColName = StatsUtils.getFullyQualifiedColumnName(tabAlias, colName);
+ return getColumnStatisticsFromFQColName(fqColName);
+ }
+
+ /**
+ * Returns a fresh list snapshot of the column stats values, or null if none
+ * were added. Mutating the list does not affect this object, but mutating
+ * its ColStatistics elements does (they are shared, not copied).
+ */
+ public List<ColStatistics> getColumnStats() {
+ if (columnStats != null) {
+ return Lists.newArrayList(columnStats.values());
+ }
+ return null;
+ }
+
+}