You are viewing a plain text version of this content; the canonical (HTML) version with working links is available in the mailing-list archive.
Posted to commits@hive.apache.org by ha...@apache.org on 2015/12/02 18:39:38 UTC
[2/2] hive git commit: HIVE-12491 : Improve ndv heuristic for
functions (Ashutosh Chauhan via Prasanth J, Pengcheng Xiong)
HIVE-12491 : Improve ndv heuristic for functions (Ashutosh Chauhan via Prasanth J, Pengcheng Xiong)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/cc4709f6
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/cc4709f6
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/cc4709f6
Branch: refs/heads/master
Commit: cc4709f64cd3b8d9ca97529b6d80a7c366984250
Parents: 232af9f
Author: Ashutosh Chauhan <ha...@apache.org>
Authored: Tue Dec 1 14:39:13 2015 -0800
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Wed Dec 2 09:38:14 2015 -0800
----------------------------------------------------------------------
.../stats/annotation/StatsRulesProcFactory.java | 346 ++++++++-----------
.../apache/hadoop/hive/ql/stats/StatsUtils.java | 52 ++-
.../hadoop/hive/ql/udf/UDFDayOfMonth.java | 2 +
.../org/apache/hadoop/hive/ql/udf/UDFHour.java | 2 +
.../apache/hadoop/hive/ql/udf/UDFMinute.java | 2 +
.../org/apache/hadoop/hive/ql/udf/UDFMonth.java | 4 +-
.../apache/hadoop/hive/ql/udf/UDFSecond.java | 2 +
.../hadoop/hive/ql/udf/UDFWeekOfYear.java | 2 +
.../org/apache/hadoop/hive/ql/udf/UDFYear.java | 2 +
.../ql/udf/generic/GenericUDFAddMonths.java | 1 +
.../ql/udf/generic/GenericUDFArrayContains.java | 1 +
.../hive/ql/udf/generic/GenericUDFBetween.java | 1 +
.../ql/udf/generic/GenericUDFCurrentDate.java | 2 +
.../udf/generic/GenericUDFCurrentTimestamp.java | 1 +
.../ql/udf/generic/GenericUDFCurrentUser.java | 1 +
.../hive/ql/udf/generic/GenericUDFOPAnd.java | 1 +
.../hive/ql/udf/generic/GenericUDFOPEqual.java | 1 +
.../ql/udf/generic/GenericUDFOPEqualNS.java | 1 +
.../generic/GenericUDFOPEqualOrGreaterThan.java | 1 +
.../generic/GenericUDFOPEqualOrLessThan.java | 1 +
.../ql/udf/generic/GenericUDFOPGreaterThan.java | 1 +
.../ql/udf/generic/GenericUDFOPLessThan.java | 1 +
.../hive/ql/udf/generic/GenericUDFOPNot.java | 1 +
.../ql/udf/generic/GenericUDFOPNotEqual.java | 3 +-
.../ql/udf/generic/GenericUDFOPNotNull.java | 1 +
.../hive/ql/udf/generic/GenericUDFOPNull.java | 1 +
.../hive/ql/udf/generic/GenericUDFOPOr.java | 1 +
.../apache/hadoop/hive/ql/udf/generic/NDV.java | 27 ++
.../hive/ql/udf/generic/UDFCurrentDB.java | 1 +
.../clientpositive/cbo_rp_auto_join0.q.out | 16 +-
.../llap/tez_union_multiinsert.q.out | 88 ++---
.../test/results/clientpositive/orc_llap.q.out | 2 +-
.../results/clientpositive/spark/union17.q.out | 16 +-
.../clientpositive/tez/explainuser_1.q.out | 16 +-
.../clientpositive/tez/explainuser_2.q.out | 40 +--
.../tez/tez_union_multiinsert.q.out | 88 ++---
.../test/results/clientpositive/union17.q.out | 14 +-
37 files changed, 405 insertions(+), 338 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index a8ff158..c1e314f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -967,20 +967,6 @@ public class StatsRulesProcFactory {
// worst-case, hash aggregation disabled
return false;
}
-
- private long applyGBYRule(long numRows, long dvProd) {
- long newNumRows = numRows;
-
- // to avoid divide by 2 to become 0
- if (numRows > 1) {
- if (dvProd != 0) {
- newNumRows = Math.min(numRows / 2, dvProd);
- } else {
- newNumRows = numRows / 2;
- }
- }
- return newNumRows;
- }
}
/**
@@ -1032,170 +1018,156 @@ public class StatsRulesProcFactory {
int numAttr = 1;
AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
HiveConf conf = aspCtx.getConf();
- boolean allStatsAvail = true;
boolean allSatisfyPreCondition = true;
for (Operator<? extends OperatorDesc> op : parents) {
if (op.getStatistics() == null) {
- allStatsAvail = false;
+ return null;
}
}
- if (allStatsAvail) {
-
- for (Operator<? extends OperatorDesc> op : parents) {
- if (!satisfyPrecondition(op.getStatistics())) {
- allSatisfyPreCondition = false;
- }
+ for (Operator<? extends OperatorDesc> op : parents) {
+ if (!satisfyPrecondition(op.getStatistics())) {
+ allSatisfyPreCondition = false;
+ break;
}
+ }
- if (allSatisfyPreCondition) {
-
- // statistics object that is combination of statistics from all
- // relations involved in JOIN
- Statistics stats = new Statistics();
- List<Long> distinctVals = Lists.newArrayList();
- int numParent = parents.size();
- Map<Integer, Long> rowCountParents = Maps.newHashMap();
- Map<Integer, Statistics> joinStats = Maps.newHashMap();
- Map<Integer, List<String>> joinKeys = Maps.newHashMap();
- List<Long> rowCounts = Lists.newArrayList();
-
- // detect if there are multiple attributes in join key
- ReduceSinkOperator rsOp = (ReduceSinkOperator) jop.getParentOperators().get(0);
- List<String> keyExprs = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf()
+ if (allSatisfyPreCondition) {
+
+ // statistics object that is combination of statistics from all
+ // relations involved in JOIN
+ Statistics stats = new Statistics();
+ int numParent = parents.size();
+ Map<Integer, Long> rowCountParents = Maps.newHashMap();
+ Map<Integer, Statistics> joinStats = Maps.newHashMap();
+ Map<Integer, List<String>> joinKeys = Maps.newHashMap();
+ List<Long> rowCounts = Lists.newArrayList();
+
+ // detect if there are multiple attributes in join key
+ ReduceSinkOperator rsOp = (ReduceSinkOperator) jop.getParentOperators().get(0);
+ List<String> keyExprs = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf()
+ .getOutputKeyColumnNames());
+ numAttr = keyExprs.size();
+
+ // infer PK-FK relationship in single attribute join case
+ long inferredRowCount = inferPKFKRelationship(numAttr, parents, jop);
+ // get the join keys from parent ReduceSink operators
+ for (int pos = 0; pos < parents.size(); pos++) {
+ ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
+ Statistics parentStats = parent.getStatistics();
+ keyExprs = StatsUtils.getQualifedReducerKeyNames(parent.getConf()
.getOutputKeyColumnNames());
- numAttr = keyExprs.size();
- // infer PK-FK relationship in single attribute join case
- long inferredRowCount = inferPKFKRelationship(numAttr, parents, jop);
- // get the join keys from parent ReduceSink operators
- for (int pos = 0; pos < parents.size(); pos++) {
- ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
- Statistics parentStats = parent.getStatistics();
- keyExprs = StatsUtils.getQualifedReducerKeyNames(parent.getConf()
- .getOutputKeyColumnNames());
-
- rowCountParents.put(pos, parentStats.getNumRows());
- rowCounts.add(parentStats.getNumRows());
+ rowCountParents.put(pos, parentStats.getNumRows());
+ rowCounts.add(parentStats.getNumRows());
- // internal name for expressions and estimate column statistics for expression.
- joinKeys.put(pos, keyExprs);
+ // internal name for expressions and estimate column statistics for expression.
+ joinKeys.put(pos, keyExprs);
- // get column statistics for all output columns
- joinStats.put(pos, parentStats);
+ // get column statistics for all output columns
+ joinStats.put(pos, parentStats);
- // since new statistics is derived from all relations involved in
- // JOIN, we need to update the state information accordingly
- stats.updateColumnStatsState(parentStats.getColumnStatsState());
- }
+ // since new statistics is derived from all relations involved in
+ // JOIN, we need to update the state information accordingly
+ stats.updateColumnStatsState(parentStats.getColumnStatsState());
+ }
- // compute denominator i.e, max(V(R,Y), V(S,Y)) in case of single
- // attribute join, else max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2))
+ List<Long> distinctVals = Lists.newArrayList();
+ long denom = 1;
+ if (inferredRowCount == -1) {
+ // failed to infer PK-FK relationship for row count estimation fall-back on default logic
+ // compute denominator max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2))
// in case of multi-attribute join
- long denom = 1;
- if (numAttr > 1) {
- List<Long> perAttrDVs = Lists.newArrayList();
- for (int idx = 0; idx < numAttr; idx++) {
- for (Integer i : joinKeys.keySet()) {
- String col = joinKeys.get(i).get(idx);
- ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col);
- if (cs != null) {
- perAttrDVs.add(cs.getCountDistint());
- }
+ List<Long> perAttrDVs = Lists.newArrayList();
+ for (int idx = 0; idx < numAttr; idx++) {
+ for (Integer i : joinKeys.keySet()) {
+ String col = joinKeys.get(i).get(idx);
+ ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col);
+ if (cs != null) {
+ perAttrDVs.add(cs.getCountDistint());
}
- distinctVals.add(getDenominator(perAttrDVs));
- perAttrDVs.clear();
}
+ distinctVals.add(getDenominator(perAttrDVs));
+ perAttrDVs.clear();
+ }
- if (numAttr > numParent) {
- // To avoid denominator getting larger and aggressively reducing
- // number of rows, we will ease out denominator.
- denom = getEasedOutDenominator(distinctVals);
- } else {
- for (Long l : distinctVals) {
- denom = StatsUtils.safeMult(denom, l);
- }
- }
+ if (numAttr > numParent) {
+ // To avoid denominator getting larger and aggressively reducing
+ // number of rows, we will ease out denominator.
+ denom = StatsUtils.addWithExpDecay(distinctVals);
} else {
- if (numAttr == 1) {
- for (Integer i : joinKeys.keySet()) {
- String col = joinKeys.get(i).get(0);
- ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col);
- if (cs != null) {
- distinctVals.add(cs.getCountDistint());
- }
- }
+ for (Long l : distinctVals) {
+ denom = StatsUtils.safeMult(denom, l);
}
- denom = getDenominator(distinctVals);
}
+ }
- // Update NDV of joined columns to be min(V(R,y), V(S,y))
- updateJoinColumnsNDV(joinKeys, joinStats, numAttr);
-
- // column statistics from different sources are put together and
- // rename based on output schema of join operator
- Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
- RowSchema rs = jop.getSchema();
- List<ColStatistics> outColStats = Lists.newArrayList();
- for (ColumnInfo ci : rs.getSignature()) {
- String key = ci.getInternalName();
- ExprNodeDesc end = colExprMap.get(key);
- if (end instanceof ExprNodeColumnDesc) {
- String colName = ((ExprNodeColumnDesc) end).getColumn();
- int pos = jop.getConf().getReversedExprs().get(key);
- ColStatistics cs = joinStats.get(pos).getColumnStatisticsFromColName(colName);
- String outColName = key;
- if (cs != null) {
- cs.setColumnName(outColName);
- }
- outColStats.add(cs);
+ // Update NDV of joined columns to be min(V(R,y), V(S,y))
+ updateJoinColumnsNDV(joinKeys, joinStats, numAttr);
+
+ // column statistics from different sources are put together and
+ // rename based on output schema of join operator
+ Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
+ RowSchema rs = jop.getSchema();
+ List<ColStatistics> outColStats = Lists.newArrayList();
+ for (ColumnInfo ci : rs.getSignature()) {
+ String key = ci.getInternalName();
+ ExprNodeDesc end = colExprMap.get(key);
+ if (end instanceof ExprNodeColumnDesc) {
+ String colName = ((ExprNodeColumnDesc) end).getColumn();
+ int pos = jop.getConf().getReversedExprs().get(key);
+ ColStatistics cs = joinStats.get(pos).getColumnStatisticsFromColName(colName);
+ String outColName = key;
+ if (cs != null) {
+ cs.setColumnName(outColName);
}
+ outColStats.add(cs);
}
+ }
- // update join statistics
- stats.setColumnStats(outColStats);
- long newRowCount = inferredRowCount !=-1 ? inferredRowCount : computeNewRowCount(rowCounts, denom);
- updateStatsForJoinType(stats, newRowCount, jop, rowCountParents);
- jop.setStatistics(stats);
+ // update join statistics
+ stats.setColumnStats(outColStats);
+ long newRowCount = inferredRowCount !=-1 ? inferredRowCount : computeNewRowCount(rowCounts, denom);
+ updateStatsForJoinType(stats, newRowCount, jop, rowCountParents);
+ jop.setStatistics(stats);
- if (isDebugEnabled) {
- LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString());
- }
- } else {
+ if (isDebugEnabled) {
+ LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString());
+ }
+ } else {
- // worst case when there are no column statistics
- float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR);
- int numParents = parents.size();
- List<Long> parentRows = Lists.newArrayList();
- List<Long> parentSizes = Lists.newArrayList();
- int maxRowIdx = 0;
- long maxRowCount = 0;
- int idx = 0;
-
- for (Operator<? extends OperatorDesc> op : parents) {
- Statistics ps = op.getStatistics();
- long rowCount = ps.getNumRows();
- if (rowCount > maxRowCount) {
- maxRowCount = rowCount;
- maxRowIdx = idx;
- }
- parentRows.add(rowCount);
- parentSizes.add(ps.getDataSize());
- idx++;
+ // worst case when there are no column statistics
+ float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR);
+ int numParents = parents.size();
+ List<Long> parentRows = Lists.newArrayList();
+ List<Long> parentSizes = Lists.newArrayList();
+ int maxRowIdx = 0;
+ long maxRowCount = 0;
+ int idx = 0;
+
+ for (Operator<? extends OperatorDesc> op : parents) {
+ Statistics ps = op.getStatistics();
+ long rowCount = ps.getNumRows();
+ if (rowCount > maxRowCount) {
+ maxRowCount = rowCount;
+ maxRowIdx = idx;
}
+ parentRows.add(rowCount);
+ parentSizes.add(ps.getDataSize());
+ idx++;
+ }
- long maxDataSize = parentSizes.get(maxRowIdx);
- newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor);
- long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor);
- Statistics wcStats = new Statistics();
- wcStats.setNumRows(newNumRows);
- wcStats.setDataSize(newDataSize);
- jop.setStatistics(wcStats);
+ long maxDataSize = parentSizes.get(maxRowIdx);
+ newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor);
+ long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor);
+ Statistics wcStats = new Statistics();
+ wcStats.setNumRows(newNumRows);
+ wcStats.setDataSize(newDataSize);
+ jop.setStatistics(wcStats);
- if (isDebugEnabled) {
- LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
- }
+ if (isDebugEnabled) {
+ LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
}
}
return null;
@@ -1204,44 +1176,46 @@ public class StatsRulesProcFactory {
private long inferPKFKRelationship(int numAttr, List<Operator<? extends OperatorDesc>> parents,
CommonJoinOperator<? extends JoinDesc> jop) {
long newNumRows = -1;
- if (numAttr == 1) {
- // If numAttr is 1, this means we join on one single key column.
- Map<Integer, ColStatistics> parentsWithPK = getPrimaryKeyCandidates(parents);
-
- // We only allow one single PK.
- if (parentsWithPK.size() != 1) {
- LOG.debug("STATS-" + jop.toString() + ": detects none/multiple PK parents.");
- return newNumRows;
- }
- Integer pkPos = parentsWithPK.keySet().iterator().next();
- ColStatistics csPK = parentsWithPK.values().iterator().next();
+ if (numAttr != 1) {
+ return newNumRows;
+ }
- // infer foreign key candidates positions
- Map<Integer, ColStatistics> csFKs = getForeignKeyCandidates(parents, csPK);
+ // If numAttr is 1, this means we join on one single key column.
+ Map<Integer, ColStatistics> parentsWithPK = getPrimaryKeyCandidates(parents);
- // we allow multiple foreign keys (snowflake schema)
- // csfKs.size() + 1 == parents.size() means we have a single PK and all
- // the rest ops are FKs.
- if (csFKs.size() + 1 == parents.size()) {
- newNumRows = getCardinality(parents, pkPos, csPK, csFKs, jop);
+ // We only allow one single PK.
+ if (parentsWithPK.size() != 1) {
+ LOG.debug("STATS-" + jop.toString() + ": detects none/multiple PK parents.");
+ return newNumRows;
+ }
+ Integer pkPos = parentsWithPK.keySet().iterator().next();
+ ColStatistics csPK = parentsWithPK.values().iterator().next();
- // some debug information
- if (isDebugEnabled) {
- List<String> parentIds = Lists.newArrayList();
+ // infer foreign key candidates positions
+ Map<Integer, ColStatistics> csFKs = getForeignKeyCandidates(parents, csPK);
- // print primary key containing parents
- for (Integer i : parentsWithPK.keySet()) {
- parentIds.add(parents.get(i).toString());
- }
- LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds);
- parentIds.clear();
+ // we allow multiple foreign keys (snowflake schema)
+ // csfKs.size() + 1 == parents.size() means we have a single PK and all
+ // the rest ops are FKs.
+ if (csFKs.size() + 1 == parents.size()) {
+ newNumRows = getCardinality(parents, pkPos, csPK, csFKs, jop);
- // print foreign key containing parents
- for (Integer i : csFKs.keySet()) {
- parentIds.add(parents.get(i).toString());
- }
- LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds);
+ // some debug information
+ if (isDebugEnabled) {
+ List<String> parentIds = Lists.newArrayList();
+
+ // print primary key containing parents
+ for (Integer i : parentsWithPK.keySet()) {
+ parentIds.add(parents.get(i).toString());
}
+ LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds);
+ parentIds.clear();
+
+ // print foreign key containing parents
+ for (Integer i : csFKs.keySet()) {
+ parentIds.add(parents.get(i).toString());
+ }
+ LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds);
}
}
return newNumRows;
@@ -1425,20 +1399,6 @@ public class StatsRulesProcFactory {
return result;
}
- private Long getEasedOutDenominator(List<Long> distinctVals) {
- // Exponential back-off for NDVs.
- // 1) Descending order sort of NDVs
- // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4))) * ....
- Collections.sort(distinctVals, Collections.reverseOrder());
-
- long denom = distinctVals.get(0);
- for (int i = 1; i < distinctVals.size(); i++) {
- denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i)));
- }
-
- return denom;
- }
-
private void updateStatsForJoinType(Statistics stats, long newNumRows,
CommonJoinOperator<? extends JoinDesc> jop,
Map<Integer, Long> rowCountParents) {
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 71ed31c..149cbc1 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -36,6 +36,7 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
@@ -43,6 +44,7 @@ import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
@@ -54,6 +56,9 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.ql.plan.Statistics.State;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
@@ -85,11 +90,13 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestamp
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BytesWritable;
+import org.apache.hive.common.util.AnnotationUtils;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@@ -1247,7 +1254,7 @@ public class StatsUtils {
// null projection
if (encd.getValue() == null) {
colName = encd.getName();
- colType = "null";
+ colType = serdeConstants.VOID_TYPE_NAME;
numNulls = numRows;
} else {
colName = encd.getName();
@@ -1261,14 +1268,14 @@ public class StatsUtils {
ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
colName = engfd.getName();
colType = engfd.getTypeString();
- countDistincts = numRows;
+ countDistincts = getNDVFor(engfd, numRows, parentStats);
oi = engfd.getWritableObjectInspector();
} else if (end instanceof ExprNodeColumnListDesc) {
// column list
ExprNodeColumnListDesc encd = (ExprNodeColumnListDesc) end;
colName = Joiner.on(",").join(encd.getCols());
- colType = "array";
+ colType = serdeConstants.LIST_TYPE_NAME;
countDistincts = numRows;
oi = encd.getWritableObjectInspector();
} else if (end instanceof ExprNodeFieldDesc) {
@@ -1305,6 +1312,45 @@ public class StatsUtils {
return colStats;
}
+
+ public static Long addWithExpDecay (List<Long> distinctVals) {
+ // Exponential back-off for NDVs.
+ // 1) Descending order sort of NDVs
+ // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4))) * ....
+ Collections.sort(distinctVals, Collections.reverseOrder());
+
+ long denom = distinctVals.get(0);
+ for (int i = 1; i < distinctVals.size(); i++) {
+ denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i)));
+ }
+
+ return denom;
+ }
+
+ private static long getNDVFor(ExprNodeGenericFuncDesc engfd, long numRows, Statistics parentStats) {
+
+ GenericUDF udf = engfd.getGenericUDF();
+ if (!FunctionRegistry.isDeterministic(udf)){
+ return numRows;
+ }
+ List<Long> ndvs = Lists.newArrayList();
+ Class<?> udfClass = udf instanceof GenericUDFBridge ? ((GenericUDFBridge) udf).getUdfClass() : udf.getClass();
+ NDV ndv = AnnotationUtils.getAnnotation(udfClass, NDV.class);
+ long udfNDV = Long.MAX_VALUE;
+ if (ndv != null) {
+ udfNDV = ndv.maxNdv();
+ } else {
+ for (String col : engfd.getCols()) {
+ ColStatistics stats = parentStats.getColumnStatisticsFromColName(col);
+ if (stats != null) {
+ ndvs.add(stats.getCountDistint());
+ }
+ }
+ }
+ long countDistincts = ndvs.isEmpty() ? numRows : addWithExpDecay(ndvs);
+ return Collections.min(Lists.newArrayList(countDistincts, udfNDV, numRows));
+ }
+
/**
* Get number of rows of a give table
* @return number of rows
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFDayOfMonth.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFDayOfMonth.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFDayOfMonth.java
index 21e6ff7..79825fc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFDayOfMonth.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFDayOfMonth.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDayOfMonthLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDayOfMonthString;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
@@ -48,6 +49,7 @@ import org.apache.hadoop.io.Text;
+ "Example:\n "
+ " > SELECT _FUNC_('2009-07-30') FROM src LIMIT 1;\n" + " 30")
@VectorizedExpressions({VectorUDFDayOfMonthLong.class, VectorUDFDayOfMonthString.class})
+@NDV(maxNdv = 31)
public class UDFDayOfMonth extends UDF {
private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
private final Calendar calendar = Calendar.getInstance();
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFHour.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFHour.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFHour.java
index 835cecc..87e19ec 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFHour.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFHour.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFHourLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFHourString;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.io.IntWritable;
@@ -48,6 +49,7 @@ import org.apache.hadoop.io.Text;
+ " 12\n"
+ " > SELECT _FUNC_('12:58:59') FROM src LIMIT 1;\n" + " 12")
@VectorizedExpressions({VectorUDFHourLong.class, VectorUDFHourString.class})
+@NDV(maxNdv = 24)
public class UDFHour extends UDF {
private final SimpleDateFormat formatter1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
private final SimpleDateFormat formatter2 = new SimpleDateFormat("HH:mm:ss");
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMinute.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMinute.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMinute.java
index a9f5393..0f55266 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMinute.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMinute.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMinuteLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMinuteString;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.io.IntWritable;
@@ -48,6 +49,7 @@ import org.apache.hadoop.io.Text;
+ " 58\n"
+ " > SELECT _FUNC_('12:58:59') FROM src LIMIT 1;\n" + " 58")
@VectorizedExpressions({VectorUDFMinuteLong.class, VectorUDFMinuteString.class})
+@NDV(maxNdv = 60)
public class UDFMinute extends UDF {
private final SimpleDateFormat formatter1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
private final SimpleDateFormat formatter2 = new SimpleDateFormat("HH:mm:ss");
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java
index 3365804..efe5ee2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMonthLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMonthString;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
@@ -48,11 +49,12 @@ import org.apache.hadoop.io.Text;
+ "Example:\n"
+ " > SELECT _FUNC_('2009-07-30') FROM src LIMIT 1;\n" + " 7")
@VectorizedExpressions({VectorUDFMonthLong.class, VectorUDFMonthString.class})
+@NDV(maxNdv = 31)
public class UDFMonth extends UDF {
private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
private final Calendar calendar = Calendar.getInstance();
- private IntWritable result = new IntWritable();
+ private final IntWritable result = new IntWritable();
public UDFMonth() {
}
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSecond.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSecond.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSecond.java
index e7c3d67..b724970 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSecond.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSecond.java
@@ -29,6 +29,7 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFSecondLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFSecondString;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
@@ -51,6 +52,7 @@ import org.apache.hive.common.util.DateUtils;
+ " 59\n"
+ " > SELECT _FUNC_('12:58:59') FROM src LIMIT 1;\n" + " 59")
@VectorizedExpressions({VectorUDFSecondLong.class, VectorUDFSecondString.class})
+@NDV(maxNdv = 60)
public class UDFSecond extends UDF {
private final SimpleDateFormat formatter1 = new SimpleDateFormat(
"yyyy-MM-dd HH:mm:ss");
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFWeekOfYear.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFWeekOfYear.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFWeekOfYear.java
index f076d1d..42ee1bf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFWeekOfYear.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFWeekOfYear.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFWeekOfYearLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFWeekOfYearString;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.io.IntWritable;
@@ -45,6 +46,7 @@ import org.apache.hadoop.io.Text;
+ " 8\n"
+ " > SELECT _FUNC_('1980-12-31 12:59:59') FROM src LIMIT 1;\n" + " 1")
@VectorizedExpressions({VectorUDFWeekOfYearLong.class, VectorUDFWeekOfYearString.class})
+@NDV(maxNdv = 52)
public class UDFWeekOfYear extends UDF {
private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
private final Calendar calendar = Calendar.getInstance();
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java
index 34b0c47..de46104 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFYearLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFYearString;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
@@ -48,6 +49,7 @@ import org.apache.hadoop.io.Text;
+ "Example:\n "
+ " > SELECT _FUNC_('2009-07-30') FROM src LIMIT 1;\n" + " 2009")
@VectorizedExpressions({VectorUDFYearLong.class, VectorUDFYearString.class})
+@NDV(maxNdv = 20) // although technically it's unbounded, it's unlikely we will ever see ndv > 20
public class UDFYear extends UDF {
private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
private final Calendar calendar = Calendar.getInstance();
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAddMonths.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAddMonths.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAddMonths.java
index 82e5811..dd88473 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAddMonths.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAddMonths.java
@@ -49,6 +49,7 @@ import org.apache.hive.common.util.DateUtils;
+ " 'yyyy-MM-dd'. num_months is a number. The time part of start_date is "
+ "ignored.\n"
+ "Example:\n " + " > SELECT _FUNC_('2009-08-31', 1) FROM src LIMIT 1;\n" + " '2009-09-30'")
+@NDV(maxNdv = 250) // 250 seems to be a reasonable upper limit for this
public class GenericUDFAddMonths extends GenericUDF {
private transient Converter[] converters = new Converter[2];
private transient PrimitiveCategory[] inputTypes = new PrimitiveCategory[2];
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayContains.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayContains.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayContains.java
index 510f367..c031c61 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayContains.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayContains.java
@@ -37,6 +37,7 @@ import org.apache.hadoop.io.BooleanWritable;
extended="Example:\n"
+ " > SELECT _FUNC_(array(1, 2, 3), 2) FROM src LIMIT 1;\n"
+ " true")
+@NDV(maxNdv = 2)
public class GenericUDFArrayContains extends GenericUDF {
private static final int ARRAY_IDX = 0;
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBetween.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBetween.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBetween.java
index 9d9ee57..04f72a6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBetween.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBetween.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
import org.apache.hadoop.io.BooleanWritable;
@Description(name = "between", value = "_FUNC_ a [NOT] BETWEEN b AND c - evaluate if a is [not] in between b and c")
+@NDV(maxNdv = 2)
public class GenericUDFBetween extends GenericUDF {
GenericUDFOPEqualOrGreaterThan egt = new GenericUDFOPEqualOrGreaterThan();
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java
index 67f3c64..1f027a2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java
@@ -18,6 +18,7 @@
package org.apache.hadoop.hive.ql.udf.generic;
import java.sql.Date;
+
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
@@ -34,6 +35,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
@Description(name = "current_date",
value = "_FUNC_() - Returns the current date at the start of query evaluation."
+ " All calls of current_date within the same query return the same value.")
+@NDV(maxNdv = 1)
public class GenericUDFCurrentDate extends GenericUDF {
protected DateWritable currentDate;
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentTimestamp.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentTimestamp.java
index cc7d0d4..2f13a22 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentTimestamp.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentTimestamp.java
@@ -33,6 +33,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
@Description(name = "current_timestamp",
value = "_FUNC_() - Returns the current timestamp at the start of query evaluation."
+ " All calls of current_timestamp within the same query return the same value.")
+@NDV(maxNdv = 1)
public class GenericUDFCurrentTimestamp extends GenericUDF {
protected TimestampWritable currentTimestamp;
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentUser.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentUser.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentUser.java
index 4a1514b..d97583d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentUser.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentUser.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.io.Text;
@UDFType(deterministic = true)
@Description(name = "current_user", value = "_FUNC_() - Returns current user name", extended = "SessionState UserFromAuthenticator")
+@NDV(maxNdv = 1)
public class GenericUDFCurrentUser extends GenericUDF {
protected Text currentUser;
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPAnd.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPAnd.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPAnd.java
index eb33d98..fa0cda8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPAnd.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPAnd.java
@@ -38,6 +38,7 @@ import org.apache.hadoop.io.BooleanWritable;
@Description(name = "and", value = "a1 _FUNC_ a2 _FUNC_ ... _FUNC_ an - Logical and")
@VectorizedExpressions({ColAndCol.class, FilterExprAndExpr.class, FilterColAndScalar.class,
FilterScalarAndColumn.class})
+@NDV(maxNdv = 2)
public class GenericUDFOPAnd extends GenericUDF {
private final BooleanWritable result = new BooleanWritable();
private transient BooleanObjectInspector boi[];
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqual.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqual.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqual.java
index 27537bf..e82627d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqual.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqual.java
@@ -66,6 +66,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
DateColEqualDateScalar.class,FilterDateColEqualDateScalar.class,
DateScalarEqualDateColumn.class,FilterDateScalarEqualDateColumn.class,
})
+@NDV(maxNdv = 2)
public class GenericUDFOPEqual extends GenericUDFBaseCompare {
public GenericUDFOPEqual(){
this.opName = "EQUAL";
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualNS.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualNS.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualNS.java
index d0b35a7..3707a33 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualNS.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualNS.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException;
@Description(name = "<=>", value = "a _FUNC_ b - Returns same result with EQUAL(=) operator " +
"for non-null operands, but returns TRUE if both are NULL, FALSE if one of the them is NULL")
+@NDV(maxNdv = 2)
public class GenericUDFOPEqualNS extends GenericUDFOPEqual {
@Override
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrGreaterThan.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrGreaterThan.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrGreaterThan.java
index 90d98bb..bfd71c7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrGreaterThan.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrGreaterThan.java
@@ -67,6 +67,7 @@ import org.apache.hadoop.io.Text;
DateColGreaterEqualDateScalar.class,FilterDateColGreaterEqualDateScalar.class,
DateScalarGreaterEqualDateColumn.class,FilterDateScalarGreaterEqualDateColumn.class,
})
+@NDV(maxNdv = 2)
public class GenericUDFOPEqualOrGreaterThan extends GenericUDFBaseCompare {
public GenericUDFOPEqualOrGreaterThan(){
this.opName = "EQUAL OR GREATER THAN";
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrLessThan.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrLessThan.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrLessThan.java
index 35133d4..1e69ee6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrLessThan.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrLessThan.java
@@ -67,6 +67,7 @@ import org.apache.hadoop.io.Text;
DateColLessEqualDateScalar.class,FilterDateColLessEqualDateScalar.class,
DateScalarLessEqualDateColumn.class,FilterDateScalarLessEqualDateColumn.class,
})
+@NDV(maxNdv = 2)
public class GenericUDFOPEqualOrLessThan extends GenericUDFBaseCompare {
public GenericUDFOPEqualOrLessThan(){
this.opName = "EQUAL OR LESS THAN";
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPGreaterThan.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPGreaterThan.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPGreaterThan.java
index be05b4e..bba4d97 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPGreaterThan.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPGreaterThan.java
@@ -67,6 +67,7 @@ import org.apache.hadoop.io.Text;
DateColGreaterDateScalar.class,FilterDateColGreaterDateScalar.class,
DateScalarGreaterDateColumn.class,FilterDateScalarGreaterDateColumn.class,
})
+@NDV(maxNdv = 2)
public class GenericUDFOPGreaterThan extends GenericUDFBaseCompare {
public GenericUDFOPGreaterThan(){
this.opName = "GREATER THAN";
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPLessThan.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPLessThan.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPLessThan.java
index 9d72f9e..b992fe6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPLessThan.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPLessThan.java
@@ -67,6 +67,7 @@ import org.apache.hadoop.io.Text;
DateColLessDateScalar.class,FilterDateColLessDateScalar.class,
DateScalarLessDateColumn.class,FilterDateScalarLessDateColumn.class,
})
+@NDV(maxNdv = 2)
public class GenericUDFOPLessThan extends GenericUDFBaseCompare {
public GenericUDFOPLessThan(){
this.opName = "LESS THAN";
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNot.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNot.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNot.java
index ea8fa71..2d1b013 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNot.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNot.java
@@ -36,6 +36,7 @@ import org.apache.hadoop.io.BooleanWritable;
*/
@Description(name = "not", value = "_FUNC_ a - Logical not")
@VectorizedExpressions({NotCol.class, SelectColumnIsFalse.class})
+@NDV(maxNdv = 2)
public class GenericUDFOPNot extends GenericUDF {
private final BooleanWritable result = new BooleanWritable();
private transient BooleanObjectInspector boi;
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotEqual.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotEqual.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotEqual.java
index 7023225..ad47681 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotEqual.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotEqual.java
@@ -41,7 +41,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
StringGroupColNotEqualStringScalar.class,
StringGroupColNotEqualVarCharScalar.class, StringGroupColNotEqualCharScalar.class,
StringScalarNotEqualStringGroupColumn.class,
- VarCharScalarNotEqualStringGroupColumn.class, CharScalarNotEqualStringGroupColumn.class,
+ VarCharScalarNotEqualStringGroupColumn.class, CharScalarNotEqualStringGroupColumn.class,
FilterStringGroupColNotEqualStringScalar.class, FilterStringScalarNotEqualStringGroupColumn.class,
FilterStringGroupColNotEqualVarCharScalar.class, FilterVarCharScalarNotEqualStringGroupColumn.class,
FilterStringGroupColNotEqualCharScalar.class, FilterCharScalarNotEqualStringGroupColumn.class,
@@ -66,6 +66,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
DateColNotEqualDateScalar.class,FilterDateColNotEqualDateScalar.class,
DateScalarNotEqualDateColumn.class,FilterDateScalarNotEqualDateColumn.class,
})
+@NDV(maxNdv = 2)
public class GenericUDFOPNotEqual extends GenericUDFBaseCompare {
public GenericUDFOPNotEqual(){
this.opName = "NOT EQUAL";
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotNull.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotNull.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotNull.java
index 2b67c38..e208d59 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotNull.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotNull.java
@@ -36,6 +36,7 @@ import org.apache.hadoop.io.BooleanWritable;
@Description(name = "isnotnull",
value = "_FUNC_ a - Returns true if a is not NULL and false otherwise")
@VectorizedExpressions({IsNotNull.class, SelectColumnIsNotNull.class})
+@NDV(maxNdv = 2)
public class GenericUDFOPNotNull extends GenericUDF {
private final BooleanWritable result = new BooleanWritable();
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNull.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNull.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNull.java
index 4eb92eb..8c4b478 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNull.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNull.java
@@ -35,6 +35,7 @@ import org.apache.hadoop.io.BooleanWritable;
*/
@Description(name = "isnull", value = "_FUNC_ a - Returns true if a is NULL and false otherwise")
@VectorizedExpressions({IsNull.class, SelectColumnIsNull.class})
+@NDV(maxNdv = 2)
public class GenericUDFOPNull extends GenericUDF {
private final BooleanWritable result = new BooleanWritable();
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPOr.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPOr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPOr.java
index 8de59c1..af38c97 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPOr.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPOr.java
@@ -38,6 +38,7 @@ import org.apache.hadoop.io.BooleanWritable;
@Description(name = "or", value = "a1 _FUNC_ a2 _FUNC_ ... _FUNC_ an - Logical or")
@VectorizedExpressions({ColOrCol.class, FilterExprOrExpr.class, FilterColOrScalar.class,
FilterScalarOrColumn.class})
+@NDV(maxNdv = 2)
public class GenericUDFOPOr extends GenericUDF {
private final BooleanWritable result = new BooleanWritable();
private transient BooleanObjectInspector[] boi;
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java
new file mode 100644
index 0000000..307135b
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+
+@Retention(RetentionPolicy.RUNTIME)
+public @interface NDV {
+
+ long maxNdv();
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/UDFCurrentDB.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/UDFCurrentDB.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/UDFCurrentDB.java
index 5f484cf..a5bab4f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/UDFCurrentDB.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/UDFCurrentDB.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.io.Text;
// deterministic in the query range
@Description(name = "current_database",
value = "_FUNC_() - returns currently using database name")
+@NDV(maxNdv = 1)
public class UDFCurrentDB extends GenericUDF {
private MapredContext context;
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out b/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
index 7822ad9..878175f 100644
--- a/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
+++ b/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
@@ -71,22 +71,22 @@ STAGE PLANS:
0
1
outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
sort order: ++++
- Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
Local Work:
Map Reduce Local Work
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: sum(hash(_col0,_col1,_col2,_col3))
mode: hash
@@ -204,22 +204,22 @@ STAGE PLANS:
0
1
outputColumnNames: _col0, _col1, _col5, _col6
- Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
sort order: ++++
- Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
Local Work:
Map Reduce Local Work
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: sum(hash(_col0,_col1,_col2,_col3))
mode: hash
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/test/results/clientpositive/llap/tez_union_multiinsert.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/tez_union_multiinsert.q.out b/ql/src/test/results/clientpositive/llap/tez_union_multiinsert.q.out
index c319e88..784a0c2 100644
--- a/ql/src/test/results/clientpositive/llap/tez_union_multiinsert.q.out
+++ b/ql/src/test/results/clientpositive/llap/tez_union_multiinsert.q.out
@@ -98,23 +98,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Execution mode: llap
Map 7
Map Operator Tree:
@@ -130,23 +130,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Execution mode: llap
Reducer 2
Execution mode: uber
@@ -169,23 +169,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reducer 4
Execution mode: uber
Reduce Operator Tree:
@@ -980,23 +980,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Execution mode: llap
Map 5
Map Operator Tree:
@@ -1033,23 +1033,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Execution mode: llap
Reducer 3
Execution mode: uber
@@ -1106,23 +1106,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Union 2
Vertex: Union 2
@@ -1879,23 +1879,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Execution mode: llap
Map 5
Map Operator Tree:
@@ -1928,23 +1928,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Execution mode: llap
Reducer 3
Execution mode: uber
@@ -1997,23 +1997,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 500 Data size: 140000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 500 Data size: 232000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Union 2
Vertex: Union 2
@@ -2778,23 +2778,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 250 Data size: 70000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 250 Data size: 70000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 250 Data size: 116000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 250 Data size: 116000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Execution mode: llap
Reducer 2
Execution mode: uber
@@ -2813,23 +2813,23 @@ STAGE PLANS:
keys: _col0 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 250 Data size: 70000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string)
sort order: ++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 250 Data size: 70000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 280 Basic stats: COMPLETE Column stats: PARTIAL
Group By Operator
aggregations: count(DISTINCT substr(_col1, 5))
keys: _col0 (type: string), _col1 (type: string), substr(_col1, 5) (type: string)
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 250 Data size: 116000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string)
sort order: +++
Map-reduce partition columns: _col0 (type: string), _col1 (type: string)
- Statistics: Num rows: 250 Data size: 116000 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 1 Data size: 464 Basic stats: COMPLETE Column stats: PARTIAL
Reducer 4
Execution mode: uber
Reduce Operator Tree:
http://git-wip-us.apache.org/repos/asf/hive/blob/cc4709f6/ql/src/test/results/clientpositive/orc_llap.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/orc_llap.q.out b/ql/src/test/results/clientpositive/orc_llap.q.out
index d0b07a0..c9bb3c8 100644
--- a/ql/src/test/results/clientpositive/orc_llap.q.out
+++ b/ql/src/test/results/clientpositive/orc_llap.q.out
@@ -168,7 +168,7 @@ STAGE PLANS:
keys:
0
1
- Statistics: Num rows: 112 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+ Statistics: Num rows: 225 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
Group By Operator
aggregations: count(1)
mode: hash