You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by kg...@apache.org on 2018/05/29 10:47:58 UTC
hive git commit: HIVE-19460: Improve stats estimations for NOT IN operator (Zoltan Haindrich reviewed by Ashutosh Chauhan)
Repository: hive
Updated Branches:
refs/heads/master 9d23f7185 -> 99ed2bcbc
HIVE-19460: Improve stats estimations for NOT IN operator (Zoltan Haindrich reviewed by Ashutosh Chauhan)
Signed-off-by: Zoltan Haindrich <ki...@rxd.hu>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/99ed2bcb
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/99ed2bcb
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/99ed2bcb
Branch: refs/heads/master
Commit: 99ed2bcbcb408cbcd81e77a1ca76c50a3bd43260
Parents: 9d23f71
Author: Zoltan Haindrich <ki...@rxd.hu>
Authored: Tue May 29 12:47:25 2018 +0200
Committer: Zoltan Haindrich <ki...@rxd.hu>
Committed: Tue May 29 12:47:25 2018 +0200
----------------------------------------------------------------------
.../org/apache/hadoop/hive/conf/HiveConf.java | 2 +
.../stats/annotation/StatsRulesProcFactory.java | 158 ++++++++++++++++++-
.../hadoop/hive/ql/plan/ColStatistics.java | 1 -
.../ql/plan/mapping/TestStatEstimations.java | 113 +++++++++++++
.../clientpositive/llap/acid_no_buckets.q.out | 20 +--
.../clientpositive/llap/explainuser_2.q.out | 26 +--
.../clientpositive/llap/vector_between_in.q.out | 14 +-
.../clientpositive/llap/vector_struct_in.q.out | 6 +-
.../clientpositive/llap/vectorization_0.q.out | 16 +-
9 files changed, 312 insertions(+), 44 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/99ed2bcb/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index f48d004..7942608 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2374,6 +2374,8 @@ public class HiveConf extends Configuration {
"in the number of rows filtered by a certain operator, which in turn might lead to overprovision or\n" +
"underprovision of resources. This factor is applied to the cardinality estimation of IN clauses in\n" +
"filter operators."),
+ HIVE_STATS_IN_MIN_RATIO("hive.stats.filter.in.min.ratio", (float) 0.05,
+ "Output estimation of an IN filter can't be lower than this ratio"),
// Concurrency
HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false,
"Whether Hive supports concurrency control or not. \n" +
http://git-wip-us.apache.org/repos/asf/hive/blob/99ed2bcb/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 91cccfb..d0be33b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -19,8 +19,8 @@
package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
import java.lang.reflect.Field;
-import java.util.Arrays;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@@ -30,7 +30,6 @@ import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.Stack;
-
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.Context;
@@ -60,6 +59,7 @@ import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
@@ -494,6 +494,19 @@ public class StatsRulesProcFactory {
}
}
+ boolean allColsFilteredByStats = true;
+ for (int i = 0; i < columnStats.size(); i++) {
+ ValuePruner vp = new ValuePruner(columnStats.get(i));
+ allColsFilteredByStats &= vp.isValid();
+ Set<ExprNodeDescEqualityWrapper> newValues = Sets.newHashSet();
+ for (ExprNodeDescEqualityWrapper v : values.get(i)) {
+ if (vp.accept(v)) {
+ newValues.add(v);
+ }
+ }
+ values.set(i, newValues);
+ }
+
// 3. Calculate IN selectivity
double factor = 1d;
for (int i = 0; i < columnStats.size(); i++) {
@@ -503,10 +516,151 @@ public class StatsRulesProcFactory {
// max can be 1, even when ndv is larger in IN clause than in column stats
factor *= columnFactor > 1d ? 1d : columnFactor;
}
+ if (!allColsFilteredByStats) {
+ factor = Double.max(factor, HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_MIN_RATIO));
+ }
float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR);
return Math.round( numRows * factor * inFactor);
}
+ static class RangeOps {
+
+ private String colType;
+ private Range range;
+
+ public RangeOps(String colType, Range range) {
+ this.colType = colType;
+ this.range = range;
+ }
+
+ public static RangeOps build(String colType, Range range) {
+ if (range == null || range.minValue == null || range.maxValue == null) {
+ return null;
+ }
+ return new RangeOps(colType, range);
+ }
+
+ enum RangeResult {
+ BELOW, AT_MIN, BETWEEN, AT_MAX, ABOVE;
+
+ public static RangeResult of(boolean ltMin, boolean ltMax, boolean eqMin, boolean eqMax) {
+ if (ltMin) {
+ return RangeResult.BELOW;
+ }
+ if (eqMin) {
+ return RangeResult.AT_MIN;
+ }
+ if (ltMax) {
+ return RangeResult.BETWEEN;
+ }
+ if (eqMax) {
+ return AT_MAX;
+ }
+ return ABOVE;
+ }
+ }
+
+ public boolean contains(ExprNodeDesc exprNode) {
+ RangeResult intersection = intersect(exprNode);
+ return intersection != RangeResult.ABOVE && intersection != RangeResult.BELOW;
+ }
+
+ public RangeResult intersect(ExprNodeDesc exprNode) {
+ if (!(exprNode instanceof ExprNodeConstantDesc)) {
+ return null;
+ }
+ try {
+
+ ExprNodeConstantDesc constantDesc = (ExprNodeConstantDesc) exprNode;
+
+ String stringVal = constantDesc.getValue().toString();
+
+ @Deprecated
+ String boundValue = stringVal;
+ switch (colType) {
+ case serdeConstants.TINYINT_TYPE_NAME: {
+ byte value = new Byte(stringVal);
+ byte maxValue = range.maxValue.byteValue();
+ byte minValue = range.minValue.byteValue();
+ return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue);
+ }
+ case serdeConstants.SMALLINT_TYPE_NAME: {
+ short value = new Short(boundValue);
+ short maxValue = range.maxValue.shortValue();
+ short minValue = range.minValue.shortValue();
+ return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue);
+ }
+ case serdeConstants.DATE_TYPE_NAME: {
+ DateWritable dateWriteable = new DateWritable(java.sql.Date.valueOf(boundValue));
+ int value = dateWriteable.getDays();
+ int maxValue = range.maxValue.intValue();
+ int minValue = range.minValue.intValue();
+ return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue);
+ }
+ case serdeConstants.INT_TYPE_NAME: {
+ int value = new Integer(boundValue);
+ int maxValue = range.maxValue.intValue();
+ int minValue = range.minValue.intValue();
+ return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue);
+ }
+ case serdeConstants.BIGINT_TYPE_NAME: {
+ long value = new Long(boundValue);
+ long maxValue = range.maxValue.longValue();
+ long minValue = range.minValue.longValue();
+ return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue);
+ }
+ case serdeConstants.FLOAT_TYPE_NAME: {
+ float value = new Float(boundValue);
+ float maxValue = range.maxValue.floatValue();
+ float minValue = range.minValue.floatValue();
+ return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue);
+ }
+ case serdeConstants.DOUBLE_TYPE_NAME: {
+ double value = new Double(boundValue);
+ double maxValue = range.maxValue.doubleValue();
+ double minValue = range.minValue.doubleValue();
+ return RangeResult.of(value < minValue, value < maxValue, value == minValue, value == maxValue);
+ }
+ default:
+ return null;
+ }
+ } catch (Exception e) {
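+ // NumberFormatException (the literal could not be parsed as the column type, e.g. value out of range)
+ // or any other unexpected failure: the comparison result is unknown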
+ return null;
+ }
+ }
+
+ }
+
+ private static class ValuePruner {
+
+ private boolean valid;
+ private RangeOps colRange;
+
+ ValuePruner(ColStatistics colStatistics) {
+ if (colStatistics == null) {
+ valid = false;
+ return;
+ }
+ colRange = RangeOps.build(colStatistics.getColumnType(), colStatistics.getRange());
+ if (colRange == null) {
+ valid = false;
+ return;
+ }
+ valid = true;
+ }
+
+ public boolean isValid() {
+ return valid;
+ }
+
+ public boolean accept(ExprNodeDescEqualityWrapper e) {
+ /** keeps the value unless the column's min/max range shows that it lies outside of that range */
+ return !valid || colRange.contains(e.getExprNodeDesc());
+ }
+ }
+
private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, long currNumRows, AnnotateStatsProcCtx aspCtx,
List<String> neededCols, Operator<?> op) throws SemanticException {
final ExprNodeGenericFuncDesc fd = (ExprNodeGenericFuncDesc) pred;
http://git-wip-us.apache.org/repos/asf/hive/blob/99ed2bcb/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
index 106e59f..a31f965 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java
@@ -196,5 +196,4 @@ public class ColStatistics {
return sb.toString();
}
}
-
}
http://git-wip-us.apache.org/repos/asf/hive/blob/99ed2bcb/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java b/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java
new file mode 100644
index 0000000..e5233ce
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/plan/mapping/TestStatEstimations.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.plan.mapping;
+
+import static org.junit.Assert.assertEquals;
+import java.util.List;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
+import org.apache.hadoop.hive.ql.DriverFactory;
+import org.apache.hadoop.hive.ql.IDriver;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.parse.ParseException;
+import org.apache.hadoop.hive.ql.plan.mapper.PlanMapper;
+import org.apache.hadoop.hive.ql.session.SessionState;
+import org.apache.hive.testutils.HiveTestEnvSetup;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestRule;
+
+public class TestStatEstimations {
+
+ @ClassRule
+ public static HiveTestEnvSetup env_setup = new HiveTestEnvSetup();
+
+ @Rule
+ public TestRule methodRule = env_setup.getMethodRule();
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ IDriver driver = createDriver();
+ dropTables(driver);
+ String cmds[] = {
+ // @formatter:off
+ "create table t2(a integer, b string) STORED AS ORC",
+ "insert into t2 values(1, 'AAA'),(2, 'AAA'),(3, 'AAA'),(4, 'AAA'),(5, 'AAA')," +
+ "(6, 'BBB'),(7, 'BBB'),(8, 'BBB'),(9, 'BBB'),(10, 'BBB')",
+ "analyze table t2 compute statistics for columns"
+ // @formatter:on
+ };
+ for (String cmd : cmds) {
+ int ret = driver.run(cmd).getResponseCode();
+ assertEquals("Checking command success", 0, ret);
+ }
+ }
+
+ @AfterClass
+ public static void afterClass() throws Exception {
+ IDriver driver = createDriver();
+ dropTables(driver);
+ }
+
+ public static void dropTables(IDriver driver) throws Exception {
+ String tables[] = {"t2" };
+ for (String t : tables) {
+ int ret = driver.run("drop table if exists " + t).getResponseCode();
+ assertEquals("Checking command success", 0, ret);
+ }
+ }
+
+ private PlanMapper getMapperForQuery(IDriver driver, String query) {
+ int ret = driver.run(query).getResponseCode();
+ assertEquals("Checking command success", 0, ret);
+ PlanMapper pm0 = driver.getContext().getPlanMapper();
+ return pm0;
+ }
+
+ @Test
+ public void testFilterIntIn() throws ParseException {
+ IDriver driver = createDriver();
+ String query = "explain select a from t2 where a IN (-1,0,1,2,10,20,30,40) order by a";
+
+ PlanMapper pm = getMapperForQuery(driver, query);
+ List<FilterOperator> fos = pm.getAll(FilterOperator.class);
+ // the same operator is present 2 times
+ fos.sort(TestCounterMapping.OPERATOR_ID_COMPARATOR.reversed());
+ assertEquals(1, fos.size());
+ FilterOperator fop = fos.get(0);
+
+ // all IN elements outside the column's min/max range should be ignored by the stat estimation
+ assertEquals(3, fop.getStatistics().getNumRows());
+
+ }
+
+ private static IDriver createDriver() {
+ HiveConf conf = env_setup.getTestCtx().hiveConf;
+
+ conf.setBoolVar(ConfVars.HIVE_VECTORIZATION_ENABLED, false);
+ conf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER,
+ "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
+ SessionState.start(conf);
+
+ IDriver driver = DriverFactory.newDriver(conf);
+ return driver;
+ }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/99ed2bcb/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out b/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out
index f03e64b..eb4a8cb 100644
--- a/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out
+++ b/ql/src/test/results/clientpositive/llap/acid_no_buckets.q.out
@@ -304,15 +304,15 @@ STAGE PLANS:
Statistics: Num rows: 2015 Data size: 916825 Basic stats: COMPLETE Column stats: PARTIAL
Filter Operator
predicate: (key) IN ('1001', '213', '43') (type: boolean)
- Statistics: Num rows: 20 Data size: 9100 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 45955 Basic stats: COMPLETE Column stats: PARTIAL
Select Operator
expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), ds (type: string), hr (type: string)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
sort order: +
- Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL
value expressions: _col1 (type: string), _col2 (type: string)
Execution mode: llap
LLAP IO: may be used (ACID table)
@@ -322,10 +322,10 @@ STAGE PLANS:
Select Operator
expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: string), VALUE._col1 (type: string)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL
File Output Operator
compressed: false
- Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL
table:
input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
@@ -804,16 +804,16 @@ STAGE PLANS:
Statistics: Num rows: 2015 Data size: 916825 Basic stats: COMPLETE Column stats: PARTIAL
Filter Operator
predicate: (key) IN ('1001', '213', '43') (type: boolean)
- Statistics: Num rows: 20 Data size: 9100 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 45955 Basic stats: COMPLETE Column stats: PARTIAL
Select Operator
expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), ds (type: string), hr (type: string)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL
Reduce Output Operator
key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
sort order: +
Map-reduce partition columns: UDFToInteger(_col0) (type: int)
- Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL
value expressions: _col1 (type: string), _col2 (type: string)
Execution mode: llap
LLAP IO: may be used (ACID table)
@@ -823,10 +823,10 @@ STAGE PLANS:
Select Operator
expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: string), VALUE._col1 (type: string)
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL
File Output Operator
compressed: false
- Statistics: Num rows: 20 Data size: 8880 Basic stats: COMPLETE Column stats: PARTIAL
+ Statistics: Num rows: 101 Data size: 44844 Basic stats: COMPLETE Column stats: PARTIAL
table:
input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
http://git-wip-us.apache.org/repos/asf/hive/blob/99ed2bcb/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
index 3930a14..361dc07 100644
--- a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
+++ b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
@@ -304,39 +304,39 @@ Stage-0
Stage-1
Reducer 5 vectorized, llap
File Output Operator [FS_126]
- Limit [LIM_125] (rows=5 width=285)
+ Limit [LIM_125] (rows=12 width=285)
Number of rows:100
- Select Operator [SEL_124] (rows=5 width=285)
+ Select Operator [SEL_124] (rows=12 width=285)
Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
<-Reducer 4 [SIMPLE_EDGE] vectorized, llap
SHUFFLE [RS_123]
- Group By Operator [GBY_122] (rows=5 width=285)
+ Group By Operator [GBY_122] (rows=12 width=285)
Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"],keys:KEY._col0, KEY._col1, KEY._col2
<-Reducer 3 [SIMPLE_EDGE] llap
SHUFFLE [RS_49]
PartitionCols:_col0, _col1, _col2
- Group By Operator [GBY_48] (rows=5 width=285)
+ Group By Operator [GBY_48] (rows=12 width=285)
Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(_col11)","count(_col21)","count(_col3)"],keys:_col10, _col20, _col2
- Merge Join Operator [MERGEJOIN_97] (rows=4704 width=534)
+ Merge Join Operator [MERGEJOIN_97] (rows=9275 width=534)
Conds:RS_44._col1, _col3=RS_45._col15, _col17(Inner),Output:["_col2","_col3","_col10","_col11","_col20","_col21"]
<-Reducer 10 [SIMPLE_EDGE] llap
SHUFFLE [RS_45]
PartitionCols:_col15, _col17
- Select Operator [SEL_40] (rows=336 width=447)
+ Select Operator [SEL_40] (rows=420 width=447)
Output:["_col4","_col5","_col14","_col15","_col17"]
- Merge Join Operator [MERGEJOIN_96] (rows=336 width=447)
+ Merge Join Operator [MERGEJOIN_96] (rows=420 width=447)
Conds:RS_37._col4, _col2=RS_38._col4, _col2(Inner),Output:["_col0","_col1","_col14","_col15","_col17"]
<-Reducer 11 [SIMPLE_EDGE] llap
SHUFFLE [RS_38]
PartitionCols:_col4, _col2
- Merge Join Operator [MERGEJOIN_95] (rows=8 width=356)
+ Merge Join Operator [MERGEJOIN_95] (rows=10 width=356)
Conds:RS_121._col0=RS_109._col0(Inner),Output:["_col2","_col3","_col4","_col5"]
<-Map 6 [SIMPLE_EDGE] vectorized, llap
SHUFFLE [RS_109]
PartitionCols:_col0
- Select Operator [SEL_106] (rows=5 width=178)
+ Select Operator [SEL_106] (rows=25 width=178)
Output:["_col0"]
- Filter Operator [FIL_103] (rows=5 width=178)
+ Filter Operator [FIL_103] (rows=25 width=178)
predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null)
TableScan [TS_3] (rows=500 width=178)
default@src,d3,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
@@ -402,14 +402,14 @@ Stage-0
<-Reducer 2 [SIMPLE_EDGE] llap
SHUFFLE [RS_44]
PartitionCols:_col1, _col3
- Merge Join Operator [MERGEJOIN_91] (rows=70 width=269)
+ Merge Join Operator [MERGEJOIN_91] (rows=265 width=269)
Conds:RS_100._col0=RS_107._col0(Inner),Output:["_col1","_col2","_col3"]
<-Map 6 [SIMPLE_EDGE] vectorized, llap
SHUFFLE [RS_107]
PartitionCols:_col0
- Select Operator [SEL_104] (rows=5 width=178)
+ Select Operator [SEL_104] (rows=25 width=178)
Output:["_col0"]
- Filter Operator [FIL_101] (rows=5 width=178)
+ Filter Operator [FIL_101] (rows=25 width=178)
predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null)
Please refer to the previous TableScan [TS_3]
<-Map 1 [SIMPLE_EDGE] vectorized, llap
http://git-wip-us.apache.org/repos/asf/hive/blob/99ed2bcb/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
index f76053e..b1c0bab 100644
--- a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
@@ -57,7 +57,7 @@ STAGE PLANS:
native: true
predicateExpression: FilterLongColumnInList(col 3:date, values [-67, -171])
predicate: (cdate) IN (DATE'1969-10-26', DATE'1969-07-14') (type: boolean)
- Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: cdate (type: date)
outputColumnNames: _col0
@@ -65,7 +65,7 @@ STAGE PLANS:
className: VectorSelectOperator
native: true
projectedOutputColumnNums: [3]
- Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: _col0 (type: date)
sort order: +
@@ -73,7 +73,7 @@ STAGE PLANS:
className: VectorReduceSinkObjectHashOperator
native: true
nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
- Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE
Execution mode: vectorized, llap
LLAP IO: all inputs
Map Vectorization:
@@ -101,13 +101,13 @@ STAGE PLANS:
className: VectorSelectOperator
native: true
projectedOutputColumnNums: [0]
- Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
File Sink Vectorization:
className: VectorFileSinkOperator
native: false
- Statistics: Num rows: 10 Data size: 532 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: 53 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -152,13 +152,13 @@ STAGE PLANS:
native: true
predicateExpression: SelectColumnIsFalse(col 5:boolean)(children: LongColumnInList(col 3, values [-67, -171, 20]) -> 5:boolean)
predicate: (not (cdate) IN (DATE'1969-10-26', DATE'1969-07-14', DATE'1970-01-21')) (type: boolean)
- Statistics: Num rows: 12274 Data size: 653057 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 12284 Data size: 653589 Basic stats: COMPLETE Column stats: NONE
Select Operator
Select Vectorization:
className: VectorSelectOperator
native: true
projectedOutputColumnNums: []
- Statistics: Num rows: 12274 Data size: 653057 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 12284 Data size: 653589 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: count()
Group By Vectorization:
http://git-wip-us.apache.org/repos/asf/hive/blob/99ed2bcb/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out b/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out
index 5afa99d..f210b72 100644
--- a/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_struct_in.q.out
@@ -847,7 +847,7 @@ STAGE PLANS:
native: true
predicateExpression: FilterStructColumnInList(structExpressions [col 0:bigint, col 1:string, col 2:double], fieldVectorColumnTypes [LONG, BYTES, DOUBLE], structColumnMap [0, 1, 2])
predicate: (struct(my_bigint,my_string,my_double)) IN (const struct(1L,'a',1.5D), const struct(1L,'b',-0.5D), const struct(3L,'b',1.5D), const struct(1L,'d',1.5D), const struct(1L,'c',1.5D), const struct(1L,'b',2.5D), const struct(1L,'b',0.5D), const struct(5L,'b',1.5D), const struct(1L,'a',0.5D), const struct(3L,'b',1.5D)) (type: boolean)
- Statistics: Num rows: 3 Data size: 303 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 202 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: my_bigint (type: bigint), my_string (type: string), my_double (type: double)
outputColumnNames: _col0, _col1, _col2
@@ -855,13 +855,13 @@ STAGE PLANS:
className: VectorSelectOperator
native: true
projectedOutputColumnNums: [0, 1, 2]
- Statistics: Num rows: 3 Data size: 303 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 202 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
File Sink Vectorization:
className: VectorFileSinkOperator
native: false
- Statistics: Num rows: 3 Data size: 303 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 202 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
http://git-wip-us.apache.org/repos/asf/hive/blob/99ed2bcb/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
index c3d810e..3d00bbe 100644
--- a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
+++ b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
@@ -30975,19 +30975,19 @@ STAGE PLANS:
Filter Operator
isSamplingPred: false
predicate: (cstring1) IN ('biology', 'history', 'topology') (type: boolean)
- Statistics: Num rows: 6 Data size: 470 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 614 Data size: 43146 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
keys: cstring1 (type: string)
mode: hash
outputColumnNames: _col0, _col1
- Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: a
sort order: +
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE
tag: -1
value expressions: _col1 (type: bigint)
auto parallelism: true
@@ -31055,16 +31055,16 @@ STAGE PLANS:
keys: KEY._col0 (type: string)
mode: mergepartial
outputColumnNames: _col0, _col1
- Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col1 (type: bigint), _col0 (type: string)
outputColumnNames: _col0, _col1
- Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col1 (type: string)
null sort order: a
sort order: +
- Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE
tag: -1
value expressions: _col0 (type: bigint)
auto parallelism: false
@@ -31075,13 +31075,13 @@ STAGE PLANS:
Select Operator
expressions: VALUE._col0 (type: bigint), KEY.reducesinkkey0 (type: string)
outputColumnNames: _col0, _col1
- Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
GlobalTableId: 0
#### A masked pattern was here ####
NumFilesPerFileSink: 1
- Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 304 Data size: 23864 Basic stats: COMPLETE Column stats: COMPLETE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat