You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jc...@apache.org on 2018/05/23 16:57:25 UTC

[2/4] hive git commit: Revert "HIVE-19557: stats: filters for dates are not taking advantage of min/max values (Zoltan Haindrich reviewed by Ashutosh Chauhan)"

Revert "HIVE-19557: stats: filters for dates are not taking advantage of min/max values (Zoltan Haindrich reviewed by Ashutosh Chauhan)"

This reverts commit 0b2d364aa5977c7a15a4b5082cf9ef431dcc394e.


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/7056445c
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/7056445c
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/7056445c

Branch: refs/heads/master
Commit: 7056445c1e9a1b040ae44dca99b351296325b047
Parents: 29f5784
Author: Jesus Camacho Rodriguez <jc...@apache.org>
Authored: Wed May 23 09:51:40 2018 -0700
Committer: Jesus Camacho Rodriguez <jc...@apache.org>
Committed: Wed May 23 09:51:40 2018 -0700

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |   1 -
 .../stats/annotation/StatsRulesProcFactory.java |  10 +-
 .../clientpositive/colstats_date_min_max.q      |  30 ---
 .../llap/colstats_date_min_max.q.out            | 193 -------------------
 .../clientpositive/llap/vector_between_in.q.out |  20 +-
 5 files changed, 11 insertions(+), 243 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/7056445c/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 6007d5a..6528ec6 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -488,7 +488,6 @@ minillaplocal.query.files=\
   cbo_subq_not_in.q,\
   column_table_stats.q,\
   column_table_stats_orc.q,\
-  colstats_date_min_max.q,\
   compare_double_bigint_2.q,\
   constprog_dpp.q,\
   current_date_timestamp.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/7056445c/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 91cccfb..c770227 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -100,8 +100,6 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct;
 import org.apache.hadoop.hive.serde.serdeConstants;
-import org.apache.hadoop.hive.serde2.io.DateWritable;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
 import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
@@ -753,14 +751,8 @@ public class StatsRulesProcFactory {
             }
           } else if (colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME) ||
                   colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
-            int value;
-            if (colTypeLowerCase == serdeConstants.DATE_TYPE_NAME) {
-              DateWritable writableVal = new DateWritable(java.sql.Date.valueOf(boundValue));
-              value = writableVal.getDays();
-            } else {
-              value = new Integer(boundValue);
-            }
             // Date is an integer internally
+            int value = new Integer(boundValue);
             int maxValue = cs.getRange().maxValue.intValue();
             int minValue = cs.getRange().minValue.intValue();
             if (upperBound) {

http://git-wip-us.apache.org/repos/asf/hive/blob/7056445c/ql/src/test/queries/clientpositive/colstats_date_min_max.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/colstats_date_min_max.q b/ql/src/test/queries/clientpositive/colstats_date_min_max.q
deleted file mode 100644
index 7f5be6a..0000000
--- a/ql/src/test/queries/clientpositive/colstats_date_min_max.q
+++ /dev/null
@@ -1,30 +0,0 @@
-set hive.explain.user=true;
-
-create table d1(d date);
---  tblproperties('transactional'='false');
-
-insert into d1 values
-	('2010-10-01'),
-	('2010-10-02'),
-	('2010-10-03'),
-	('2010-10-04'),
-	('2010-10-05'),
-	('2010-10-06'),
-	('2010-10-07'),
-	('2010-10-08'),
-	('2010-10-09'),
-	('2010-10-10');
-
-analyze table d1 compute statistics for columns;
-
-desc formatted d1;
-desc formatted d1 d;
-
-explain
-select 'stats: FIL ~0 read',count(1) from d1 where d < '2010-03-01';
-
-explain
-select 'stats: FIL estimate some read',count(1) from d1 where d < '2010-10-03';
-
-explain
-select 'stats: FIL estimate all read',count(1) from d1 where d < '2010-11-03';

http://git-wip-us.apache.org/repos/asf/hive/blob/7056445c/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out b/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out
deleted file mode 100644
index 7754f3e..0000000
--- a/ql/src/test/results/clientpositive/llap/colstats_date_min_max.q.out
+++ /dev/null
@@ -1,193 +0,0 @@
-PREHOOK: query: create table d1(d date)
-PREHOOK: type: CREATETABLE
-PREHOOK: Output: database:default
-PREHOOK: Output: default@d1
-POSTHOOK: query: create table d1(d date)
-POSTHOOK: type: CREATETABLE
-POSTHOOK: Output: database:default
-POSTHOOK: Output: default@d1
-PREHOOK: query: insert into d1 values
-	('2010-10-01'),
-	('2010-10-02'),
-	('2010-10-03'),
-	('2010-10-04'),
-	('2010-10-05'),
-	('2010-10-06'),
-	('2010-10-07'),
-	('2010-10-08'),
-	('2010-10-09'),
-	('2010-10-10')
-PREHOOK: type: QUERY
-PREHOOK: Input: _dummy_database@_dummy_table
-PREHOOK: Output: default@d1
-POSTHOOK: query: insert into d1 values
-	('2010-10-01'),
-	('2010-10-02'),
-	('2010-10-03'),
-	('2010-10-04'),
-	('2010-10-05'),
-	('2010-10-06'),
-	('2010-10-07'),
-	('2010-10-08'),
-	('2010-10-09'),
-	('2010-10-10')
-POSTHOOK: type: QUERY
-POSTHOOK: Input: _dummy_database@_dummy_table
-POSTHOOK: Output: default@d1
-POSTHOOK: Lineage: d1.d SCRIPT []
-PREHOOK: query: analyze table d1 compute statistics for columns
-PREHOOK: type: ANALYZE_TABLE
-PREHOOK: Input: default@d1
-PREHOOK: Output: default@d1
-#### A masked pattern was here ####
-POSTHOOK: query: analyze table d1 compute statistics for columns
-POSTHOOK: type: ANALYZE_TABLE
-POSTHOOK: Input: default@d1
-POSTHOOK: Output: default@d1
-#### A masked pattern was here ####
-PREHOOK: query: desc formatted d1
-PREHOOK: type: DESCTABLE
-PREHOOK: Input: default@d1
-POSTHOOK: query: desc formatted d1
-POSTHOOK: type: DESCTABLE
-POSTHOOK: Input: default@d1
-# col_name            	data_type           	comment             
-d                   	date                	                    
-	 	 
-# Detailed Table Information	 	 
-Database:           	default             	 
-#### A masked pattern was here ####
-Retention:          	0                   	 
-#### A masked pattern was here ####
-Table Type:         	MANAGED_TABLE       	 
-Table Parameters:	 	 
-	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\"}}
-	bucketing_version   	2                   
-	numFiles            	1                   
-	numRows             	10                  
-	rawDataSize         	100                 
-	totalSize           	110                 
-#### A masked pattern was here ####
-	 	 
-# Storage Information	 	 
-SerDe Library:      	org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe	 
-InputFormat:        	org.apache.hadoop.mapred.TextInputFormat	 
-OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat	 
-Compressed:         	No                  	 
-Num Buckets:        	-1                  	 
-Bucket Columns:     	[]                  	 
-Sort Columns:       	[]                  	 
-Storage Desc Params:	 	 
-	serialization.format	1                   
-PREHOOK: query: desc formatted d1 d
-PREHOOK: type: DESCTABLE
-PREHOOK: Input: default@d1
-POSTHOOK: query: desc formatted d1 d
-POSTHOOK: type: DESCTABLE
-POSTHOOK: Input: default@d1
-col_name            	d                   	 	 	 	 	 	 	 	 	 	 
-data_type           	date                	 	 	 	 	 	 	 	 	 	 
-min                 	2010-10-01          	 	 	 	 	 	 	 	 	 	 
-max                 	2010-10-10          	 	 	 	 	 	 	 	 	 	 
-num_nulls           	0                   	 	 	 	 	 	 	 	 	 	 
-distinct_count      	10                  	 	 	 	 	 	 	 	 	 	 
-avg_col_len         	                    	 	 	 	 	 	 	 	 	 	 
-max_col_len         	                    	 	 	 	 	 	 	 	 	 	 
-num_trues           	                    	 	 	 	 	 	 	 	 	 	 
-num_falses          	                    	 	 	 	 	 	 	 	 	 	 
-bitVector           	HL                  	 	 	 	 	 	 	 	 	 	 
-comment             	from deserializer   	 	 	 	 	 	 	 	 	 	 
-COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"d\":\"true\"}}	 	 	 	 	 	 	 	 	 	 
-PREHOOK: query: explain
-select 'stats: FIL ~0 read',count(1) from d1 where d < '2010-03-01'
-PREHOOK: type: QUERY
-POSTHOOK: query: explain
-select 'stats: FIL ~0 read',count(1) from d1 where d < '2010-03-01'
-POSTHOOK: type: QUERY
-Plan optimized by CBO.
-
-Vertex dependency in root stage
-Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
-
-Stage-0
-  Fetch Operator
-    limit:-1
-    Stage-1
-      Reducer 2 vectorized, llap
-      File Output Operator [FS_15]
-        Select Operator [SEL_14] (rows=1 width=110)
-          Output:["_col0","_col1"]
-          Group By Operator [GBY_13] (rows=1 width=8)
-            Output:["_col0"],aggregations:["count(VALUE._col0)"]
-          <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap
-            PARTITION_ONLY_SHUFFLE [RS_12]
-              Group By Operator [GBY_11] (rows=1 width=8)
-                Output:["_col0"],aggregations:["count()"]
-                Select Operator [SEL_10] (rows=1 width=56)
-                  Filter Operator [FIL_9] (rows=1 width=56)
-                    predicate:(d < DATE'2010-03-01')
-                    TableScan [TS_0] (rows=10 width=56)
-                      default@d1,d1,Tbl:COMPLETE,Col:COMPLETE,Output:["d"]
-
-PREHOOK: query: explain
-select 'stats: FIL estimate some read',count(1) from d1 where d < '2010-10-03'
-PREHOOK: type: QUERY
-POSTHOOK: query: explain
-select 'stats: FIL estimate some read',count(1) from d1 where d < '2010-10-03'
-POSTHOOK: type: QUERY
-Plan optimized by CBO.
-
-Vertex dependency in root stage
-Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
-
-Stage-0
-  Fetch Operator
-    limit:-1
-    Stage-1
-      Reducer 2 vectorized, llap
-      File Output Operator [FS_15]
-        Select Operator [SEL_14] (rows=1 width=121)
-          Output:["_col0","_col1"]
-          Group By Operator [GBY_13] (rows=1 width=8)
-            Output:["_col0"],aggregations:["count(VALUE._col0)"]
-          <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap
-            PARTITION_ONLY_SHUFFLE [RS_12]
-              Group By Operator [GBY_11] (rows=1 width=8)
-                Output:["_col0"],aggregations:["count()"]
-                Select Operator [SEL_10] (rows=3 width=56)
-                  Filter Operator [FIL_9] (rows=3 width=56)
-                    predicate:(d < DATE'2010-10-03')
-                    TableScan [TS_0] (rows=10 width=56)
-                      default@d1,d1,Tbl:COMPLETE,Col:COMPLETE,Output:["d"]
-
-PREHOOK: query: explain
-select 'stats: FIL estimate all read',count(1) from d1 where d < '2010-11-03'
-PREHOOK: type: QUERY
-POSTHOOK: query: explain
-select 'stats: FIL estimate all read',count(1) from d1 where d < '2010-11-03'
-POSTHOOK: type: QUERY
-Plan optimized by CBO.
-
-Vertex dependency in root stage
-Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
-
-Stage-0
-  Fetch Operator
-    limit:-1
-    Stage-1
-      Reducer 2 vectorized, llap
-      File Output Operator [FS_15]
-        Select Operator [SEL_14] (rows=1 width=120)
-          Output:["_col0","_col1"]
-          Group By Operator [GBY_13] (rows=1 width=8)
-            Output:["_col0"],aggregations:["count(VALUE._col0)"]
-          <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap
-            PARTITION_ONLY_SHUFFLE [RS_12]
-              Group By Operator [GBY_11] (rows=1 width=8)
-                Output:["_col0"],aggregations:["count()"]
-                Select Operator [SEL_10] (rows=10 width=56)
-                  Filter Operator [FIL_9] (rows=10 width=56)
-                    predicate:(d < DATE'2010-11-03')
-                    TableScan [TS_0] (rows=10 width=56)
-                      default@d1,d1,Tbl:COMPLETE,Col:COMPLETE,Output:["d"]
-

http://git-wip-us.apache.org/repos/asf/hive/blob/7056445c/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
index f76053e..6093beb 100644
--- a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out
@@ -465,7 +465,7 @@ STAGE PLANS:
                         native: true
                         predicateExpression: FilterLongColumnBetween(col 3:date, left -2, right 1)
                     predicate: cdate BETWEEN DATE'1969-12-30' AND DATE'1970-01-02' (type: boolean)
-                    Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cdate (type: date)
                       outputColumnNames: _col0
@@ -473,7 +473,7 @@ STAGE PLANS:
                           className: VectorSelectOperator
                           native: true
                           projectedOutputColumnNums: [3]
-                      Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE
                       Reduce Output Operator
                         key expressions: _col0 (type: date)
                         sort order: +
@@ -481,7 +481,7 @@ STAGE PLANS:
                             className: VectorReduceSinkObjectHashOperator
                             native: true
                             nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-                        Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE
             Execution mode: vectorized, llap
             LLAP IO: all inputs
             Map Vectorization:
@@ -509,13 +509,13 @@ STAGE PLANS:
                     className: VectorSelectOperator
                     native: true
                     projectedOutputColumnNums: [0]
-                Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   File Sink Vectorization:
                       className: VectorFileSinkOperator
                       native: false
-                  Statistics: Num rows: 4096 Data size: 217934 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 1365 Data size: 72627 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -560,7 +560,7 @@ STAGE PLANS:
                         native: true
                         predicateExpression: FilterLongColumnNotBetween(col 3:date, left -610, right 608)
                     predicate: cdate NOT BETWEEN DATE'1968-05-01' AND DATE'1971-09-01' (type: boolean)
-                    Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cdate (type: date)
                       outputColumnNames: _col0
@@ -568,7 +568,7 @@ STAGE PLANS:
                           className: VectorSelectOperator
                           native: true
                           projectedOutputColumnNums: [3]
-                      Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE
                       Reduce Output Operator
                         key expressions: _col0 (type: date)
                         sort order: +
@@ -576,7 +576,7 @@ STAGE PLANS:
                             className: VectorReduceSinkObjectHashOperator
                             native: true
                             nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-                        Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE
             Execution mode: vectorized, llap
             LLAP IO: all inputs
             Map Vectorization:
@@ -604,13 +604,13 @@ STAGE PLANS:
                     className: VectorSelectOperator
                     native: true
                     projectedOutputColumnNums: [0]
-                Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
                   File Sink Vectorization:
                       className: VectorFileSinkOperator
                       native: false
-                  Statistics: Num rows: 8193 Data size: 435921 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 10924 Data size: 581228 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat