You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2020/02/29 16:52:27 UTC

[hive] branch master updated: HIVE-22941 : Empty files are inserted into external tables after HIVE-21714 - workaround (Laszlo Bodor via Ashutosh Chauhan)

This is an automated email from the ASF dual-hosted git repository.

hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 87c88de  HIVE-22941 : Empty files are inserted into external tables after HIVE-21714 - workaround (Laszlo Bodor via Ashutosh Chauhan)
87c88de is described below

commit 87c88def6d66dc0b9afca3835ed2f0777a3b061b
Author: Laszlo Bodor <bo...@gmail.com>
AuthorDate: Sat Feb 29 08:51:29 2020 -0800

    HIVE-22941 : Empty files are inserted into external tables after HIVE-21714 - workaround (Laszlo Bodor via Ashutosh Chauhan)
    
    Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
---
 .../test/resources/testconfiguration.properties    |  1 +
 .../hadoop/hive/ql/exec/FileSinkOperator.java      |  3 +-
 .../clientpositive/empty_files_external_table.q    | 12 ++++
 .../llap/empty_files_external_table.q.out          | 70 ++++++++++++++++++++++
 .../llap/tez_fixed_bucket_pruning.q.out            |  8 +--
 5 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 743fe2c..18697cc 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -449,6 +449,7 @@ minillap.query.files=acid_bucket_pruning.q,\
   temp_table_drop_partitions_filter4.q
 
 minillaplocal.query.files=\
+  empty_files_external_table.q,\
   bucket_num_reducers_acid.q,\
   dec_str.q,\
   dp_counter_non_mm.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
index d0f452b..1bb52b0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
@@ -1336,7 +1336,8 @@ public class FileSinkOperator extends TerminalOperator<FileSinkDesc> implements
       Class<?> clazz = conf.getTableInfo().getOutputFileFormatClass();
       boolean isStreaming = StreamingOutputFormat.class.isAssignableFrom(clazz);
 
-      if (!isTez || isStreaming || this.isInsertOverwrite) {
+      // let empty file generation for mm/acid table as a quick and dirty workaround for HIVE-22941
+      if (!isTez || isStreaming || (this.isInsertOverwrite && (conf.isMmTable() || conf.isFullAcidTable()))) {
         createBucketFiles(fsp);
       }
     }
diff --git a/ql/src/test/queries/clientpositive/empty_files_external_table.q b/ql/src/test/queries/clientpositive/empty_files_external_table.q
new file mode 100644
index 0000000..28b2c01
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/empty_files_external_table.q
@@ -0,0 +1,12 @@
+create external table empty_external_table(age int, name string, id int) stored as orc;
+create external table t6 like empty_external_table;
+create external table t5 like empty_external_table;
+
+-- this below is not supposed to put an empty file into the external table (instead only clear its contents)
+insert overwrite table empty_external_table select a.* from t5 a full outer join t6 b on a.id=b.id and a.name=b.name and a.age=b.age;
+select count(*) from empty_external_table;
+dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/empty_external_table/;
+
+drop table t5;
+drop table t6;
+drop table empty_external_table;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/empty_files_external_table.q.out b/ql/src/test/results/clientpositive/llap/empty_files_external_table.q.out
new file mode 100644
index 0000000..582d7bf
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/empty_files_external_table.q.out
@@ -0,0 +1,70 @@
+PREHOOK: query: create external table empty_external_table(age int, name string, id int) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@empty_external_table
+POSTHOOK: query: create external table empty_external_table(age int, name string, id int) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@empty_external_table
+PREHOOK: query: create external table t6 like empty_external_table
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t6
+POSTHOOK: query: create external table t6 like empty_external_table
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t6
+PREHOOK: query: create external table t5 like empty_external_table
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t5
+POSTHOOK: query: create external table t5 like empty_external_table
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t5
+PREHOOK: query: insert overwrite table empty_external_table select a.* from t5 a full outer join t6 b on a.id=b.id and a.name=b.name and a.age=b.age
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t5
+PREHOOK: Input: default@t6
+PREHOOK: Output: default@empty_external_table
+POSTHOOK: query: insert overwrite table empty_external_table select a.* from t5 a full outer join t6 b on a.id=b.id and a.name=b.name and a.age=b.age
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t5
+POSTHOOK: Input: default@t6
+POSTHOOK: Output: default@empty_external_table
+POSTHOOK: Lineage: empty_external_table.age SIMPLE [(t5)a.FieldSchema(name:age, type:int, comment:null), ]
+POSTHOOK: Lineage: empty_external_table.id SIMPLE [(t5)a.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: empty_external_table.name SIMPLE [(t5)a.FieldSchema(name:name, type:string, comment:null), ]
+PREHOOK: query: select count(*) from empty_external_table
+PREHOOK: type: QUERY
+PREHOOK: Input: default@empty_external_table
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from empty_external_table
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@empty_external_table
+#### A masked pattern was here ####
+0
+PREHOOK: query: drop table t5
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t5
+PREHOOK: Output: default@t5
+POSTHOOK: query: drop table t5
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t5
+POSTHOOK: Output: default@t5
+PREHOOK: query: drop table t6
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t6
+PREHOOK: Output: default@t6
+POSTHOOK: query: drop table t6
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t6
+POSTHOOK: Output: default@t6
+PREHOOK: query: drop table empty_external_table
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@empty_external_table
+PREHOOK: Output: default@empty_external_table
+POSTHOOK: query: drop table empty_external_table
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@empty_external_table
+POSTHOOK: Output: default@empty_external_table
diff --git a/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out b/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
index cfbbf82..0dc9821 100644
--- a/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
+++ b/ql/src/test/results/clientpositive/llap/tez_fixed_bucket_pruning.q.out
@@ -738,7 +738,7 @@ STAGE PLANS:
                     columns.types bigint:bigint:date:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:varchar(1500):varchar(500):varchar(50):varchar(50):varchar(3000):varchar(50):varchar(50):varchar(50):varchar(1):decimal(32,6):timestamp:varchar(30):varchar(50):timestamp:bigint:bigint:varchar(70):varchar(250)
 #### A masked pattern was here ####
                     name default.l3_monthly_dw_dimplan
-                    numFiles 64
+                    numFiles 1
                     numRows 180340
                     rawDataSize 269826156
                     serialization.ddl struct l3_monthly_dw_dimplan { i64 idp_warehouse_id, i64 idp_audit_id, date idp_data_date, i64 l3_snapshot_number, i64 plan_key, i64 project_key, i64 charge_code_key, i64 transclass_key, i64 resource_key, i64 finplan_detail_object_id, i64 project_object_id, i64 txn_class_object_id, i64 charge_code_object_id, i64 resoruce_object_id, varchar(1500) plan_name, varchar(500) plan_code, varchar(50) plan_type, varchar(50) period_type, varchar(3000) plan_desc [...]
@@ -761,7 +761,7 @@ STAGE PLANS:
                       columns.types bigint:bigint:date:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:varchar(1500):varchar(500):varchar(50):varchar(50):varchar(3000):varchar(50):varchar(50):varchar(50):varchar(1):decimal(32,6):timestamp:varchar(30):varchar(50):timestamp:bigint:bigint:varchar(70):varchar(250)
 #### A masked pattern was here ####
                       name default.l3_monthly_dw_dimplan
-                      numFiles 64
+                      numFiles 1
                       numRows 180340
                       rawDataSize 269826156
                       serialization.ddl struct l3_monthly_dw_dimplan { i64 idp_warehouse_id, i64 idp_audit_id, date idp_data_date, i64 l3_snapshot_number, i64 plan_key, i64 project_key, i64 charge_code_key, i64 transclass_key, i64 resource_key, i64 finplan_detail_object_id, i64 project_object_id, i64 txn_class_object_id, i64 charge_code_object_id, i64 resoruce_object_id, varchar(1500) plan_name, varchar(500) plan_code, varchar(50) plan_type, varchar(50) period_type, varchar(3000) plan_de [...]
@@ -1253,7 +1253,7 @@ STAGE PLANS:
                     columns.types bigint:bigint:date:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:varchar(1500):varchar(500):varchar(50):varchar(50):varchar(3000):varchar(50):varchar(50):varchar(50):varchar(1):decimal(32,6):timestamp:varchar(30):varchar(50):timestamp:bigint:bigint:varchar(70):varchar(250)
 #### A masked pattern was here ####
                     name default.l3_monthly_dw_dimplan
-                    numFiles 64
+                    numFiles 1
                     numRows 180340
                     rawDataSize 269826156
                     serialization.ddl struct l3_monthly_dw_dimplan { i64 idp_warehouse_id, i64 idp_audit_id, date idp_data_date, i64 l3_snapshot_number, i64 plan_key, i64 project_key, i64 charge_code_key, i64 transclass_key, i64 resource_key, i64 finplan_detail_object_id, i64 project_object_id, i64 txn_class_object_id, i64 charge_code_object_id, i64 resoruce_object_id, varchar(1500) plan_name, varchar(500) plan_code, varchar(50) plan_type, varchar(50) period_type, varchar(3000) plan_desc [...]
@@ -1276,7 +1276,7 @@ STAGE PLANS:
                       columns.types bigint:bigint:date:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:bigint:varchar(1500):varchar(500):varchar(50):varchar(50):varchar(3000):varchar(50):varchar(50):varchar(50):varchar(1):decimal(32,6):timestamp:varchar(30):varchar(50):timestamp:bigint:bigint:varchar(70):varchar(250)
 #### A masked pattern was here ####
                       name default.l3_monthly_dw_dimplan
-                      numFiles 64
+                      numFiles 1
                       numRows 180340
                       rawDataSize 269826156
                       serialization.ddl struct l3_monthly_dw_dimplan { i64 idp_warehouse_id, i64 idp_audit_id, date idp_data_date, i64 l3_snapshot_number, i64 plan_key, i64 project_key, i64 charge_code_key, i64 transclass_key, i64 resource_key, i64 finplan_detail_object_id, i64 project_object_id, i64 txn_class_object_id, i64 charge_code_object_id, i64 resoruce_object_id, varchar(1500) plan_name, varchar(500) plan_code, varchar(50) plan_type, varchar(50) period_type, varchar(3000) plan_de [...]