You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pg...@apache.org on 2021/10/20 11:37:45 UTC

[hive] branch master updated: HIVE-25505: Incorrect results with header. skip.header.line.count if first line is blank (Panos Garefalakis, reviewed by Laszlo Bodor)

This is an automated email from the ASF dual-hosted git repository.

pgaref pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new d4a61dd  HIVE-25505: Incorrect results with header. skip.header.line.count if first line is blank (Panos Garefalakis, reviewed by Laszlo Bodor)
d4a61dd is described below

commit d4a61ddf0ab9c22f9c24f723d8a1861cbbe26573
Author: Panagiotis Garefalakis <pg...@apache.org>
AuthorDate: Wed Oct 20 14:37:31 2021 +0300

    HIVE-25505: Incorrect results with header. skip.header.line.count if first line is blank (Panos Garefalakis, reviewed by Laszlo Bodor)
    
    Closes #2717
---
 data/files/emptyhead_4line_file1.csv               |   4 +
 data/files/emptyhead_4line_file1.csv.bz2           | Bin 0 -> 59 bytes
 .../test/resources/testconfiguration.properties    |   1 +
 .../hive/llap/io/encoded/LineRrOffsetReader.java   |   9 +-
 .../llap/io/encoded/SerDeEncodedDataReader.java    |   6 +-
 .../hadoop/hive/ql/io/RecordReaderWrapper.java     |   5 +
 .../clientpositive/empty_skip_header_footer_aggr.q |  45 ++++++++
 .../llap/empty_skip_header_footer_aggr.q.out       | 118 +++++++++++++++++++++
 .../tez/empty_skip_header_footer_aggr.q.out        | 118 +++++++++++++++++++++
 9 files changed, 302 insertions(+), 4 deletions(-)

diff --git a/data/files/emptyhead_4line_file1.csv b/data/files/emptyhead_4line_file1.csv
new file mode 100644
index 0000000..98331fe
--- /dev/null
+++ b/data/files/emptyhead_4line_file1.csv
@@ -0,0 +1,4 @@
+
+1,2019-12-31
+2,2018-12-31
+3,2017-12-31
\ No newline at end of file
diff --git a/data/files/emptyhead_4line_file1.csv.bz2 b/data/files/emptyhead_4line_file1.csv.bz2
new file mode 100644
index 0000000..e16bfee
Binary files /dev/null and b/data/files/emptyhead_4line_file1.csv.bz2 differ
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 3e56b86..71b6fed 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -8,6 +8,7 @@ minimr.query.files=\
 minitez.query.files.shared=\
   compressed_skip_header_footer_aggr.q,\
   dynpart_sort_optimization_distribute_by.q,\
+  empty_skip_header_footer_aggr.q,\
   hybridgrace_hashjoin_1.q,\
   hybridgrace_hashjoin_2.q,\
   substr_estimate_range.q
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/LineRrOffsetReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/LineRrOffsetReader.java
index a0f949b..45503c1 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/LineRrOffsetReader.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/LineRrOffsetReader.java
@@ -24,6 +24,7 @@ import java.lang.reflect.Method;
 import org.apache.hadoop.hive.llap.io.api.impl.LlapIoImpl;
 import org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.ReaderWithOffsets;
 import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.LineRecordReader;
 
@@ -44,7 +45,8 @@ final class LineRrOffsetReader extends PassThruOffsetReader {
     isCompressedMethod = isCompressedMethodTmp;
   }
 
-  static ReaderWithOffsets create(LineRecordReader sourceReader, JobConf jobConf, int skipHeaderCnt, int skipFooterCnt) {
+  static ReaderWithOffsets create(LineRecordReader sourceReader, JobConf jobConf,
+      int skipHeaderCnt, int skipFooterCnt, FileSplit split) {
     // File not compressed, skipping is already done as part of SkippingTextInputFormat
     if (isCompressedMethod == null) {
       return new PassThruOffsetReader(sourceReader, jobConf, 0, 0);
@@ -61,6 +63,11 @@ final class LineRrOffsetReader extends PassThruOffsetReader {
       LlapIoImpl.LOG.info("Reader is compressed; offsets not supported");
       return new PassThruOffsetReader(sourceReader, jobConf, skipHeaderCnt, skipFooterCnt);
     }
+    if (skipHeaderCnt > 0 && split.getStart() == 0) {
+      // Skipping empty/null lines leading to Split start -1 being zero
+      LlapIoImpl.LOG.info("Reader with blank head line(s)");
+      return new PassThruOffsetReader(sourceReader, jobConf, skipHeaderCnt, skipFooterCnt);
+    }
     // For non-compressed Text Files Header/Footer Skipping is already done as part of SkippingTextInputFormat
     return new LineRrOffsetReader(sourceReader, jobConf);
   }
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java
index 192eb1e..f9058ef 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java
@@ -1442,7 +1442,7 @@ public class SerDeEncodedDataReader extends CallableWithNdc<Void>
     Path path = split.getPath().getFileSystem(daemonConf).makeQualified(split.getPath());
     PartitionDesc partDesc = HiveFileFormatUtils.getFromPathRecursively(parts, path, null);
     try {
-      offsetReader = createOffsetReader(sourceReader, partDesc.getTableDesc());
+      offsetReader = createOffsetReader(sourceReader, partDesc.getTableDesc(), split);
       sourceReader = null;
     } finally {
       if (sourceReader != null) {
@@ -1650,7 +1650,7 @@ public class SerDeEncodedDataReader extends CallableWithNdc<Void>
     }
   }
 
-  private ReaderWithOffsets createOffsetReader(RecordReader<?, ?> sourceReader, TableDesc tableDesc)
+  private ReaderWithOffsets createOffsetReader(RecordReader<?, ?> sourceReader, TableDesc tableDesc, FileSplit split)
       throws IOException {
     int headerCount = Utilities.getHeaderCount(tableDesc);
     int footerCount = Utilities.getFooterCount(tableDesc, jobConf);
@@ -1661,7 +1661,7 @@ public class SerDeEncodedDataReader extends CallableWithNdc<Void>
     // Handle the special cases here. Perhaps we could have a more general structure, or even
     // a configurable set (like storage handlers), but for now we only have one.
     if (isLrrEnabled && sourceReader instanceof LineRecordReader) {
-      return LineRrOffsetReader.create((LineRecordReader)sourceReader, jobConf, headerCount, footerCount);
+      return LineRrOffsetReader.create((LineRecordReader)sourceReader, jobConf, headerCount, footerCount, split);
     }
     return new PassThruOffsetReader(sourceReader, jobConf, headerCount, footerCount);
   }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/RecordReaderWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/io/RecordReaderWrapper.java
index 811c314..523c770 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/RecordReaderWrapper.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/RecordReaderWrapper.java
@@ -94,6 +94,11 @@ class RecordReaderWrapper extends LineRecordReader {
         LOG.info("Reader is compressed; offsets not supported");
         return new RecordReaderWrapper(split, jobConf, headerCount, footerCount);
       }
+      if (headerCount > 0 && split.getStart() == 0) {
+        // Skipping empty/null lines leading to Split start -1 being zero
+        LOG.info("Reader with blank head line(s)");
+        return new RecordReaderWrapper(split, jobConf, headerCount, footerCount);
+      }
     }
     return innerReader;
   }
diff --git a/ql/src/test/queries/clientpositive/empty_skip_header_footer_aggr.q b/ql/src/test/queries/clientpositive/empty_skip_header_footer_aggr.q
new file mode 100644
index 0000000..7d09ac4
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/empty_skip_header_footer_aggr.q
@@ -0,0 +1,45 @@
+SET hive.query.results.cache.enabled=false;
+SET hive.mapred.mode=nonstrict;
+SET hive.explain.user=false;
+
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/testcase1;
+dfs -rmr ${system:test.tmp.dir}/testcase1;
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/testcase1;
+dfs -copyFromLocal ../../data/files/emptyhead_4line_file1.csv  ${system:test.tmp.dir}/testcase1/;
+--
+--
+DROP TABLE IF EXISTS `testcase1`;
+CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+  LOCATION '${system:test.tmp.dir}/testcase1'
+  TBLPROPERTIES ("skip.header.line.count"="1");
+
+-- Make sure empty Head lines are skipped
+-- With Fetch task optimization
+SET hive.fetch.task.conversion = more;
+select * from testcase1;
+select count(*) from testcase1;
+
+-- Make sure empty Head lines are skipped
+-- And NO Fetch task optimization
+set hive.fetch.task.conversion=none;
+select * from testcase1;
+select count(*) from testcase1;
+
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/testcase2;
+dfs -rmr ${system:test.tmp.dir}/testcase2;
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/testcase2;
+dfs -copyFromLocal ../../data/files/emptyhead_4line_file1.csv.bz2  ${system:test.tmp.dir}/testcase2/;
+--
+--
+DROP TABLE IF EXISTS `testcase2`;
+CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+ LOCATION '${system:test.tmp.dir}/testcase2'
+ TBLPROPERTIES ("skip.header.line.count"="1");
+
+SET hive.fetch.task.conversion = more;
+select * from testcase2;
+select count(*) from testcase2;
+
+set hive.fetch.task.conversion=none;
+select * from testcase2;
+select count(*) from testcase2;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/empty_skip_header_footer_aggr.q.out b/ql/src/test/results/clientpositive/llap/empty_skip_header_footer_aggr.q.out
new file mode 100644
index 0000000..b883595
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/empty_skip_header_footer_aggr.q.out
@@ -0,0 +1,118 @@
+#### A masked pattern was here ####
+PREHOOK: query: DROP TABLE IF EXISTS `testcase1`
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS `testcase1`
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+  TBLPROPERTIES ("skip.header.line.count"="1")
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testcase1
+POSTHOOK: query: CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+  TBLPROPERTIES ("skip.header.line.count"="1")
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testcase1
+PREHOOK: query: select * from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+1	2019-12-31
+2	2018-12-31
+3	2017-12-31
+PREHOOK: query: select count(*) from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+3
+PREHOOK: query: select * from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+1	2019-12-31
+2	2018-12-31
+3	2017-12-31
+PREHOOK: query: select count(*) from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+3
+#### A masked pattern was here ####
+PREHOOK: query: DROP TABLE IF EXISTS `testcase2`
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS `testcase2`
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testcase2
+POSTHOOK: query: CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testcase2
+PREHOOK: query: select * from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+1	2019-12-31
+2	2018-12-31
+3	2017-12-31
+PREHOOK: query: select count(*) from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+3
+PREHOOK: query: select * from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+1	2019-12-31
+2	2018-12-31
+3	2017-12-31
+PREHOOK: query: select count(*) from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+3
diff --git a/ql/src/test/results/clientpositive/tez/empty_skip_header_footer_aggr.q.out b/ql/src/test/results/clientpositive/tez/empty_skip_header_footer_aggr.q.out
new file mode 100644
index 0000000..fdb18c5
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/empty_skip_header_footer_aggr.q.out
@@ -0,0 +1,118 @@
+#### A masked pattern was here ####
+PREHOOK: query: DROP TABLE IF EXISTS `testcase1`
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS `testcase1`
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+  TBLPROPERTIES ("skip.header.line.count"="1")
+PREHOOK: type: CREATETABLE
+PREHOOK: Input: hdfs://### HDFS PATH ###
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testcase1
+POSTHOOK: query: CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+  TBLPROPERTIES ("skip.header.line.count"="1")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Input: hdfs://### HDFS PATH ###
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testcase1
+PREHOOK: query: select * from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1	2019-12-31
+2	2018-12-31
+3	2017-12-31
+PREHOOK: query: select count(*) from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3
+PREHOOK: query: select * from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1	2019-12-31
+2	2018-12-31
+3	2017-12-31
+PREHOOK: query: select count(*) from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3
+#### A masked pattern was here ####
+PREHOOK: query: DROP TABLE IF EXISTS `testcase2`
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS `testcase2`
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+PREHOOK: type: CREATETABLE
+PREHOOK: Input: hdfs://### HDFS PATH ###
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testcase2
+POSTHOOK: query: CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Input: hdfs://### HDFS PATH ###
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testcase2
+PREHOOK: query: select * from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1	2019-12-31
+2	2018-12-31
+3	2017-12-31
+PREHOOK: query: select count(*) from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3
+PREHOOK: query: select * from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1	2019-12-31
+2	2018-12-31
+3	2017-12-31
+PREHOOK: query: select count(*) from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3