You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pg...@apache.org on 2021/10/20 11:37:45 UTC
[hive] branch master updated: HIVE-25505: Incorrect results with
skip.header.line.count if first line is blank (Panos Garefalakis,
reviewed by Laszlo Bodor)
This is an automated email from the ASF dual-hosted git repository.
pgaref pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new d4a61dd HIVE-25505: Incorrect results with skip.header.line.count if first line is blank (Panos Garefalakis, reviewed by Laszlo Bodor)
d4a61dd is described below
commit d4a61ddf0ab9c22f9c24f723d8a1861cbbe26573
Author: Panagiotis Garefalakis <pg...@apache.org>
AuthorDate: Wed Oct 20 14:37:31 2021 +0300
HIVE-25505: Incorrect results with skip.header.line.count if first line is blank (Panos Garefalakis, reviewed by Laszlo Bodor)
Closes #2717
---
data/files/emptyhead_4line_file1.csv | 4 +
data/files/emptyhead_4line_file1.csv.bz2 | Bin 0 -> 59 bytes
.../test/resources/testconfiguration.properties | 1 +
.../hive/llap/io/encoded/LineRrOffsetReader.java | 9 +-
.../llap/io/encoded/SerDeEncodedDataReader.java | 6 +-
.../hadoop/hive/ql/io/RecordReaderWrapper.java | 5 +
.../clientpositive/empty_skip_header_footer_aggr.q | 45 ++++++++
.../llap/empty_skip_header_footer_aggr.q.out | 118 +++++++++++++++++++++
.../tez/empty_skip_header_footer_aggr.q.out | 118 +++++++++++++++++++++
9 files changed, 302 insertions(+), 4 deletions(-)
diff --git a/data/files/emptyhead_4line_file1.csv b/data/files/emptyhead_4line_file1.csv
new file mode 100644
index 0000000..98331fe
--- /dev/null
+++ b/data/files/emptyhead_4line_file1.csv
@@ -0,0 +1,4 @@
+
+1,2019-12-31
+2,2018-12-31
+3,2017-12-31
\ No newline at end of file
diff --git a/data/files/emptyhead_4line_file1.csv.bz2 b/data/files/emptyhead_4line_file1.csv.bz2
new file mode 100644
index 0000000..e16bfee
Binary files /dev/null and b/data/files/emptyhead_4line_file1.csv.bz2 differ
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 3e56b86..71b6fed 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -8,6 +8,7 @@ minimr.query.files=\
minitez.query.files.shared=\
compressed_skip_header_footer_aggr.q,\
dynpart_sort_optimization_distribute_by.q,\
+ empty_skip_header_footer_aggr.q,\
hybridgrace_hashjoin_1.q,\
hybridgrace_hashjoin_2.q,\
substr_estimate_range.q
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/LineRrOffsetReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/LineRrOffsetReader.java
index a0f949b..45503c1 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/LineRrOffsetReader.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/LineRrOffsetReader.java
@@ -24,6 +24,7 @@ import java.lang.reflect.Method;
import org.apache.hadoop.hive.llap.io.api.impl.LlapIoImpl;
import org.apache.hadoop.hive.llap.io.encoded.SerDeEncodedDataReader.ReaderWithOffsets;
import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;
@@ -44,7 +45,8 @@ final class LineRrOffsetReader extends PassThruOffsetReader {
isCompressedMethod = isCompressedMethodTmp;
}
- static ReaderWithOffsets create(LineRecordReader sourceReader, JobConf jobConf, int skipHeaderCnt, int skipFooterCnt) {
+ static ReaderWithOffsets create(LineRecordReader sourceReader, JobConf jobConf,
+ int skipHeaderCnt, int skipFooterCnt, FileSplit split) {
// File not compressed, skipping is already done as part of SkippingTextInputFormat
if (isCompressedMethod == null) {
return new PassThruOffsetReader(sourceReader, jobConf, 0, 0);
@@ -61,6 +63,11 @@ final class LineRrOffsetReader extends PassThruOffsetReader {
LlapIoImpl.LOG.info("Reader is compressed; offsets not supported");
return new PassThruOffsetReader(sourceReader, jobConf, skipHeaderCnt, skipFooterCnt);
}
+ if (skipHeaderCnt > 0 && split.getStart() == 0) {
+ // Skipping empty/null lines leading to Split start -1 being zero
+ LlapIoImpl.LOG.info("Reader with blank head line(s)");
+ return new PassThruOffsetReader(sourceReader, jobConf, skipHeaderCnt, skipFooterCnt);
+ }
// For non-compressed Text Files Header/Footer Skipping is already done as part of SkippingTextInputFormat
return new LineRrOffsetReader(sourceReader, jobConf);
}
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java
index 192eb1e..f9058ef 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/encoded/SerDeEncodedDataReader.java
@@ -1442,7 +1442,7 @@ public class SerDeEncodedDataReader extends CallableWithNdc<Void>
Path path = split.getPath().getFileSystem(daemonConf).makeQualified(split.getPath());
PartitionDesc partDesc = HiveFileFormatUtils.getFromPathRecursively(parts, path, null);
try {
- offsetReader = createOffsetReader(sourceReader, partDesc.getTableDesc());
+ offsetReader = createOffsetReader(sourceReader, partDesc.getTableDesc(), split);
sourceReader = null;
} finally {
if (sourceReader != null) {
@@ -1650,7 +1650,7 @@ public class SerDeEncodedDataReader extends CallableWithNdc<Void>
}
}
- private ReaderWithOffsets createOffsetReader(RecordReader<?, ?> sourceReader, TableDesc tableDesc)
+ private ReaderWithOffsets createOffsetReader(RecordReader<?, ?> sourceReader, TableDesc tableDesc, FileSplit split)
throws IOException {
int headerCount = Utilities.getHeaderCount(tableDesc);
int footerCount = Utilities.getFooterCount(tableDesc, jobConf);
@@ -1661,7 +1661,7 @@ public class SerDeEncodedDataReader extends CallableWithNdc<Void>
// Handle the special cases here. Perhaps we could have a more general structure, or even
// a configurable set (like storage handlers), but for now we only have one.
if (isLrrEnabled && sourceReader instanceof LineRecordReader) {
- return LineRrOffsetReader.create((LineRecordReader)sourceReader, jobConf, headerCount, footerCount);
+ return LineRrOffsetReader.create((LineRecordReader)sourceReader, jobConf, headerCount, footerCount, split);
}
return new PassThruOffsetReader(sourceReader, jobConf, headerCount, footerCount);
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/RecordReaderWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/io/RecordReaderWrapper.java
index 811c314..523c770 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/RecordReaderWrapper.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/RecordReaderWrapper.java
@@ -94,6 +94,11 @@ class RecordReaderWrapper extends LineRecordReader {
LOG.info("Reader is compressed; offsets not supported");
return new RecordReaderWrapper(split, jobConf, headerCount, footerCount);
}
+ if (headerCount > 0 && split.getStart() == 0) {
+ // Skipping empty/null lines leading to Split start -1 being zero
+ LOG.info("Reader with blank head line(s)");
+ return new RecordReaderWrapper(split, jobConf, headerCount, footerCount);
+ }
}
return innerReader;
}
diff --git a/ql/src/test/queries/clientpositive/empty_skip_header_footer_aggr.q b/ql/src/test/queries/clientpositive/empty_skip_header_footer_aggr.q
new file mode 100644
index 0000000..7d09ac4
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/empty_skip_header_footer_aggr.q
@@ -0,0 +1,45 @@
+SET hive.query.results.cache.enabled=false;
+SET hive.mapred.mode=nonstrict;
+SET hive.explain.user=false;
+
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/testcase1;
+dfs -rmr ${system:test.tmp.dir}/testcase1;
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/testcase1;
+dfs -copyFromLocal ../../data/files/emptyhead_4line_file1.csv ${system:test.tmp.dir}/testcase1/;
+--
+--
+DROP TABLE IF EXISTS `testcase1`;
+CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+ LOCATION '${system:test.tmp.dir}/testcase1'
+ TBLPROPERTIES ("skip.header.line.count"="1");
+
+-- Make sure empty Head lines are skipped
+-- With Fetch task optimization
+SET hive.fetch.task.conversion = more;
+select * from testcase1;
+select count(*) from testcase1;
+
+-- Make sure empty Head lines are skipped
+-- And NO Fetch task optimization
+set hive.fetch.task.conversion=none;
+select * from testcase1;
+select count(*) from testcase1;
+
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/testcase2;
+dfs -rmr ${system:test.tmp.dir}/testcase2;
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/testcase2;
+dfs -copyFromLocal ../../data/files/emptyhead_4line_file1.csv.bz2 ${system:test.tmp.dir}/testcase2/;
+--
+--
+DROP TABLE IF EXISTS `testcase2`;
+CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+ LOCATION '${system:test.tmp.dir}/testcase2'
+ TBLPROPERTIES ("skip.header.line.count"="1");
+
+SET hive.fetch.task.conversion = more;
+select * from testcase2;
+select count(*) from testcase2;
+
+set hive.fetch.task.conversion=none;
+select * from testcase2;
+select count(*) from testcase2;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/empty_skip_header_footer_aggr.q.out b/ql/src/test/results/clientpositive/llap/empty_skip_header_footer_aggr.q.out
new file mode 100644
index 0000000..b883595
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/empty_skip_header_footer_aggr.q.out
@@ -0,0 +1,118 @@
+#### A masked pattern was here ####
+PREHOOK: query: DROP TABLE IF EXISTS `testcase1`
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS `testcase1`
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testcase1
+POSTHOOK: query: CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testcase1
+PREHOOK: query: select * from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+1 2019-12-31
+2 2018-12-31
+3 2017-12-31
+PREHOOK: query: select count(*) from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+3
+PREHOOK: query: select * from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+1 2019-12-31
+2 2018-12-31
+3 2017-12-31
+PREHOOK: query: select count(*) from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+#### A masked pattern was here ####
+3
+#### A masked pattern was here ####
+PREHOOK: query: DROP TABLE IF EXISTS `testcase2`
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS `testcase2`
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testcase2
+POSTHOOK: query: CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testcase2
+PREHOOK: query: select * from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+1 2019-12-31
+2 2018-12-31
+3 2017-12-31
+PREHOOK: query: select count(*) from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+3
+PREHOOK: query: select * from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+1 2019-12-31
+2 2018-12-31
+3 2017-12-31
+PREHOOK: query: select count(*) from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+POSTHOOK: query: select count(*) from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+#### A masked pattern was here ####
+3
diff --git a/ql/src/test/results/clientpositive/tez/empty_skip_header_footer_aggr.q.out b/ql/src/test/results/clientpositive/tez/empty_skip_header_footer_aggr.q.out
new file mode 100644
index 0000000..fdb18c5
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/empty_skip_header_footer_aggr.q.out
@@ -0,0 +1,118 @@
+#### A masked pattern was here ####
+PREHOOK: query: DROP TABLE IF EXISTS `testcase1`
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS `testcase1`
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+PREHOOK: type: CREATETABLE
+PREHOOK: Input: hdfs://### HDFS PATH ###
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testcase1
+POSTHOOK: query: CREATE EXTERNAL TABLE `testcase1`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Input: hdfs://### HDFS PATH ###
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testcase1
+PREHOOK: query: select * from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 2019-12-31
+2 2018-12-31
+3 2017-12-31
+PREHOOK: query: select count(*) from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3
+PREHOOK: query: select * from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 2019-12-31
+2 2018-12-31
+3 2017-12-31
+PREHOOK: query: select count(*) from testcase1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase1
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from testcase1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase1
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3
+#### A masked pattern was here ####
+PREHOOK: query: DROP TABLE IF EXISTS `testcase2`
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE IF EXISTS `testcase2`
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+PREHOOK: type: CREATETABLE
+PREHOOK: Input: hdfs://### HDFS PATH ###
+PREHOOK: Output: database:default
+PREHOOK: Output: default@testcase2
+POSTHOOK: query: CREATE EXTERNAL TABLE `testcase2`(id int, name string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
+#### A masked pattern was here ####
+ TBLPROPERTIES ("skip.header.line.count"="1")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Input: hdfs://### HDFS PATH ###
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@testcase2
+PREHOOK: query: select * from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 2019-12-31
+2 2018-12-31
+3 2017-12-31
+PREHOOK: query: select count(*) from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3
+PREHOOK: query: select * from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 2019-12-31
+2 2018-12-31
+3 2017-12-31
+PREHOOK: query: select count(*) from testcase2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@testcase2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from testcase2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@testcase2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+3