You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by om...@apache.org on 2013/04/19 17:40:09 UTC
svn commit: r1469908 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/io/orc/ test/queries/clientpositive/
test/results/clientpositive/
Author: omalley
Date: Fri Apr 19 15:40:09 2013
New Revision: 1469908
URL: http://svn.apache.org/r1469908
Log:
HIVE-4178 : ORC fails with files with different numbers of columns
Added:
hive/trunk/ql/src/test/queries/clientpositive/orc_diff_part_cols.q
hive/trunk/ql/src/test/queries/clientpositive/orc_empty_files.q
hive/trunk/ql/src/test/results/clientpositive/orc_diff_part_cols.q.out
hive/trunk/ql/src/test/results/clientpositive/orc_empty_files.q.out
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java?rev=1469908&r1=1469907&r2=1469908&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java Fri Apr 19 15:40:09 2013
@@ -17,6 +17,13 @@
*/
package org.apache.hadoop.hive.ql.io.orc;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -31,16 +38,9 @@ import org.apache.hadoop.hive.serde2.typ
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.io.Writable;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
final class OrcStruct implements Writable {
- private final Object[] fields;
+ private Object[] fields;
OrcStruct(int children) {
fields = new Object[children];
@@ -54,6 +54,14 @@ final class OrcStruct implements Writabl
fields[fieldIndex] = value;
}
+ public int getNumFields() {
+ return fields.length;
+ }
+
+ public void setNumFields(int numFields) {
+ fields = new Object[numFields];
+ }
+
@Override
public void write(DataOutput dataOutput) throws IOException {
throw new UnsupportedOperationException("write unsupported");
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java?rev=1469908&r1=1469907&r2=1469908&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java Fri Apr 19 15:40:09 2013
@@ -733,8 +733,9 @@ class RecordReaderImpl implements Record
} else {
length = dictionaryBuffer.size() - offset;
}
- // If the column is just empty strings, the size will be zero, so the buffer will be null,
- // in that case just return result as it will default to empty
+ // If the column is just empty strings, the size will be zero,
+ // so the buffer will be null, in that case just return result
+ // as it will default to empty
if (dictionaryBuffer != null) {
dictionaryBuffer.setText(result, offset, length);
} else {
@@ -788,6 +789,13 @@ class RecordReaderImpl implements Record
result = new OrcStruct(fields.length);
} else {
result = (OrcStruct) previous;
+
+ // If the input format was initialized with a file with a
+ // different number of fields, the number of fields needs to
+ // be updated to the correct number
+ if (result.getNumFields() != fields.length) {
+ result.setNumFields(fields.length);
+ }
}
for(int i=0; i < fields.length; ++i) {
if (fields[i] != null) {
Added: hive/trunk/ql/src/test/queries/clientpositive/orc_diff_part_cols.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/orc_diff_part_cols.q?rev=1469908&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/orc_diff_part_cols.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/orc_diff_part_cols.q Fri Apr 19 15:40:09 2013
@@ -0,0 +1,19 @@
+CREATE TABLE test_orc (key STRING)
+PARTITIONED BY (part STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
+
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+
+-- Create a table with one column, write to a partition, then add an additional column and write
+-- to another partition
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc PARTITION (part = '1') SELECT key FROM src LIMIT 5;
+
+ALTER TABLE test_orc ADD COLUMNS (cnt INT);
+
+INSERT OVERWRITE TABLE test_orc PARTITION (part = '2') SELECT key, count(*) FROM src GROUP BY key LIMIT 5;
+
+SELECT * FROM test_orc;
Added: hive/trunk/ql/src/test/queries/clientpositive/orc_empty_files.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/orc_empty_files.q?rev=1469908&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/orc_empty_files.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/orc_empty_files.q Fri Apr 19 15:40:09 2013
@@ -0,0 +1,18 @@
+CREATE TABLE test_orc (key STRING, cnt INT)
+CLUSTERED BY (key) INTO 3 BUCKETS
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
+
+set hive.enforce.bucketing=true;
+set hive.exec.reducers.max = 1;
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+
+-- Creates a table bucketed into 3 buckets, but only one contains data, specifically bucket 1,
+-- buckets 0 and 2 are empty, so this tests reading from an empty file followed by a file
+-- containing data and a file containing data followed by an empty file.
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc SELECT one, COUNT(*) FROM (SELECT 1 AS one FROM src) a GROUP BY one;
+
+SELECT count(*) FROM test_orc;
Added: hive/trunk/ql/src/test/results/clientpositive/orc_diff_part_cols.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/orc_diff_part_cols.q.out?rev=1469908&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/orc_diff_part_cols.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/orc_diff_part_cols.q.out Fri Apr 19 15:40:09 2013
@@ -0,0 +1,75 @@
+PREHOOK: query: CREATE TABLE test_orc (key STRING)
+PARTITIONED BY (part STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE test_orc (key STRING)
+PARTITIONED BY (part STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@test_orc
+PREHOOK: query: -- Create a table with one column, write to a partition, then add an additional column and write
+-- to another partition
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc PARTITION (part = '1') SELECT key FROM src LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_orc@part=1
+POSTHOOK: query: -- Create a table with one column, write to a partition, then add an additional column and write
+-- to another partition
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc PARTITION (part = '1') SELECT key FROM src LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_orc@part=1
+POSTHOOK: Lineage: test_orc PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: ALTER TABLE test_orc ADD COLUMNS (cnt INT)
+PREHOOK: type: ALTERTABLE_ADDCOLS
+PREHOOK: Input: default@test_orc
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: ALTER TABLE test_orc ADD COLUMNS (cnt INT)
+POSTHOOK: type: ALTERTABLE_ADDCOLS
+POSTHOOK: Input: default@test_orc
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: INSERT OVERWRITE TABLE test_orc PARTITION (part = '2') SELECT key, count(*) FROM src GROUP BY key LIMIT 5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_orc@part=2
+POSTHOOK: query: INSERT OVERWRITE TABLE test_orc PARTITION (part = '2') SELECT key, count(*) FROM src GROUP BY key LIMIT 5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_orc@part=2
+POSTHOOK: Lineage: test_orc PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc PARTITION(part=2).cnt EXPRESSION [(src)src.null, ]
+POSTHOOK: Lineage: test_orc PARTITION(part=2).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: SELECT * FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc
+PREHOOK: Input: default@test_orc@part=1
+PREHOOK: Input: default@test_orc@part=2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc
+POSTHOOK: Input: default@test_orc@part=1
+POSTHOOK: Input: default@test_orc@part=2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc PARTITION(part=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: test_orc PARTITION(part=2).cnt EXPRESSION [(src)src.null, ]
+POSTHOOK: Lineage: test_orc PARTITION(part=2).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+238 NULL 1
+86 NULL 1
+311 NULL 1
+27 NULL 1
+165 NULL 1
+0 3 2
+10 1 2
+100 2 2
+103 2 2
+104 2 2
Added: hive/trunk/ql/src/test/results/clientpositive/orc_empty_files.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/orc_empty_files.q.out?rev=1469908&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/orc_empty_files.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/orc_empty_files.q.out Fri Apr 19 15:40:09 2013
@@ -0,0 +1,44 @@
+PREHOOK: query: CREATE TABLE test_orc (key STRING, cnt INT)
+CLUSTERED BY (key) INTO 3 BUCKETS
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE test_orc (key STRING, cnt INT)
+CLUSTERED BY (key) INTO 3 BUCKETS
+ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@test_orc
+PREHOOK: query: -- Creates a table bucketed into 3 buckets, but only one contains data, specifically bucket 1,
+-- buckets 0 and 2 are empty, so this tests reading from an empty file followed by a file
+-- containing data and a file containing data followed by an empty file.
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc SELECT one, COUNT(*) FROM (SELECT 1 AS one FROM src) a GROUP BY one
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@test_orc
+POSTHOOK: query: -- Creates a table bucketed into 3 buckets, but only one contains data, specifically bucket 1,
+-- buckets 0 and 2 are empty, so this tests reading from an empty file followed by a file
+-- containing data and a file containing data followed by an empty file.
+-- This can produce unexpected results with CombineHiveInputFormat
+
+INSERT OVERWRITE TABLE test_orc SELECT one, COUNT(*) FROM (SELECT 1 AS one FROM src) a GROUP BY one
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@test_orc
+POSTHOOK: Lineage: test_orc.cnt EXPRESSION []
+POSTHOOK: Lineage: test_orc.key EXPRESSION []
+PREHOOK: query: SELECT count(*) FROM test_orc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(*) FROM test_orc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test_orc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: test_orc.cnt EXPRESSION []
+POSTHOOK: Lineage: test_orc.key EXPRESSION []
+1