You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/06/20 03:22:38 UTC
svn commit: r1494833 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/exec/MapOperator.java
java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java
test/queries/clientpositive/avro_partitioned.q
test/results/clientpositive/avro_partitioned.q.out
Author: hashutosh
Date: Thu Jun 20 01:22:38 2013
New Revision: 1494833
URL: http://svn.apache.org/r1494833
Log:
HIVE-3953 : Reading of partitioned Avro data fails because of missing properties (Mark Wagner via Ashutosh Chauhan)
Added:
hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q
hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java?rev=1494833&r1=1494832&r2=1494833&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java Thu Jun 20 01:22:38 2013
@@ -242,10 +242,12 @@ public class MapOperator extends Operato
SerDeException {
PartitionDesc pd = conf.getPathToPartitionInfo().get(onefile);
LinkedHashMap<String, String> partSpec = pd.getPartSpec();
- // Use tblProps in case of unpartitioned tables
+ // Use table properties in case of unpartitioned tables,
+ // and the union of table properties and partition properties, with partition
+ // taking precedence
Properties partProps =
(pd.getPartSpec() == null || pd.getPartSpec().isEmpty()) ?
- pd.getTableDesc().getProperties() : pd.getProperties();
+ pd.getTableDesc().getProperties() : pd.getOverlayedProperties();
Class serdeclass = pd.getDeserializerClass();
if (serdeclass == null) {
@@ -409,7 +411,7 @@ public class MapOperator extends Operato
// If the partition does not exist, use table properties
Properties partProps =
(pd.getPartSpec() == null || pd.getPartSpec().isEmpty()) ?
- tblProps : pd.getProperties();
+ tblProps : pd.getOverlayedProperties();
Class sdclass = pd.getDeserializerClass();
if (sdclass == null) {
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java?rev=1494833&r1=1494832&r2=1494833&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java Thu Jun 20 01:22:38 2013
@@ -188,6 +188,16 @@ public class PartitionDesc implements Se
return properties;
}
+ /**
+ * Returns the partition properties layered on top of the table properties.
+ * <p>
+ * When {@code tableDesc} is non-null, a new {@code Properties} object is created with the
+ * table properties as its defaults, and the partition properties are copied into it, so a
+ * key present in both resolves to the partition value. When {@code tableDesc} is null, the
+ * result of {@link #getProperties()} is returned as is.
+ * <p>
+ * NOTE(review): table-level values are held only as {@code Properties} defaults. They are
+ * found by {@code getProperty} and {@code stringPropertyNames}, but not by the inherited
+ * {@code Hashtable} methods such as {@code get}, {@code keySet} or {@code entrySet}.
+ *
+ * @return the overlaid properties, or the partition properties alone when there is no
+ * table descriptor
+ */
+ public java.util.Properties getOverlayedProperties(){
+ if (tableDesc != null) {
+ // Table properties act as defaults; they are consulted only when a key is absent below.
+ Properties overlayedProps = new Properties(tableDesc.getProperties());
+ // Partition properties are stored as real entries, so they take precedence.
+ overlayedProps.putAll(getProperties());
+ return overlayedProps;
+ } else {
+ // No table descriptor to fall back on: use the partition properties alone.
+ return getProperties();
+ }
+ }
+
/**
 * Replaces the properties held by this descriptor.
 * The given object is stored by reference; no copy is made.
 *
 * @param properties the properties object to hold
 */
public void setProperties(final java.util.Properties properties) {
this.properties = properties;
}
Added: hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q?rev=1494833&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q Thu Jun 20 01:22:38 2013
@@ -0,0 +1,66 @@
+-- verify that new joins bring in correct schemas (including evolved schemas)
+CREATE TABLE episodes
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+ "namespace": "testing.hive.avro.serde",
+ "name": "episodes",
+ "type": "record",
+ "fields": [
+ {
+ "name":"title",
+ "type":"string",
+ "doc":"episode title"
+ },
+ {
+ "name":"air_date",
+ "type":"string",
+ "doc":"initial date"
+ },
+ {
+ "name":"doctor",
+ "type":"int",
+ "doc":"main actor playing the Doctor in episode"
+ }
+ ]
+}');
+
+LOAD DATA LOCAL INPATH '../data/files/episodes.avro' INTO TABLE episodes;
+
+CREATE TABLE episodes_partitioned
+PARTITIONED BY (doctor_pt INT)
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+ "namespace": "testing.hive.avro.serde",
+ "name": "episodes",
+ "type": "record",
+ "fields": [
+ {
+ "name":"title",
+ "type":"string",
+ "doc":"episode title"
+ },
+ {
+ "name":"air_date",
+ "type":"string",
+ "doc":"initial date"
+ },
+ {
+ "name":"doctor",
+ "type":"int",
+ "doc":"main actor playing the Doctor in episode"
+ }
+ ]
+}');
+
+SET hive.exec.dynamic.partition.mode=nonstrict;
+INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes;
+
+SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 ORDER BY air_date;
Added: hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out?rev=1494833&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out Thu Jun 20 01:22:38 2013
@@ -0,0 +1,199 @@
+PREHOOK: query: -- verify that new joins bring in correct schemas (including evolved schemas)
+CREATE TABLE episodes
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+ "namespace": "testing.hive.avro.serde",
+ "name": "episodes",
+ "type": "record",
+ "fields": [
+ {
+ "name":"title",
+ "type":"string",
+ "doc":"episode title"
+ },
+ {
+ "name":"air_date",
+ "type":"string",
+ "doc":"initial date"
+ },
+ {
+ "name":"doctor",
+ "type":"int",
+ "doc":"main actor playing the Doctor in episode"
+ }
+ ]
+}')
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- verify that new joins bring in correct schemas (including evolved schemas)
+CREATE TABLE episodes
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+ "namespace": "testing.hive.avro.serde",
+ "name": "episodes",
+ "type": "record",
+ "fields": [
+ {
+ "name":"title",
+ "type":"string",
+ "doc":"episode title"
+ },
+ {
+ "name":"air_date",
+ "type":"string",
+ "doc":"initial date"
+ },
+ {
+ "name":"doctor",
+ "type":"int",
+ "doc":"main actor playing the Doctor in episode"
+ }
+ ]
+}')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@episodes
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/episodes.avro' INTO TABLE episodes
+PREHOOK: type: LOAD
+PREHOOK: Output: default@episodes
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/episodes.avro' INTO TABLE episodes
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@episodes
+PREHOOK: query: CREATE TABLE episodes_partitioned
+PARTITIONED BY (doctor_pt INT)
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+ "namespace": "testing.hive.avro.serde",
+ "name": "episodes",
+ "type": "record",
+ "fields": [
+ {
+ "name":"title",
+ "type":"string",
+ "doc":"episode title"
+ },
+ {
+ "name":"air_date",
+ "type":"string",
+ "doc":"initial date"
+ },
+ {
+ "name":"doctor",
+ "type":"int",
+ "doc":"main actor playing the Doctor in episode"
+ }
+ ]
+}')
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE episodes_partitioned
+PARTITIONED BY (doctor_pt INT)
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+ "namespace": "testing.hive.avro.serde",
+ "name": "episodes",
+ "type": "record",
+ "fields": [
+ {
+ "name":"title",
+ "type":"string",
+ "doc":"episode title"
+ },
+ {
+ "name":"air_date",
+ "type":"string",
+ "doc":"initial date"
+ },
+ {
+ "name":"doctor",
+ "type":"int",
+ "doc":"main actor playing the Doctor in episode"
+ }
+ ]
+}')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@episodes_partitioned
+PREHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes
+PREHOOK: Output: default@episodes_partitioned
+POSTHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=1
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=2
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=4
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=5
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=6
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=9
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+PREHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 ORDER BY air_date
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=11
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 ORDER BY air_date
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+The Doctor's Wife 14 May 2011 11 11
+Rose 26 March 2005 9 9
+The Eleventh Hour 3 April 2010 11 11