You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/06/20 03:22:38 UTC

svn commit: r1494833 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/exec/MapOperator.java java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java test/queries/clientpositive/avro_partitioned.q test/results/clientpositive/avro_partitioned.q.out

Author: hashutosh
Date: Thu Jun 20 01:22:38 2013
New Revision: 1494833

URL: http://svn.apache.org/r1494833
Log:
HIVE-3953 : Reading of partitioned Avro data fails because of missing properties (Mark Wagner via Ashutosh Chauhan)

Added:
    hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q
    hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out
Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java?rev=1494833&r1=1494832&r2=1494833&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java Thu Jun 20 01:22:38 2013
@@ -242,10 +242,12 @@ public class MapOperator extends Operato
       SerDeException {
     PartitionDesc pd = conf.getPathToPartitionInfo().get(onefile);
     LinkedHashMap<String, String> partSpec = pd.getPartSpec();
-    // Use tblProps in case of unpartitioned tables
+    // Use table properties in case of unpartitioned tables; for partitioned
+    // tables use the union of table properties and partition properties, with
+    // partition properties taking precedence
     Properties partProps =
         (pd.getPartSpec() == null || pd.getPartSpec().isEmpty()) ?
-            pd.getTableDesc().getProperties() : pd.getProperties();
+            pd.getTableDesc().getProperties() : pd.getOverlayedProperties();
 
     Class serdeclass = pd.getDeserializerClass();
     if (serdeclass == null) {
@@ -409,7 +411,7 @@ public class MapOperator extends Operato
         // If the partition does not exist, use table properties
         Properties partProps =
             (pd.getPartSpec() == null || pd.getPartSpec().isEmpty()) ?
-                tblProps : pd.getProperties();
+                tblProps : pd.getOverlayedProperties();
 
         Class sdclass = pd.getDeserializerClass();
         if (sdclass == null) {

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java?rev=1494833&r1=1494832&r2=1494833&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java Thu Jun 20 01:22:38 2013
@@ -188,6 +188,16 @@ public class PartitionDesc implements Se
     return properties;
   }
 
+  public java.util.Properties getOverlayedProperties(){
+    if (tableDesc != null) {
+      Properties overlayedProps = new Properties(tableDesc.getProperties());
+      overlayedProps.putAll(getProperties());
+      return overlayedProps;
+    } else {
+      return getProperties();
+    }
+  }
+
   public void setProperties(final java.util.Properties properties) {
     this.properties = properties;
   }

Added: hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q?rev=1494833&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/avro_partitioned.q Thu Jun 20 01:22:38 2013
@@ -0,0 +1,66 @@
+-- verify that new joins bring in correct schemas (including evolved schemas)
+CREATE TABLE episodes
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+  "namespace": "testing.hive.avro.serde",
+  "name": "episodes",
+  "type": "record",
+  "fields": [
+    {
+      "name":"title",
+      "type":"string",
+      "doc":"episode title"
+    },
+    {
+      "name":"air_date",
+      "type":"string",
+      "doc":"initial date"
+    },
+    {
+      "name":"doctor",
+      "type":"int",
+      "doc":"main actor playing the Doctor in episode"
+    }
+  ]
+}');
+
+LOAD DATA LOCAL INPATH '../data/files/episodes.avro' INTO TABLE episodes;
+
+CREATE TABLE episodes_partitioned
+PARTITIONED BY (doctor_pt INT)
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+  "namespace": "testing.hive.avro.serde",
+  "name": "episodes",
+  "type": "record",
+  "fields": [
+    {
+      "name":"title",
+      "type":"string",
+      "doc":"episode title"
+    },
+    {
+      "name":"air_date",
+      "type":"string",
+      "doc":"initial date"
+    },
+    {
+      "name":"doctor",
+      "type":"int",
+      "doc":"main actor playing the Doctor in episode"
+    }
+  ]
+}');
+
+SET hive.exec.dynamic.partition.mode=nonstrict;
+INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes;
+
+SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 ORDER BY air_date;

Added: hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out?rev=1494833&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/avro_partitioned.q.out Thu Jun 20 01:22:38 2013
@@ -0,0 +1,199 @@
+PREHOOK: query: -- verify that new joins bring in correct schemas (including evolved schemas)
+CREATE TABLE episodes
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+  "namespace": "testing.hive.avro.serde",
+  "name": "episodes",
+  "type": "record",
+  "fields": [
+    {
+      "name":"title",
+      "type":"string",
+      "doc":"episode title"
+    },
+    {
+      "name":"air_date",
+      "type":"string",
+      "doc":"initial date"
+    },
+    {
+      "name":"doctor",
+      "type":"int",
+      "doc":"main actor playing the Doctor in episode"
+    }
+  ]
+}')
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- verify that new joins bring in correct schemas (including evolved schemas)
+CREATE TABLE episodes
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+  "namespace": "testing.hive.avro.serde",
+  "name": "episodes",
+  "type": "record",
+  "fields": [
+    {
+      "name":"title",
+      "type":"string",
+      "doc":"episode title"
+    },
+    {
+      "name":"air_date",
+      "type":"string",
+      "doc":"initial date"
+    },
+    {
+      "name":"doctor",
+      "type":"int",
+      "doc":"main actor playing the Doctor in episode"
+    }
+  ]
+}')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@episodes
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/episodes.avro' INTO TABLE episodes
+PREHOOK: type: LOAD
+PREHOOK: Output: default@episodes
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/episodes.avro' INTO TABLE episodes
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@episodes
+PREHOOK: query: CREATE TABLE episodes_partitioned
+PARTITIONED BY (doctor_pt INT)
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+  "namespace": "testing.hive.avro.serde",
+  "name": "episodes",
+  "type": "record",
+  "fields": [
+    {
+      "name":"title",
+      "type":"string",
+      "doc":"episode title"
+    },
+    {
+      "name":"air_date",
+      "type":"string",
+      "doc":"initial date"
+    },
+    {
+      "name":"doctor",
+      "type":"int",
+      "doc":"main actor playing the Doctor in episode"
+    }
+  ]
+}')
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE episodes_partitioned
+PARTITIONED BY (doctor_pt INT)
+ROW FORMAT
+SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+STORED AS
+INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+TBLPROPERTIES ('avro.schema.literal'='{
+  "namespace": "testing.hive.avro.serde",
+  "name": "episodes",
+  "type": "record",
+  "fields": [
+    {
+      "name":"title",
+      "type":"string",
+      "doc":"episode title"
+    },
+    {
+      "name":"air_date",
+      "type":"string",
+      "doc":"initial date"
+    },
+    {
+      "name":"doctor",
+      "type":"int",
+      "doc":"main actor playing the Doctor in episode"
+    }
+  ]
+}')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@episodes_partitioned
+PREHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes
+PREHOOK: Output: default@episodes_partitioned
+POSTHOOK: query: INSERT OVERWRITE TABLE episodes_partitioned PARTITION (doctor_pt) SELECT title, air_date, doctor, doctor as doctor_pt FROM episodes
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=1
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=2
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=4
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=5
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=6
+POSTHOOK: Output: default@episodes_partitioned@doctor_pt=9
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+PREHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 ORDER BY air_date
+PREHOOK: type: QUERY
+PREHOOK: Input: default@episodes_partitioned
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=11
+PREHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM episodes_partitioned WHERE doctor_pt > 6 ORDER BY air_date
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@episodes_partitioned
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=11
+POSTHOOK: Input: default@episodes_partitioned@doctor_pt=9
+#### A masked pattern was here ####
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=11).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=1).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=2).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=4).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=5).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=6).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).air_date SIMPLE [(episodes)episodes.FieldSchema(name:air_date, type:string, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).doctor SIMPLE [(episodes)episodes.FieldSchema(name:doctor, type:int, comment:from deserializer), ]
+POSTHOOK: Lineage: episodes_partitioned PARTITION(doctor_pt=9).title SIMPLE [(episodes)episodes.FieldSchema(name:title, type:string, comment:from deserializer), ]
+The Doctor's Wife	14 May 2011	11	11
+Rose	26 March 2005	9	9
+The Eleventh Hour	3 April 2010	11	11