You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/09/11 05:57:40 UTC
git commit: [SQL] Add test case with workaround for reading
partitioned Avro files
Repository: spark
Updated Branches:
refs/heads/master 79cdb9b64 -> 84e2c8bfe
[SQL] Add test case with workaround for reading partitioned Avro files
In order to read from partitioned Avro files we need to also set the `SERDEPROPERTIES` since `TBLPROPERTIES` are not passed to the initialization. This PR simply adds a test to make sure we don't break this workaround.
Author: Michael Armbrust <mi...@databricks.com>
Closes #2340 from marmbrus/avroPartitioned and squashes the following commits:
6b969d6 [Michael Armbrust] fix style
fea2124 [Michael Armbrust] Add test case with workaround for reading partitioned avro files.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84e2c8bf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84e2c8bf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84e2c8bf
Branch: refs/heads/master
Commit: 84e2c8bfe41837baf2aeffa9741e4dbd14351981
Parents: 79cdb9b
Author: Michael Armbrust <mi...@databricks.com>
Authored: Wed Sep 10 20:57:38 2014 -0700
Committer: Michael Armbrust <mi...@databricks.com>
Committed: Wed Sep 10 20:57:38 2014 -0700
----------------------------------------------------------------------
.../org/apache/spark/sql/hive/TestHive.scala | 69 +++++++++++++++++++-
...AvroSerDe-0-e4501461c855cc9071a872a64186c3de | 8 +++
.../sql/hive/execution/HiveSerDeSuite.scala | 2 +
3 files changed, 78 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/84e2c8bf/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
index a013f3f..6974f3e 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TestHive.scala
@@ -269,7 +269,74 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
|)
""".stripMargin.cmd,
s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/episodes.avro")}' INTO TABLE episodes".cmd
- )
+ ),
+ // THIS TABLE IS NOT THE SAME AS THE HIVE TEST TABLE episodes_partitioned AS DYNAMIC PARTITIONING
+ // IS NOT YET SUPPORTED
+ TestTable("episodes_part",
+ s"""CREATE TABLE episodes_part (title STRING, air_date STRING, doctor INT)
+ |PARTITIONED BY (doctor_pt INT)
+ |ROW FORMAT SERDE '${classOf[AvroSerDe].getCanonicalName}'
+ |STORED AS
+ |INPUTFORMAT '${classOf[AvroContainerInputFormat].getCanonicalName}'
+ |OUTPUTFORMAT '${classOf[AvroContainerOutputFormat].getCanonicalName}'
+ |TBLPROPERTIES (
+ | 'avro.schema.literal'='{
+ | "type": "record",
+ | "name": "episodes",
+ | "namespace": "testing.hive.avro.serde",
+ | "fields": [
+ | {
+ | "name": "title",
+ | "type": "string",
+ | "doc": "episode title"
+ | },
+ | {
+ | "name": "air_date",
+ | "type": "string",
+ | "doc": "initial date"
+ | },
+ | {
+ | "name": "doctor",
+ | "type": "int",
+ | "doc": "main actor playing the Doctor in episode"
+ | }
+ | ]
+ | }'
+ |)
+ """.stripMargin.cmd,
+ // WORKAROUND: Required to pass schema to SerDe for partitioned tables.
+ // TODO: Pass this automatically from the table to partitions.
+ s"""
+ |ALTER TABLE episodes_part SET SERDEPROPERTIES (
+ | 'avro.schema.literal'='{
+ | "type": "record",
+ | "name": "episodes",
+ | "namespace": "testing.hive.avro.serde",
+ | "fields": [
+ | {
+ | "name": "title",
+ | "type": "string",
+ | "doc": "episode title"
+ | },
+ | {
+ | "name": "air_date",
+ | "type": "string",
+ | "doc": "initial date"
+ | },
+ | {
+ | "name": "doctor",
+ | "type": "int",
+ | "doc": "main actor playing the Doctor in episode"
+ | }
+ | ]
+ | }'
+ |)
+ """.stripMargin.cmd,
+ s"""
+ INSERT OVERWRITE TABLE episodes_part PARTITION (doctor_pt=1)
+ SELECT title, air_date, doctor FROM episodes
+ """.cmd
+ )
)
hiveQTestUtilTables.foreach(registerTestTable)
http://git-wip-us.apache.org/repos/asf/spark/blob/84e2c8bf/sql/hive/src/test/resources/golden/Read Partitioned with AvroSerDe-0-e4501461c855cc9071a872a64186c3de
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/Read Partitioned with AvroSerDe-0-e4501461c855cc9071a872a64186c3de b/sql/hive/src/test/resources/golden/Read Partitioned with AvroSerDe-0-e4501461c855cc9071a872a64186c3de
new file mode 100644
index 0000000..49c8434
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/Read Partitioned with AvroSerDe-0-e4501461c855cc9071a872a64186c3de
@@ -0,0 +1,8 @@
+The Eleventh Hour 3 April 2010 11 1
+The Doctor's Wife 14 May 2011 11 1
+Horror of Fang Rock 3 September 1977 4 1
+An Unearthly Child 23 November 1963 1 1
+The Mysterious Planet 6 September 1986 6 1
+Rose 26 March 2005 9 1
+The Power of the Daleks 5 November 1966 2 1
+Castrolava 4 January 1982 5 1
http://git-wip-us.apache.org/repos/asf/spark/blob/84e2c8bf/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
index 8bc7238..7486bfa 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala
@@ -37,4 +37,6 @@ class HiveSerDeSuite extends HiveComparisonTest with BeforeAndAfterAll {
createQueryTest("Read with RegexSerDe", "SELECT * FROM sales")
createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")
+
+ createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM episodes_part")
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org