You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2018/07/18 06:52:21 UTC
spark git commit: [SPARK-24576][BUILD] Upgrade Apache ORC to 1.5.2
Repository: spark
Updated Branches:
refs/heads/master fc2e18963 -> 3b59d326c
[SPARK-24576][BUILD] Upgrade Apache ORC to 1.5.2
## What changes were proposed in this pull request?
This issue aims to upgrade Apache ORC library from 1.4.4 to 1.5.2 in order to bring the following benefits into Apache Spark.
- [ORC-91](https://issues.apache.org/jira/browse/ORC-91) Support for variable length blocks in HDFS (The space currently wasted on padding in ORC is known to be 5%.)
- [ORC-344](https://issues.apache.org/jira/browse/ORC-344) Support for using Decimal64ColumnVector
In addition to that, Apache Hive 3.1 and 3.2 will use ORC 1.5.1 ([HIVE-19669](https://issues.apache.org/jira/browse/HIVE-19669)) and 1.5.2 ([HIVE-19792](https://issues.apache.org/jira/browse/HIVE-19792)) respectively. This will improve the compatibility between Apache Spark and Apache Hive by sharing the common library.
## How was this patch tested?
Passes Jenkins with all existing tests.
Author: Dongjoon Hyun <do...@apache.org>
Closes #21582 from dongjoon-hyun/SPARK-24576.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3b59d326
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3b59d326
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3b59d326
Branch: refs/heads/master
Commit: 3b59d326c77bec96e5fb856d827139e0389394ba
Parents: fc2e189
Author: Dongjoon Hyun <do...@apache.org>
Authored: Tue Jul 17 23:52:17 2018 -0700
Committer: Xiao Li <ga...@gmail.com>
Committed: Tue Jul 17 23:52:17 2018 -0700
----------------------------------------------------------------------
dev/deps/spark-deps-hadoop-2.6 | 7 ++---
dev/deps/spark-deps-hadoop-2.7 | 7 ++---
dev/deps/spark-deps-hadoop-3.1 | 7 ++---
pom.xml | 2 +-
sql/core/pom.xml | 28 ++++++++++++++++++++
.../datasources/orc/OrcFileFormat.scala | 15 ++++++++++-
.../datasources/orc/OrcSerializer.scala | 2 +-
7 files changed, 56 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/dev/deps/spark-deps-hadoop-2.6
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index f50a0aa..ff6d5c3 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar
RoaringBitmap-0.5.11.jar
ST4-4.0.4.jar
activation-1.1.1.jar
-aircompressor-0.8.jar
+aircompressor-0.10.jar
antlr-2.7.7.jar
antlr-runtime-3.4.jar
antlr4-runtime-4.7.jar
@@ -157,8 +157,9 @@ objenesis-2.1.jar
okhttp-3.8.1.jar
okio-1.13.0.jar
opencsv-2.3.jar
-orc-core-1.4.4-nohive.jar
-orc-mapreduce-1.4.4-nohive.jar
+orc-core-1.5.2-nohive.jar
+orc-mapreduce-1.5.2-nohive.jar
+orc-shims-1.5.2.jar
oro-2.0.8.jar
osgi-resource-locator-1.0.1.jar
paranamer-2.8.jar
http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/dev/deps/spark-deps-hadoop-2.7
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 774f9dc..72a94f8 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -2,7 +2,7 @@ JavaEWAH-0.3.2.jar
RoaringBitmap-0.5.11.jar
ST4-4.0.4.jar
activation-1.1.1.jar
-aircompressor-0.8.jar
+aircompressor-0.10.jar
antlr-2.7.7.jar
antlr-runtime-3.4.jar
antlr4-runtime-4.7.jar
@@ -158,8 +158,9 @@ objenesis-2.1.jar
okhttp-3.8.1.jar
okio-1.13.0.jar
opencsv-2.3.jar
-orc-core-1.4.4-nohive.jar
-orc-mapreduce-1.4.4-nohive.jar
+orc-core-1.5.2-nohive.jar
+orc-mapreduce-1.5.2-nohive.jar
+orc-shims-1.5.2.jar
oro-2.0.8.jar
osgi-resource-locator-1.0.1.jar
paranamer-2.8.jar
http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/dev/deps/spark-deps-hadoop-3.1
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1
index 19c05ad..3409dc4 100644
--- a/dev/deps/spark-deps-hadoop-3.1
+++ b/dev/deps/spark-deps-hadoop-3.1
@@ -4,7 +4,7 @@ RoaringBitmap-0.5.11.jar
ST4-4.0.4.jar
accessors-smart-1.2.jar
activation-1.1.1.jar
-aircompressor-0.8.jar
+aircompressor-0.10.jar
antlr-2.7.7.jar
antlr-runtime-3.4.jar
antlr4-runtime-4.7.jar
@@ -176,8 +176,9 @@ okhttp-2.7.5.jar
okhttp-3.8.1.jar
okio-1.13.0.jar
opencsv-2.3.jar
-orc-core-1.4.4-nohive.jar
-orc-mapreduce-1.4.4-nohive.jar
+orc-core-1.5.2-nohive.jar
+orc-mapreduce-1.5.2-nohive.jar
+orc-shims-1.5.2.jar
oro-2.0.8.jar
osgi-resource-locator-1.0.1.jar
paranamer-2.8.jar
http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 1892bbe..649221d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -131,7 +131,7 @@
<hive.version.short>1.2.1</hive.version.short>
<derby.version>10.12.1.1</derby.version>
<parquet.version>1.10.0</parquet.version>
- <orc.version>1.4.4</orc.version>
+ <orc.version>1.5.2</orc.version>
<orc.classifier>nohive</orc.classifier>
<hive.parquet.version>1.6.0</hive.parquet.version>
<jetty.version>9.3.20.v20170531</jetty.version>
http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/sql/core/pom.xml
----------------------------------------------------------------------
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index 18ae314..8873b00 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -90,11 +90,39 @@
<groupId>org.apache.orc</groupId>
<artifactId>orc-core</artifactId>
<classifier>${orc.classifier}</classifier>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ </exclusion>
+ <!--
+ orc-core:nohive doesn't have this dependency, but we add this to prevent
+ sbt from getting confused.
+ -->
+ <exclusion>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-storage-api</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>org.apache.orc</groupId>
<artifactId>orc-mapreduce</artifactId>
<classifier>${orc.classifier}</classifier>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ </exclusion>
+ <!--
+ orc-core:nohive doesn't have this dependency, but we add this to prevent
+ sbt from getting confused.
+ -->
+ <exclusion>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-storage-api</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
index 3a8c0ad..df1cebe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala
@@ -59,6 +59,19 @@ private[sql] object OrcFileFormat {
def checkFieldNames(names: Seq[String]): Unit = {
names.foreach(checkFieldName)
}
+
+ def getQuotedSchemaString(dataType: DataType): String = dataType match {
+ case _: AtomicType => dataType.catalogString
+ case StructType(fields) =>
+ fields.map(f => s"`${f.name}`:${getQuotedSchemaString(f.dataType)}")
+ .mkString("struct<", ",", ">")
+ case ArrayType(elementType, _) =>
+ s"array<${getQuotedSchemaString(elementType)}>"
+ case MapType(keyType, valueType, _) =>
+ s"map<${getQuotedSchemaString(keyType)},${getQuotedSchemaString(valueType)}>"
+ case _ => // UDT and others
+ dataType.catalogString
+ }
}
/**
@@ -93,7 +106,7 @@ class OrcFileFormat
val conf = job.getConfiguration
- conf.set(MAPRED_OUTPUT_SCHEMA.getAttribute, dataSchema.catalogString)
+ conf.set(MAPRED_OUTPUT_SCHEMA.getAttribute, OrcFileFormat.getQuotedSchemaString(dataSchema))
conf.set(COMPRESS.getAttribute, orcOptions.compressionCodec)
http://git-wip-us.apache.org/repos/asf/spark/blob/3b59d326/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala
index 899af07..90d1268 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala
@@ -223,6 +223,6 @@ class OrcSerializer(dataSchema: StructType) {
* Return a Orc value object for the given Spark schema.
*/
private def createOrcValue(dataType: DataType) = {
- OrcStruct.createValue(TypeDescription.fromString(dataType.catalogString))
+ OrcStruct.createValue(TypeDescription.fromString(OrcFileFormat.getQuotedSchemaString(dataType)))
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org