You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by li...@apache.org on 2017/08/16 06:00:17 UTC
spark git commit: [SPARK-21422][BUILD] Depend on Apache ORC 1.4.0

Repository: spark
Updated Branches:
  refs/heads/master 07549b20a -> 8c54f1eb7


[SPARK-21422][BUILD] Depend on Apache ORC 1.4.0

## What changes were proposed in this pull request?

Like Parquet, this PR aims to depend on the latest Apache ORC 1.4 for Apache Spark 2.3. There are key benefits for Apache ORC 1.4.

- Stability: Apache ORC 1.4.0 has many fixes and we can depend on ORC community more.
- Maintainability: Reduce the Hive dependency and can remove old legacy code later.

Later, we can get the following two key benefits by adding new ORCFileFormat in SPARK-20728 (#17980), too.
- Usability: User can use ORC data sources without hive module, i.e, -Phive.
- Speed: Use both Spark ColumnarBatch and ORC RowBatch together. This will be faster than the current implementation in Spark.

## How was this patch tested?

Pass the jenkins.

Author: Dongjoon Hyun <do...@apache.org>

Closes #18640 from dongjoon-hyun/SPARK-21422.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8c54f1eb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8c54f1eb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8c54f1eb

Branch: refs/heads/master
Commit: 8c54f1eb71d7338d3bd8224a57a293a2e7875252
Parents: 07549b2
Author: Dongjoon Hyun <do...@apache.org>
Authored: Tue Aug 15 23:00:13 2017 -0700
Committer: gatorsmile <ga...@gmail.com>
Committed: Tue Aug 15 23:00:13 2017 -0700

----------------------------------------------------------------------
 assembly/pom.xml               |  6 +++++
 dev/deps/spark-deps-hadoop-2.6 |  3 +++
 dev/deps/spark-deps-hadoop-2.7 |  3 +++
 pom.xml                        | 44 +++++++++++++++++++++++++++++++++++++
 sql/core/pom.xml               | 10 +++++++++
 5 files changed, 66 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8c54f1eb/assembly/pom.xml
----------------------------------------------------------------------
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 464af16..cd8366a 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -221,6 +221,12 @@
       </properties>
     </profile>
     <profile>
+      <id>orc-provided</id>
+      <properties>
+        <orc.deps.scope>provided</orc.deps.scope>
+      </properties>
+    </profile>
+    <profile>
       <id>parquet-provided</id>
       <properties>
         <parquet.deps.scope>provided</parquet.deps.scope>

http://git-wip-us.apache.org/repos/asf/spark/blob/8c54f1eb/dev/deps/spark-deps-hadoop-2.6
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 83070a9..01af2c7 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -2,6 +2,7 @@ JavaEWAH-0.3.2.jar
 RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 activation-1.1.1.jar
+aircompressor-0.3.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.5.3.jar
@@ -148,6 +149,8 @@ netty-3.9.9.Final.jar
 netty-all-4.0.43.Final.jar
 objenesis-2.1.jar
 opencsv-2.3.jar
+orc-core-1.4.0-nohive.jar
+orc-mapreduce-1.4.0-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.6.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8c54f1eb/dev/deps/spark-deps-hadoop-2.7
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 5481e25..69f3a4b 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -2,6 +2,7 @@ JavaEWAH-0.3.2.jar
 RoaringBitmap-0.5.11.jar
 ST4-4.0.4.jar
 activation-1.1.1.jar
+aircompressor-0.3.jar
 antlr-2.7.7.jar
 antlr-runtime-3.4.jar
 antlr4-runtime-4.5.3.jar
@@ -149,6 +150,8 @@ netty-3.9.9.Final.jar
 netty-all-4.0.43.Final.jar
 objenesis-2.1.jar
 opencsv-2.3.jar
+orc-core-1.4.0-nohive.jar
+orc-mapreduce-1.4.0-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.6.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/8c54f1eb/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index d544894..c0df3ef 100644
--- a/pom.xml
+++ b/pom.xml
@@ -132,6 +132,8 @@
     <hive.version.short>1.2.1</hive.version.short>
     <derby.version>10.12.1.1</derby.version>
     <parquet.version>1.8.2</parquet.version>
+    <orc.version>1.4.0</orc.version>
+    <orc.classifier>nohive</orc.classifier>
     <hive.parquet.version>1.6.0</hive.parquet.version>
     <jetty.version>9.3.20.v20170531</jetty.version>
     <javaxservlet.version>3.1.0</javaxservlet.version>
@@ -208,6 +210,7 @@
     <flume.deps.scope>compile</flume.deps.scope>
     <hadoop.deps.scope>compile</hadoop.deps.scope>
     <hive.deps.scope>compile</hive.deps.scope>
+    <orc.deps.scope>compile</orc.deps.scope>
     <parquet.deps.scope>compile</parquet.deps.scope>
     <parquet.test.deps.scope>test</parquet.test.deps.scope>
 
@@ -1696,6 +1699,44 @@
         </exclusions>
       </dependency>
       <dependency>
+        <groupId>org.apache.orc</groupId>
+        <artifactId>orc-core</artifactId>
+        <version>${orc.version}</version>
+        <classifier>${orc.classifier}</classifier>
+        <scope>${orc.deps.scope}</scope>
+        <exclusions>
+          <exclusion>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+          </exclusion>
+          <exclusion>
+            <groupId>org.apache.hive</groupId>
+            <artifactId>hive-storage-api</artifactId>
+          </exclusion>
+        </exclusions>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.orc</groupId>
+        <artifactId>orc-mapreduce</artifactId>
+        <version>${orc.version}</version>
+        <classifier>${orc.classifier}</classifier>
+        <scope>${orc.deps.scope}</scope>
+        <exclusions>
+          <exclusion>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+          </exclusion>
+          <exclusion>
+            <groupId>org.apache.orc</groupId>
+            <artifactId>orc-core</artifactId>
+          </exclusion>
+          <exclusion>
+            <groupId>org.apache.hive</groupId>
+            <artifactId>hive-storage-api</artifactId>
+          </exclusion>
+        </exclusions>
+      </dependency>
+      <dependency>
         <groupId>org.apache.parquet</groupId>
         <artifactId>parquet-column</artifactId>
         <version>${parquet.version}</version>
@@ -2728,6 +2769,9 @@
       <id>hive-provided</id>
     </profile>
     <profile>
+      <id>orc-provided</id>
+    </profile>
+    <profile>
       <id>parquet-provided</id>
     </profile>
     <profile>

http://git-wip-us.apache.org/repos/asf/spark/blob/8c54f1eb/sql/core/pom.xml
----------------------------------------------------------------------
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index a16411e..9a3cacb 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -87,6 +87,16 @@
     </dependency>
 
     <dependency>
+      <groupId>org.apache.orc</groupId>
+      <artifactId>orc-core</artifactId>
+      <classifier>${orc.classifier}</classifier>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.orc</groupId>
+      <artifactId>orc-mapreduce</artifactId>
+      <classifier>${orc.classifier}</classifier>
+    </dependency>
+    <dependency>
       <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-column</artifactId>
     </dependency>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org