You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2018/05/24 03:34:18 UTC
spark git commit: [SPARK-24322][BUILD] Upgrade Apache ORC to 1.4.4
Repository: spark
Updated Branches:
refs/heads/master 888340151 -> 486ecc680
[SPARK-24322][BUILD] Upgrade Apache ORC to 1.4.4
## What changes were proposed in this pull request?
ORC 1.4.4 includes [nine fixes](https://issues.apache.org/jira/issues/?filter=12342568&jql=project%20%3D%20ORC%20AND%20resolution%20%3D%20Fixed%20AND%20fixVersion%20%3D%201.4.4). One of the issues is about a `Timestamp` bug (ORC-306) which occurs when the `native` ORC vectorized reader reads an ORC column vector's sub-vectors `times` and `nanos`. ORC-306 fixes this according to the [original definition](https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java#L45-L46) and this PR includes the updated interpretation of ORC column vectors. Note that the `hive` ORC reader and the ORC MR reader are not affected.
```scala
scala> spark.version
res0: String = 2.3.0
scala> spark.sql("set spark.sql.orc.impl=native")
scala> Seq(java.sql.Timestamp.valueOf("1900-05-05 12:34:56.000789")).toDF().write.orc("/tmp/orc")
scala> spark.read.orc("/tmp/orc").show(false)
+--------------------------+
|value |
+--------------------------+
|1900-05-05 12:34:55.000789|
+--------------------------+
```
This PR aims to update Apache Spark to use it.
**FULL LIST**
ID | TITLE
-- | --
ORC-281 | Fix compiler warnings from clang 5.0
ORC-301 | `extractFileTail` should open a file in `try` statement
ORC-304 | Fix TestRecordReaderImpl to not fail with new storage-api
ORC-306 | Fix incorrect workaround for bug in java.sql.Timestamp
ORC-324 | Add support for ARM and PPC arch
ORC-330 | Remove unnecessary Hive artifacts from root pom
ORC-332 | Add syntax version to orc_proto.proto
ORC-336 | Remove avro and parquet dependency management entries
ORC-360 | Implement error checking on subtype fields in Java
## How was this patch tested?
Pass the Jenkins.
Author: Dongjoon Hyun <do...@apache.org>
Closes #21372 from dongjoon-hyun/SPARK_ORC144.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/486ecc68
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/486ecc68
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/486ecc68
Branch: refs/heads/master
Commit: 486ecc680e9a0e7b6b3c3a45fb883a61072096fc
Parents: 8883401
Author: Dongjoon Hyun <do...@apache.org>
Authored: Thu May 24 11:34:13 2018 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Thu May 24 11:34:13 2018 +0800
----------------------------------------------------------------------
dev/deps/spark-deps-hadoop-2.6 | 4 ++--
dev/deps/spark-deps-hadoop-2.7 | 4 ++--
dev/deps/spark-deps-hadoop-3.1 | 4 ++--
pom.xml | 2 +-
.../sql/execution/datasources/orc/OrcColumnVector.java | 2 +-
.../execution/datasources/orc/OrcColumnarBatchReader.java | 2 +-
.../sql/execution/datasources/orc/OrcSourceSuite.scala | 9 +++++++++
7 files changed, 18 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/486ecc68/dev/deps/spark-deps-hadoop-2.6
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index e710e26..723180a 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -157,8 +157,8 @@ objenesis-2.1.jar
okhttp-3.8.1.jar
okio-1.13.0.jar
opencsv-2.3.jar
-orc-core-1.4.3-nohive.jar
-orc-mapreduce-1.4.3-nohive.jar
+orc-core-1.4.4-nohive.jar
+orc-mapreduce-1.4.4-nohive.jar
oro-2.0.8.jar
osgi-resource-locator-1.0.1.jar
paranamer-2.8.jar
http://git-wip-us.apache.org/repos/asf/spark/blob/486ecc68/dev/deps/spark-deps-hadoop-2.7
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 97ad17a..ea08a00 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -158,8 +158,8 @@ objenesis-2.1.jar
okhttp-3.8.1.jar
okio-1.13.0.jar
opencsv-2.3.jar
-orc-core-1.4.3-nohive.jar
-orc-mapreduce-1.4.3-nohive.jar
+orc-core-1.4.4-nohive.jar
+orc-mapreduce-1.4.4-nohive.jar
oro-2.0.8.jar
osgi-resource-locator-1.0.1.jar
paranamer-2.8.jar
http://git-wip-us.apache.org/repos/asf/spark/blob/486ecc68/dev/deps/spark-deps-hadoop-3.1
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1
index e21bfef..da87402 100644
--- a/dev/deps/spark-deps-hadoop-3.1
+++ b/dev/deps/spark-deps-hadoop-3.1
@@ -176,8 +176,8 @@ okhttp-2.7.5.jar
okhttp-3.8.1.jar
okio-1.13.0.jar
opencsv-2.3.jar
-orc-core-1.4.3-nohive.jar
-orc-mapreduce-1.4.3-nohive.jar
+orc-core-1.4.4-nohive.jar
+orc-mapreduce-1.4.4-nohive.jar
oro-2.0.8.jar
osgi-resource-locator-1.0.1.jar
paranamer-2.8.jar
http://git-wip-us.apache.org/repos/asf/spark/blob/486ecc68/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 6e37e51..883c096 100644
--- a/pom.xml
+++ b/pom.xml
@@ -130,7 +130,7 @@
<hive.version.short>1.2.1</hive.version.short>
<derby.version>10.12.1.1</derby.version>
<parquet.version>1.10.0</parquet.version>
- <orc.version>1.4.3</orc.version>
+ <orc.version>1.4.4</orc.version>
<orc.classifier>nohive</orc.classifier>
<hive.parquet.version>1.6.0</hive.parquet.version>
<jetty.version>9.3.20.v20170531</jetty.version>
http://git-wip-us.apache.org/repos/asf/spark/blob/486ecc68/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java
index 12f4d65..9bfad1e 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java
@@ -136,7 +136,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto
public long getLong(int rowId) {
int index = getRowIndex(rowId);
if (isTimestamp) {
- return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000;
+ return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000 % 1000;
} else {
return longData.vector[index];
}
http://git-wip-us.apache.org/repos/asf/spark/blob/486ecc68/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java
----------------------------------------------------------------------
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java
index dcebdc3..a0d9578 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java
@@ -497,7 +497,7 @@ public class OrcColumnarBatchReader extends RecordReader<Void, ColumnarBatch> {
* Returns the number of micros since epoch from an element of TimestampColumnVector.
*/
private static long fromTimestampColumnVector(TimestampColumnVector vector, int index) {
- return vector.time[index] * 1000L + vector.nanos[index] / 1000L;
+ return vector.time[index] * 1000 + (vector.nanos[index] / 1000 % 1000);
}
/**
http://git-wip-us.apache.org/repos/asf/spark/blob/486ecc68/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
index 8a3bbd0..02bfb71 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
@@ -18,6 +18,7 @@
package org.apache.spark.sql.execution.datasources.orc
import java.io.File
+import java.sql.Timestamp
import java.util.Locale
import org.apache.orc.OrcConf.COMPRESS
@@ -169,6 +170,14 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
}
}
}
+
+ test("SPARK-24322 Fix incorrect workaround for bug in java.sql.Timestamp") {
+ withTempPath { path =>
+ val ts = Timestamp.valueOf("1900-05-05 12:34:56.000789")
+ Seq(ts).toDF.write.orc(path.getCanonicalPath)
+ checkAnswer(spark.read.orc(path.getCanonicalPath), Row(ts))
+ }
+ }
}
class OrcSourceSuite extends OrcSuite with SharedSQLContext {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org