You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/03/16 09:16:49 UTC
[spark] branch branch-3.0 updated: [SPARK-31076][SQL][FOLLOWUP]
Incapsulate date rebasing to `DaysWritable`
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 5902a23 [SPARK-31076][SQL][FOLLOWUP] Incapsulate date rebasing to `DaysWritable`
5902a23 is described below
commit 5902a232c14516f893895be0d673354933d8a5d0
Author: Maxim Gekk <ma...@gmail.com>
AuthorDate: Mon Mar 16 17:06:15 2020 +0800
[SPARK-31076][SQL][FOLLOWUP] Incapsulate date rebasing to `DaysWritable`
### What changes were proposed in this pull request?
Move the code related to rebasing days from/to the Julian calendar from `HiveInspectors` to the new class `DaysWritable`.
### Why are the changes needed?
To improve the maintainability of the `HiveInspectors` trait, which is already pretty complex.
### Does this PR introduce any user-facing change?
No
### How was this patch tested?
By `HiveOrcHadoopFsRelationSuite`.
Closes #27890 from MaxGekk/replace-DateWritable-by-DaysWritable.
Authored-by: Maxim Gekk <ma...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
(cherry picked from commit 57854c736c2ca495eb03962f61857e1600864e95)
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../org/apache/spark/sql/hive/DaysWritable.scala | 117 +++++++++++++++++++++
.../org/apache/spark/sql/hive/HiveInspectors.scala | 58 +---------
2 files changed, 120 insertions(+), 55 deletions(-)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
new file mode 100644
index 0000000..53a0deb
--- /dev/null
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.hive
+
+import java.io.{DataInput, DataOutput, IOException}
+import java.sql.Date
+import java.time.LocalDate
+import java.util.Calendar
+
+import org.apache.hadoop.hive.serde2.io.DateWritable
+import org.apache.hadoop.io.WritableUtils
+
+import org.apache.spark.sql.catalyst.util.{DateTimeConstants, DateTimeUtils}
+
+/**
+ * The class accepts/returns days in the Gregorian calendar and rebases them
+ * via conversion to a local date in the Julian calendar for dates before 1582-10-15
+ * in read/write for backward compatibility with Spark 2.4 and earlier versions.
+ *
+ * Everything this class overrides from `DateWritable` (`getDays`, `get`, `write`,
+ * `readFields`) works with `julianDays`; `gregorianDays` is exposed only as a field.
+ *
+ * NOTE(review): no other `DateWritable` member is overridden here and there is no
+ * no-arg constructor -- confirm that inherited setters and reflective instantiation
+ * are never used with this class, since they would not keep both fields in sync.
+ *
+ * @param gregorianDays The number of days since the epoch 1970-01-01 in
+ * Gregorian calendar.
+ * @param julianDays The number of days since the epoch 1970-01-01 in
+ * Julian calendar.
+ */
+private[hive] class DaysWritable(
+ var gregorianDays: Int,
+ var julianDays: Int)
+ extends DateWritable {
+
+ def this(gregorianDays: Int) = // julianDays is derived by rebasing gregorianDays
+ this(gregorianDays, DaysWritable.rebaseGregorianToJulianDays(gregorianDays))
+ def this(dateWritable: DateWritable) = { // wraps an existing writable
+ this(
+ gregorianDays = dateWritable match {
+ case daysWritable: DaysWritable => daysWritable.gregorianDays // reuse the stored value
+ case dateWritable: DateWritable => // any other writable: rebase what getDays reports
+ DaysWritable.rebaseJulianToGregorianDays(dateWritable.getDays)
+ },
+ julianDays = dateWritable.getDays)
+ }
+
+ override def getDays: Int = julianDays // callers of the DateWritable API see julianDays
+ override def get(): Date = new Date(DateWritable.daysToMillis(julianDays))
+
+ @throws[IOException]
+ override def write(out: DataOutput): Unit = { // only julianDays is serialized
+ WritableUtils.writeVInt(out, julianDays)
+ }
+
+ @throws[IOException]
+ override def readFields(in: DataInput): Unit = { // reads julianDays, recomputes gregorianDays
+ julianDays = WritableUtils.readVInt(in)
+ gregorianDays = DaysWritable.rebaseJulianToGregorianDays(julianDays)
+ }
+}
+
+private[hive] object DaysWritable {
+  // Rebasing days since the epoch to store the same number of days
+  // as Spark 2.4 and earlier versions did. Spark 3.0 switched to the
+  // Proleptic Gregorian calendar (see SPARK-26651), and as a consequence of that,
+  // this affects dates before 1582-10-15. Spark 2.4 and earlier versions use
+  // the Julian calendar for dates before 1582-10-15. So, the same local date may
+  // be mapped to a different number of days since the epoch in different calendars.
+  // For example:
+  // Proleptic Gregorian calendar: 1582-01-01 -> -141714
+  // Julian calendar: 1582-01-01 -> -141704
+
+  /**
+   * Rebases a day number of the Proleptic Gregorian calendar to the day number that
+   * the same local date (year-month-day) has in the hybrid Julian/Gregorian calendar,
+   * e.g. it converts -141714 to -141704. Days at or after the cutover are returned as is.
+   *
+   * @param daysSinceEpoch The number of days since 1970-01-01 in Proleptic Gregorian calendar.
+   * @return The number of days since 1970-01-01 of the same local date in Julian calendar.
+   */
+  def rebaseGregorianToJulianDays(daysSinceEpoch: Int): Int = {
+    if (daysSinceEpoch < DateTimeUtils.GREGORIAN_CUTOVER_DAY) {
+      // Take the local date as Proleptic Gregorian calendar sees it ...
+      val localDate = LocalDate.ofEpochDay(daysSinceEpoch)
+      // ... and ask the hybrid calendar ("gregory" is Julian before the cutover) for
+      // the instant of the same year-month-day. `Calendar` months are 0-based.
+      val utcCal = new Calendar.Builder()
+        .setCalendarType("gregory")
+        .setTimeZone(DateTimeUtils.TimeZoneUTC)
+        .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth)
+        .build()
+      // floorDiv keeps the rounding direction correct for negative millis.
+      Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, DateTimeConstants.MILLIS_PER_DAY))
+    } else {
+      daysSinceEpoch
+    }
+  }
+
+  /**
+   * The inverse of `rebaseGregorianToJulianDays`: reads the local date of the given
+   * day number in the hybrid Julian/Gregorian calendar and returns the day number of
+   * that local date in Proleptic Gregorian calendar, e.g. it converts -141704 to
+   * -141714. Days at or after the cutover are returned as is.
+   *
+   * @param daysSinceEpoch The number of days since 1970-01-01 in Julian calendar.
+   * @return The number of days since 1970-01-01 of the same local date in
+   *         Proleptic Gregorian calendar.
+   */
+  def rebaseJulianToGregorianDays(daysSinceEpoch: Int): Int = {
+    if (daysSinceEpoch < JULIAN_CUTOVER_DAY) {
+      val millis = Math.multiplyExact(daysSinceEpoch, DateTimeConstants.MILLIS_PER_DAY)
+      val utcCal = new Calendar.Builder()
+        .setCalendarType("gregory")
+        .setTimeZone(DateTimeUtils.TimeZoneUTC)
+        .setInstant(millis)
+        .build()
+      // 29 February exists in Julian leap years such as 1500 that are not leap years
+      // in Proleptic Gregorian calendar, and `LocalDate.of` rejects such a date. Start
+      // from the first day of the month and add the remaining days instead, which maps
+      // that day to 1 March. `Calendar` months are 0-based, `LocalDate` months 1-based.
+      val localDate = LocalDate
+        .of(utcCal.get(Calendar.YEAR), utcCal.get(Calendar.MONTH) + 1, 1)
+        .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1)
+      Math.toIntExact(localDate.toEpochDay)
+    } else {
+      daysSinceEpoch
+    }
+  }
+
+  // The argument is not below GREGORIAN_CUTOVER_DAY, so the call returns it unchanged:
+  // the first day of the Gregorian calendar has the same day number in both calendars.
+  final val JULIAN_CUTOVER_DAY =
+    rebaseGregorianToJulianDays(DateTimeUtils.GREGORIAN_CUTOVER_DAY.toInt)
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index e217c52..e3e9a31 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -18,8 +18,6 @@
package org.apache.spark.sql.hive
import java.lang.reflect.{ParameterizedType, Type, WildcardType}
-import java.time.LocalDate
-import java.util.Calendar
import scala.collection.JavaConverters._
@@ -182,33 +180,6 @@ import org.apache.spark.unsafe.types.UTF8String
*/
private[hive] trait HiveInspectors {
- private final val JULIAN_CUTOVER_DAY =
- rebaseGregorianToJulianDays(DateTimeUtils.GREGORIAN_CUTOVER_DAY.toInt)
-
- private def rebaseJulianToGregorianDays(daysSinceEpoch: Int): Int = {
- val localDate = LocalDate.ofEpochDay(daysSinceEpoch)
- val utcCal = new Calendar.Builder()
- .setCalendarType("gregory")
- .setTimeZone(DateTimeUtils.TimeZoneUTC)
- .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth)
- .build()
- Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, DateTimeConstants.MILLIS_PER_DAY))
- }
-
- private def rebaseGregorianToJulianDays(daysSinceEpoch: Int): Int = {
- val millis = Math.multiplyExact(daysSinceEpoch, DateTimeConstants.MILLIS_PER_DAY)
- val utcCal = new Calendar.Builder()
- .setCalendarType("gregory")
- .setTimeZone(DateTimeUtils.TimeZoneUTC)
- .setInstant(millis)
- .build()
- val localDate = LocalDate.of(
- utcCal.get(Calendar.YEAR),
- utcCal.get(Calendar.MONTH) + 1,
- utcCal.get(Calendar.DAY_OF_MONTH))
- Math.toIntExact(localDate.toEpochDay)
- }
-
def javaTypeToDataType(clz: Type): DataType = clz match {
// writable
case c: Class[_] if c == classOf[hadoopIo.DoubleWritable] => DoubleType
@@ -646,14 +617,7 @@ private[hive] trait HiveInspectors {
case x: DateObjectInspector if x.preferWritable() =>
data: Any => {
if (data != null) {
- // Rebasing written days via conversion to local dates.
- // See the comment for `getDateWritable()`.
- val daysSinceEpoch = x.getPrimitiveWritableObject(data).getDays
- if (daysSinceEpoch < JULIAN_CUTOVER_DAY) {
- rebaseJulianToGregorianDays(daysSinceEpoch)
- } else {
- daysSinceEpoch
- }
+ new DaysWritable(x.getPrimitiveWritableObject(data)).gregorianDays
} else {
null
}
@@ -1045,27 +1009,11 @@ private[hive] trait HiveInspectors {
new hadoopIo.BytesWritable(value.asInstanceOf[Array[Byte]])
}
- private def getDateWritable(value: Any): hiveIo.DateWritable =
+ private def getDateWritable(value: Any): DaysWritable =
if (value == null) {
null
} else {
- // Rebasing days since the epoch to store the same number of days
- // as by Spark 2.4 and earlier versions. Spark 3.0 switched to
- // Proleptic Gregorian calendar (see SPARK-26651), and as a consequence of that,
- // this affects dates before 1582-10-15. Spark 2.4 and earlier versions use
- // Julian calendar for dates before 1582-10-15. So, the same local date may
- // be mapped to different number of days since the epoch in different calendars.
- // For example:
- // Proleptic Gregorian calendar: 1582-01-01 -> -141714
- // Julian calendar: 1582-01-01 -> -141704
- // The code below converts -141714 to -141704.
- val daysSinceEpoch = value.asInstanceOf[Int]
- val rebasedDays = if (daysSinceEpoch < DateTimeUtils.GREGORIAN_CUTOVER_DAY) {
- rebaseGregorianToJulianDays(daysSinceEpoch)
- } else {
- daysSinceEpoch
- }
- new hiveIo.DateWritable(rebasedDays)
+ new DaysWritable(value.asInstanceOf[Int])
}
private def getTimestampWritable(value: Any): hiveIo.TimestampWritable =
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org