You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jc...@apache.org on 2020/02/28 18:39:29 UTC

[hive] branch master updated: HIVE-22840: Race condition in formatters of TimestampColumnVector and DateColumnVector (Shubham Chaurasia, reviewed by Jesus Camacho Rodriguez)

This is an automated email from the ASF dual-hosted git repository.

jcamacho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new c072847  HIVE-22840: Race condition in formatters of TimestampColumnVector and DateColumnVector (Shubham Chaurasia, reviewed by Jesus Camacho Rodriguez)
c072847 is described below

commit c0728470eb4fbe446b7a1da711913ca55737e967
Author: Shubham Chaurasia <sc...@cloudera.com>
AuthorDate: Fri Feb 28 10:38:37 2020 -0800

    HIVE-22840: Race condition in formatters of TimestampColumnVector and DateColumnVector (Shubham Chaurasia, reviewed by Jesus Camacho Rodriguez)
    
    Close apache/hive#922
---
 ql/pom.xml                                         |  4 ++
 serde/pom.xml                                      |  4 ++
 .../hadoop/hive/common/type/CalendarUtils.java     | 41 ++++++++++++++-------
 .../hive/ql/exec/vector/DateColumnVector.java      | 42 +++------------------
 .../hive/ql/exec/vector/TimestampColumnVector.java | 34 ++---------------
 .../hive/ql/exec/vector/TestDateColumnVector.java  | 43 ++++++++++++++++++++++
 .../ql/exec/vector/TestTimestampColumnVector.java  | 36 ++++++++++++++++++
 7 files changed, 123 insertions(+), 81 deletions(-)

diff --git a/ql/pom.xml b/ql/pom.xml
index 8b0c02b..d5c83b6 100644
--- a/ql/pom.xml
+++ b/ql/pom.xml
@@ -98,6 +98,10 @@
       <artifactId>hive-spark-client</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hive</groupId>
+      <artifactId>hive-storage-api</artifactId>
+    </dependency>
     <!-- inter-project -->
     <dependency>
       <groupId>com.esotericsoftware</groupId>
diff --git a/serde/pom.xml b/serde/pom.xml
index 10fa1b7..ca4fee2 100644
--- a/serde/pom.xml
+++ b/serde/pom.xml
@@ -49,6 +49,10 @@
       <artifactId>hive-shims</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hive</groupId>
+      <artifactId>hive-storage-api</artifactId>
+    </dependency>
     <!-- inter-project -->
     <dependency>
       <groupId>com.google.code.findbugs</groupId>
diff --git a/common/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java b/storage-api/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java
similarity index 84%
rename from common/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java
rename to storage-api/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java
index 9b491d0..7555d16 100644
--- a/common/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/common/type/CalendarUtils.java
@@ -37,9 +37,15 @@ import java.util.concurrent.TimeUnit;
  */
 public class CalendarUtils {
 
-  private static SimpleDateFormat createFormatter(String fmt,
-                                                  GregorianCalendar calendar) {
+  public static final long SWITCHOVER_MILLIS;
+  public static final long SWITCHOVER_DAYS;
+
+  private static SimpleDateFormat createFormatter(String fmt, boolean proleptic) {
     SimpleDateFormat result = new SimpleDateFormat(fmt);
+    GregorianCalendar calendar = new GregorianCalendar(UTC);
+    if (proleptic) {
+      calendar.setGregorianChange(new Date(Long.MIN_VALUE));
+    }
     result.setCalendar(calendar);
     return result;
   }
@@ -47,24 +53,17 @@ public class CalendarUtils {
   private static final String DATE = "yyyy-MM-dd";
   private static final String TIME = DATE + " HH:mm:ss.SSS";
   private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
-  private static final GregorianCalendar HYBRID = new GregorianCalendar();
   private static final ThreadLocal<SimpleDateFormat> HYBRID_DATE_FORMAT =
-      ThreadLocal.withInitial(() -> createFormatter(DATE, HYBRID));
+      ThreadLocal.withInitial(() -> createFormatter(DATE, false));
   private static final ThreadLocal<SimpleDateFormat> HYBRID_TIME_FORMAT =
-      ThreadLocal.withInitial(() -> createFormatter(TIME, HYBRID));
-  private static final long SWITCHOVER_MILLIS;
-  private static final long SWITCHOVER_DAYS;
-  private static final GregorianCalendar PROLEPTIC = new GregorianCalendar();
+      ThreadLocal.withInitial(() -> createFormatter(TIME, false));
+
   private static final ThreadLocal<SimpleDateFormat> PROLEPTIC_DATE_FORMAT =
-      ThreadLocal.withInitial(() -> createFormatter(DATE, PROLEPTIC));
+      ThreadLocal.withInitial(() -> createFormatter(DATE, true));
   private static final ThreadLocal<SimpleDateFormat> PROLEPTIC_TIME_FORMAT =
-      ThreadLocal.withInitial(() -> createFormatter(TIME, PROLEPTIC));
+      ThreadLocal.withInitial(() -> createFormatter(TIME, true));
 
   static {
-    HYBRID.setTimeZone(UTC);
-    PROLEPTIC.setTimeZone(UTC);
-    PROLEPTIC.setGregorianChange(new Date(Long.MIN_VALUE));
-
     // Get the last day where the two calendars agree with each other.
     try {
       SWITCHOVER_MILLIS = HYBRID_DATE_FORMAT.get().parse("1582-10-15").getTime();
@@ -177,6 +176,20 @@ public class CalendarUtils {
     return hybrid;
   }
 
+  /**
+   *
+   * Formats epoch day to date according to proleptic or hybrid calendar
+   *
+   * @param epochDay  epoch day
+   * @param useProleptic if true - uses proleptic formatter, else uses hybrid formatter
+   * @return formatted date
+   */
+  public static String formatDate(long epochDay, boolean useProleptic) {
+    long millis = TimeUnit.DAYS.toMillis(epochDay);
+    return useProleptic ? PROLEPTIC_DATE_FORMAT.get().format(millis)
+        : HYBRID_DATE_FORMAT.get().format(millis);
+  }
+
   private CalendarUtils() {
     throw new UnsupportedOperationException();
   }
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java
index 3dac667..6608a6e 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/DateColumnVector.java
@@ -17,37 +17,13 @@
  */
 package org.apache.hadoop.hive.ql.exec.vector;
 
-import java.text.SimpleDateFormat;
-import java.util.GregorianCalendar;
-import java.util.TimeZone;
-import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.hive.common.type.CalendarUtils;
 
 /**
  * This class extends LongColumnVector in order to introduce some date-specific semantics. In
  * DateColumnVector, the elements of vector[] represent the days since 1970-01-01
  */
 public class DateColumnVector extends LongColumnVector {
-  private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
-  private static final GregorianCalendar PROLEPTIC_GREGORIAN_CALENDAR = new GregorianCalendar(UTC);
-  private static final GregorianCalendar GREGORIAN_CALENDAR = new GregorianCalendar(UTC);
-
-  private static final SimpleDateFormat PROLEPTIC_GREGORIAN_DATE_FORMATTER =
-      new SimpleDateFormat("yyyy-MM-dd");
-  private static final SimpleDateFormat GREGORIAN_DATE_FORMATTER =
-      new SimpleDateFormat("yyyy-MM-dd");
-
-  /**
-  * -141427: hybrid: 1582-10-15 proleptic: 1582-10-15
-  * -141428: hybrid: 1582-10-04 proleptic: 1582-10-14
-  */
-  private static final int CUTOVER_DAY_EPOCH = -141427; // it's 1582-10-15 in both calendars
-
-  static {
-    PROLEPTIC_GREGORIAN_CALENDAR.setGregorianChange(new java.util.Date(Long.MIN_VALUE));
-
-    PROLEPTIC_GREGORIAN_DATE_FORMATTER.setCalendar(PROLEPTIC_GREGORIAN_CALENDAR);
-    GREGORIAN_DATE_FORMATTER.setCalendar(GREGORIAN_CALENDAR);
-  }
 
   private boolean usingProlepticCalendar = false;
 
@@ -76,24 +52,16 @@ public class DateColumnVector extends LongColumnVector {
 
   private void updateDataAccordingProlepticSetting() throws Exception {
     for (int i = 0; i < vector.length; i++) {
-      if (vector[i] >= CUTOVER_DAY_EPOCH) { // no need for conversion
+      if (vector[i] >= CalendarUtils.SWITCHOVER_DAYS) { // no need for conversion
         continue;
       }
-      long millis = TimeUnit.DAYS.toMillis(vector[i]);
-      String originalFormatted = usingProlepticCalendar ? GREGORIAN_DATE_FORMATTER.format(millis)
-        : PROLEPTIC_GREGORIAN_DATE_FORMATTER.format(millis);
-
-      millis = (usingProlepticCalendar ? PROLEPTIC_GREGORIAN_DATE_FORMATTER.parse(originalFormatted)
-        : GREGORIAN_DATE_FORMATTER.parse(originalFormatted)).getTime();
-
-      vector[i] = TimeUnit.MILLISECONDS.toDays(millis);
+      vector[i] = usingProlepticCalendar ? CalendarUtils.convertDateToProleptic((int) vector[i]) : CalendarUtils
+          .convertDateToHybrid((int) vector[i]);
     }
   }
 
   public String formatDate(int i) {
-    long millis = TimeUnit.DAYS.toMillis(vector[i]);
-    return usingProlepticCalendar ? PROLEPTIC_GREGORIAN_DATE_FORMATTER.format(millis)
-      : GREGORIAN_DATE_FORMATTER.format(millis);
+    return CalendarUtils.formatDate(vector[i], usingProlepticCalendar);
   }
 
   public DateColumnVector setUsingProlepticCalendar(boolean usingProlepticCalendar) {
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java
index d5dfc92..7807e69 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.java
@@ -18,14 +18,12 @@
 package org.apache.hadoop.hive.ql.exec.vector;
 
 import java.sql.Timestamp;
-import java.text.SimpleDateFormat;
 import java.time.Instant;
 import java.time.LocalDateTime;
 import java.time.ZoneOffset;
 import java.util.Arrays;
-import java.util.GregorianCalendar;
-import java.util.TimeZone;
 
+import org.apache.hadoop.hive.common.type.CalendarUtils;
 import org.apache.hadoop.io.Writable;
 
 /**
@@ -41,26 +39,6 @@ import org.apache.hadoop.io.Writable;
  * using the scratch timestamp, and then perhaps update the column vector row with a result.
  */
 public class TimestampColumnVector extends ColumnVector {
-  private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
-  private static final GregorianCalendar PROLEPTIC_GREGORIAN_CALENDAR_UTC =
-      new GregorianCalendar(UTC);
-  private static final GregorianCalendar GREGORIAN_CALENDAR_UTC =
-      new GregorianCalendar(UTC);
-
-  private static final SimpleDateFormat PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC =
-      new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-  private static final SimpleDateFormat GREGORIAN_TIMESTAMP_FORMATTER_UTC =
-      new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-
-  static {
-    PROLEPTIC_GREGORIAN_CALENDAR_UTC.setGregorianChange(new java.util.Date(Long.MIN_VALUE));
-
-    PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC.setCalendar(PROLEPTIC_GREGORIAN_CALENDAR_UTC);
-    GREGORIAN_TIMESTAMP_FORMATTER_UTC.setCalendar(GREGORIAN_CALENDAR_UTC);
-  }
-
-  // it's 1582-10-15 in both calendars
-  private static final int CUTOVER_MILLIS_EPOCH = -141427 * 24 * 60 * 60 * 1000;
 
   /*
    * The storage arrays for this column vector corresponds to the storage of a Timestamp:
@@ -594,18 +572,14 @@ public class TimestampColumnVector extends ColumnVector {
 
   private void updateDataAccordingProlepticSetting() throws Exception {
     for (int i = 0; i < nanos.length; i++) {
-      if (time[i] >= CUTOVER_MILLIS_EPOCH) { // no need for conversion
+      if (time[i] >= CalendarUtils.SWITCHOVER_MILLIS) { // no need for conversion
         continue;
       }
       asScratchTimestamp(i);
       long offset = 0;
-      String formatted =
-          usingProlepticCalendar ? GREGORIAN_TIMESTAMP_FORMATTER_UTC.format(scratchTimestamp)
-            : PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC.format(scratchTimestamp);
 
-      long millis = usingProlepticCalendar
-        ? PROLEPTIC_GREGORIAN_TIMESTAMP_FORMATTER_UTC.parse(formatted).getTime()
-        : GREGORIAN_TIMESTAMP_FORMATTER_UTC.parse(formatted).getTime();
+      long millis = usingProlepticCalendar ? CalendarUtils.convertTimeToProleptic(scratchTimestamp.getTime())
+          : CalendarUtils.convertTimeToHybrid(scratchTimestamp.getTime());
 
       Timestamp newTimeStamp = Timestamp.from(Instant.ofEpochMilli(millis));
 
diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java
index 0d4dc5d..9f53245 100644
--- a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java
+++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestDateColumnVector.java
@@ -17,6 +17,9 @@
  */
 package org.apache.hadoop.hive.ql.exec.vector;
 
+import java.util.ArrayList;
+import java.util.List;
+
 import org.junit.Assert;
 import org.junit.Test;
 
@@ -77,4 +80,44 @@ public class TestDateColumnVector {
                         " new = " + newUseProleptic,
                         expectedDateString, dateColumnVector.formatDate(0));
   }
+
+  @Test(timeout = 300_000)
+  public void testMultiThreaded() throws Exception {
+
+    //when java DateTimeFormatter/GregorianCalendar race was not handled, used to throw exceptions like -
+
+    // 1) java.lang.NumberFormatException: For input string: "" OR java.lang.NumberFormatException: For input string: ".821582E.821582E44"
+
+    // 2) Caused by: java.lang.ArrayIndexOutOfBoundsException: -5325980
+    //	at sun.util.calendar.BaseCalendar.getCalendarDateFromFixedDate(BaseCalendar.java:453)
+    //	at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2397)
+
+    // create 5 threads and start manipulating vectors, should not throw any exceptions now.
+
+    List<Thread> threads = new ArrayList<>();
+
+    threads.add(startVectorManipulationThread(50000, -141428));
+    threads.add(startVectorManipulationThread(50000, -141430));
+    threads.add(startVectorManipulationThread(50000, -16768));
+    threads.add(startVectorManipulationThread(50000, -499952));
+    threads.add(startVectorManipulationThread(50000, -499955));
+
+    for (Thread thread : threads) {
+      thread.join();
+    }
+
+  }
+
+  private Thread startVectorManipulationThread(final int vectorLength, final int epochDay) {
+    Thread thread = new Thread(() -> {
+      DateColumnVector columnVector = new DateColumnVector(vectorLength).setUsingProlepticCalendar(true);
+      for (int i = 0; i < vectorLength; i++) {
+        columnVector.vector[i] = epochDay;
+      }
+      columnVector.changeCalendar(false, true);
+    });
+    thread.start();
+    return thread;
+  }
+
 }
diff --git a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java
index 333a5b5..2d85b11 100644
--- a/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java
+++ b/storage-api/src/test/org/apache/hadoop/hive/ql/exec/vector/TestTimestampColumnVector.java
@@ -24,7 +24,9 @@ import java.sql.Timestamp;
 import java.text.DateFormat;
 import java.text.SimpleDateFormat;
 import java.time.Instant;
+import java.util.ArrayList;
 import java.util.GregorianCalendar;
+import java.util.List;
 import java.util.Random;
 import java.util.TimeZone;
 import java.util.concurrent.TimeUnit;
@@ -210,4 +212,38 @@ public class TestTimestampColumnVector {
 
     return testFormatter;
   }
+
+
+  @Test(timeout = 300_000)
+  public void testMultiThreaded() throws Exception {
+
+    // similar to TestDateColumnVector#testMultiThreaded
+
+    List<Thread> threads = new ArrayList<>();
+
+    threads.add(startVectorManipulationThread(50000, -141428));
+    threads.add(startVectorManipulationThread(50000, -141430));
+    threads.add(startVectorManipulationThread(50000, -16768));
+    threads.add(startVectorManipulationThread(50000, -499952));
+    threads.add(startVectorManipulationThread(50000, -499955));
+
+    for (Thread thread : threads) {
+      thread.join();
+    }
+
+  }
+
+  private Thread startVectorManipulationThread(final int vectorLength, final long millis) {
+    Thread thread = new Thread(() -> {
+      TimestampColumnVector columnVector = new TimestampColumnVector(vectorLength).setUsingProlepticCalendar(true);
+      for (int i = 0; i < vectorLength; i++) {
+        columnVector.time[i] = millis;
+        columnVector.nanos[i] = 1;
+      }
+      columnVector.changeCalendar(false, true);
+    });
+    thread.start();
+    return thread;
+  }
+
 }