You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mj...@apache.org on 2017/09/08 16:50:49 UTC

[3/3] incubator-impala git commit: IMPALA-5867: Fix bugs parsing 2-digit year

IMPALA-5867: Fix bugs parsing 2-digit year

This patch fixes several bugs parsing 1 or 2-digit year formats.
Existing code is broken in several ways:
1. With 1 or 2-digit year format and month/day missing, ParseDateTime()
   throws an uncaught exception.
2. If now() is 02/29 in a leap year but (now() - 80 years) isn't,
   DateTimeFormatContext::SetCenturyBreak() throws an uncaught
   exception.
3. If the year parsed is 02/29 in a leap year but it isn't a leap year
   100 years ago, TimestampParser::Parse() will consider the date as
   invalid though it isn't.
This patch fixes above bugs and adds a few test cases in
be/src/runtime/timestamp-test.cc
The behaviors after change is:
1. A date without month or day is considered invalid. This is a
   pre-existing difference from Hive, which defaults missing month/day
   to 01/01.
2. Century break would be set to 02/28 80 years ago.
3. If parsed date is 00/02/29 but 1900/02/29 does not exist, treat
   it as 03/01 when comparing to century break.

Change-Id: Ia4f430caea88b6c33f8050a1984ee0ee32ecb0a1
Reviewed-on: http://gerrit.cloudera.org:8080/7910
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2fbdc8e3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2fbdc8e3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2fbdc8e3

Branch: refs/heads/master
Commit: 2fbdc8e37e4cb0a3b3408e90b5a972d778fea7eb
Parents: ac68913
Author: Tianyi Wang <tw...@cloudera.com>
Authored: Wed Aug 30 14:14:52 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Sep 8 03:05:11 2017 +0000

----------------------------------------------------------------------
 be/src/runtime/timestamp-parse-util.cc | 95 +++++++++++++++++------------
 be/src/runtime/timestamp-parse-util.h  | 10 +++
 be/src/runtime/timestamp-test.cc       | 49 +++++++++++----
 3 files changed, 104 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2fbdc8e3/be/src/runtime/timestamp-parse-util.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc
index 9b9e8c8..e64d904 100644
--- a/be/src/runtime/timestamp-parse-util.cc
+++ b/be/src/runtime/timestamp-parse-util.cc
@@ -29,8 +29,10 @@ namespace assign = boost::assign;
 using boost::unordered_map;
 using boost::gregorian::date;
 using boost::gregorian::date_duration;
+using boost::gregorian::gregorian_calendar;
 using boost::posix_time::hours;
 using boost::posix_time::not_a_date_time;
+using boost::posix_time::ptime;
 using boost::posix_time::time_duration;
 
 namespace impala {
@@ -45,6 +47,8 @@ struct DateTimeParseResult {
   int second;
   int32_t fraction;
   boost::posix_time::time_duration tz_offset;
+  // Whether to realign the year for 2-digit year format
+  bool realign_year;
 
   DateTimeParseResult()
     : year(0),
@@ -54,14 +58,22 @@ struct DateTimeParseResult {
       minute(0),
       second(0),
       fraction(0),
-      tz_offset(0,0,0,0) {
+      tz_offset(0,0,0,0),
+      realign_year(false) {
   }
 };
 
 void DateTimeFormatContext::SetCenturyBreak(const TimestampValue &now) {
-  const date& now_date = now.date();
-  century_break_ptime = boost::posix_time::ptime(
-      date(now_date.year() - 80, now_date.month(), now_date.day()), now.time());
+  auto& now_date = now.date();
+  // If the century break is at an invalid 02/29, set it to 02/28 for consistency with
+  // Hive.
+  if (now_date.month() == 2 && now_date.day() == 29 &&
+      !gregorian_calendar::is_leap_year(now_date.year() - 80)) {
+    century_break_ptime = ptime(date(now_date.year() - 80, 2, 28), now.time());
+  } else {
+    century_break_ptime = ptime(
+        date(now_date.year() - 80, now_date.month(), now_date.day()), now.time());
+  }
 }
 
 bool TimestampParser::initialized_ = false;
@@ -301,6 +313,32 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
   }
 }
 
+date TimestampParser::RealignYear(const DateTimeParseResult& dt_result,
+    const DateTimeFormatContext& dt_ctx, int day_offset, const time_duration& t) {
+  DCHECK(!dt_ctx.century_break_ptime.is_special());
+  // Let the century start at AABB and the year parsed be YY, this gives us AAYY.
+  int year = dt_result.year + (dt_ctx.century_break_ptime.date().year() / 100) * 100;
+  date unshifted_date;
+  // The potential actual date (02/29 in unshifted year + 100 years) might be valid
+  // even if unshifted date is not, so try to make unshifted date valid by adding 1 day.
+  // This makes the behavior closer to Hive.
+  if (dt_result.month == 2 && dt_result.day == 29 &&
+      !gregorian_calendar::is_leap_year(year)) {
+    unshifted_date = date(year, 3, 1);
+  } else {
+    unshifted_date = date(year, dt_result.month, dt_result.day);
+  }
+  unshifted_date += date_duration(day_offset);
+  // Advance 100 years if parsed time is before the century break.
+  // For example if the century breaks at 1937 but dt_result->year = 1936,
+  // the correct year would be 2036.
+  if (ptime(unshifted_date, t) < dt_ctx.century_break_ptime) {
+    return date(year + 100, dt_result.month, dt_result.day) + date_duration(day_offset);
+  } else {
+    return date(year, dt_result.month, dt_result.day) + date_duration(day_offset);
+  }
+}
+
 bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContext& dt_ctx,
     date* d, time_duration* t) {
   DCHECK(TimestampParser::initialized_);
@@ -330,25 +368,23 @@ bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContex
     *t = time_duration(0, 0, 0, 0);
   }
   if (dt_ctx.has_date_toks) {
-    bool is_valid_date = true;
     try {
       DCHECK(-1 <= day_offset && day_offset <= 1);
-      if ((dt_result.year == 1400 && dt_result.month == 1 && dt_result.day == 1 &&
-           day_offset == -1) ||
-          (dt_result.year == 9999 && dt_result.month == 12 && dt_result.day == 31 &&
-           day_offset == 1)) {
-        // Have to check lower/upper bound explicitly.
-        // Tried date::is_not_a_date_time() but it doesn't complain value is out of range
-        // for "'1400-01-01' - 1 day" and "'9999-12-31' + 1 day".
-        is_valid_date = false;
+      if (dt_result.realign_year) {
+        *d = RealignYear(dt_result, dt_ctx, day_offset, *t);
       } else {
-        *d = date(dt_result.year, dt_result.month, dt_result.day);
-        *d += date_duration(day_offset);
+        *d = date(dt_result.year, dt_result.month, dt_result.day)
+             + date_duration(day_offset);
+      }
+      // Have to check year lower/upper bound [1400, 9999] here because
+      // operator + (date, date_duration) won't throw an exception even if the result is
+      // out-of-range.
+      if (d->year() < 1400 || d->year() > 9999) {
+        // Calling year() on out-of-range date throws an exception itself. This branch is
+        // to describe the checking logic but is never taken.
+        DCHECK(false);
       }
     } catch (boost::exception&) {
-      is_valid_date = false;
-    }
-    if (!is_valid_date) {
       VLOG_ROW << "Invalid date: " << dt_result.year << "-" << dt_result.month << "-"
                << dt_result.day;
       *d = date();
@@ -428,8 +464,6 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len,
   // Keep track of the number of characters we need to shift token positions by.
   // Variable-length tokens will result in values > 0;
   int shift_len = 0;
-  // Whether to realign the year for 2-digit year format
-  bool realign_year = false;
   for (const DateTimeFormatToken& tok: dt_ctx.toks) {
     const char* tok_val = str + tok.pos + shift_len;
     if (tok.type == SEPARATOR) {
@@ -449,10 +483,10 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len,
       case YEAR: {
         dt_result->year = StringParser::StringToInt<int>(tok_val, tok_len, &status);
         if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
-        if (UNLIKELY(dt_result->year < 1 || dt_result->year > 9999)) return false;
+        if (UNLIKELY(dt_result->year < 0 || dt_result->year > 9999)) return false;
         // Year in "Y" and "YY" format should be in the interval
         // [current time - 80 years, current time + 20 years)
-        if (tok_len <= 2) realign_year = true;
+        if (tok_len <= 2) dt_result->realign_year = true;
         break;
       }
       case MONTH_IN_YEAR: {
@@ -546,23 +580,6 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len,
       default: DCHECK(false) << "Unknown date/time format token";
     }
   }
-  // Hive uses Java's SimpleDateFormat to parse timestamp:
-  // In SimpleDateFormat, the century for 2-digit-year breaks at current_time - 80 years.
-  // https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html
-  if (realign_year) {
-    DCHECK(!dt_ctx.century_break_ptime.is_special());
-    // Let the century start at AABB and the year parsed be YY, this gives us AAYY.
-    dt_result->year += (dt_ctx.century_break_ptime.date().year() / 100) * 100;
-    date parsed_date(dt_result->year, dt_result->month, dt_result->day);
-    time_duration parsed_time(dt_result->hour, dt_result->minute, dt_result->second,
-        dt_result->fraction);
-    // Advance 100 years if parsed time is before the century break
-    // For example if the century breaks at 1937 but dt_result->year = 1936,
-    // the correct year would be 2036.
-    if (boost::posix_time::ptime(parsed_date, parsed_time) < dt_ctx.century_break_ptime) {
-      dt_result->year += 100;
-    }
-  }
   return true;
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2fbdc8e3/be/src/runtime/timestamp-parse-util.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.h b/be/src/runtime/timestamp-parse-util.h
index b68cc85..bbcc03f 100644
--- a/be/src/runtime/timestamp-parse-util.h
+++ b/be/src/runtime/timestamp-parse-util.h
@@ -219,6 +219,16 @@ class TimestampParser {
   static bool ParseDateTime(const char* str, int str_len,
       const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result);
 
+  /// Helper function finding the correct century for 1 or 2 digit year according to
+  /// century break. Throws bad_year, bad_day_of_month, or bad_day_month if the date is
+  /// invalid. The century break behavior is copied from Java SimpleDateFormat in order to
+  /// be consistent with Hive.
+  /// In SimpleDateFormat, the century for 2-digit-year breaks at current_time - 80 years.
+  /// https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html
+  static boost::gregorian::date RealignYear(const DateTimeParseResult& dt_result,
+      const DateTimeFormatContext& dt_ctx, int day_offset,
+      const boost::posix_time::time_duration& t);
+
   /// Check if the string is a TimeZone offset token.
   /// Valid offset token format are 'hh:mm', 'hhmm', 'hh'.
   static bool IsValidTZOffset(const char* str_begin, const char* str_end);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2fbdc8e3/be/src/runtime/timestamp-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-test.cc b/be/src/runtime/timestamp-test.cc
index 5f3e0e6..b8919cc 100644
--- a/be/src/runtime/timestamp-test.cc
+++ b/be/src/runtime/timestamp-test.cc
@@ -231,7 +231,8 @@ void TestTimestampTokens(vector<TimestampToken>* toks, int year, int month,
 
 TEST(TimestampTest, Basic) {
   // Fix current time to determine the behavior parsing 2-digit year format
-  TimestampValue now(date(2017, 7, 28), time_duration(16, 14, 24));
+  // Set it to 03/01 to test 02/29 edge cases.
+  TimestampValue now(date(1980, 3, 1), time_duration(16, 14, 24));
 
   char s1[] = "2012-01-20 01:10:01";
   char s2[] = "1990-10-20 10:10:10.123456789  ";
@@ -426,6 +427,19 @@ TEST(TimestampTest, Basic) {
   }
   // Test parsing/formatting of complex date/time formats
   vector<TimestampTC> test_cases = boost::assign::list_of
+    // Test year upper/lower bound
+    (TimestampTC("yyyy-MM-dd HH:mm:ss", "1400-01-01 00:00:00",
+        false, true, false, 1400, 1, 1))
+    (TimestampTC("yyyy-MM-dd HH:mm:ss", "1399-12-31 23:59:59",
+        false, true))
+    (TimestampTC("yyyy-MM-dd HH:mm:ss", "9999-12-31 23:59:59",
+        false, true, false, 9999, 12, 31, 23, 59, 59))
+    (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "1400-01-01 01:00:00 +01", false, true, false,
+        1400, 1, 1, 0, 0, 0))
+    (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "1400-01-01 01:00:00 +02", false, true))
+    (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "9999-12-31 22:00:00 -01", false, true, false,
+        9999, 12, 31, 23, 0, 0))
+    (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "9999-12-31 22:00:00 -02", false, true))
     // Test case on literal short months
     (TimestampTC("yyyy-MMM-dd", "2013-OCT-01", false, true, false, 2013, 10, 1))
     // Test case on literal short months
@@ -470,19 +484,32 @@ TEST(TimestampTest, Basic) {
     (TimestampTC("MMdd", "1201", false, true))
     // Test missing month
     (TimestampTC("yyyydd", "201301", false, true))
-    // Test missing month
-    (TimestampTC("yyyymm", "201301", false, true))
+    (TimestampTC("yydd", "1301", false, true))
+    // Test missing day
+    (TimestampTC("yyyyMM", "201301", false, true))
+    (TimestampTC("yyMM", "8512", false, true))
+    // Test missing month and day
+    (TimestampTC("yyyy", "2013", false, true))
+    (TimestampTC("yy", "13", false, true))
     // Test short year token
     (TimestampTC("y-MM-dd", "2013-11-13", false, true, false, 2013, 11, 13))
-    (TimestampTC("y-MM-dd", "13-11-13", false, true, false, 2013, 11, 13))
+    (TimestampTC("y-MM-dd", "13-11-13", false, true, false, 1913, 11, 13))
     // Test 2-digit year format
-    (TimestampTC("yy-MM-dd", "37-07-28", false, true, false, 2037, 7, 28))
-    (TimestampTC("yy-MM-dd", "37-07-29", false, true, false, 1937, 7, 29))
+    (TimestampTC("yy-MM-dd", "17-08-31", false, true, false, 1917, 8, 31))
+    (TimestampTC("yy-MM-dd", "99-08-31", false, true, false, 1999, 8, 31))
+    // Test 02/29 edge cases of 2-digit year format
+    (TimestampTC("yy-MM-dd", "00-02-28", false, true, false, 2000, 2, 28))
+    (TimestampTC("yy-MM-dd", "00-02-29", false, true, false, 2000, 2, 29))
+    (TimestampTC("yy-MM-dd", "00-03-01", false, true, false, 2000, 3, 1))
+    (TimestampTC("yy-MM-dd", "00-03-02", false, true, false, 1900, 3, 2))
+    (TimestampTC("yy-MM-dd", "04-02-29", false, true, false, 1904, 2, 29))
+    (TimestampTC("yy-MM-dd", "99-02-29", false, true))
     // Test 1-digit year format with time to show the exact boundary
-    (TimestampTC("y-MM-dd HH:mm:ss", "37-07-28 16:14:23", false, true, false,
-                 2037, 7, 28, 16, 14, 23))
-    (TimestampTC("y-MM-dd HH:mm:ss", "37-07-28 16:14:24", false, true, false,
-                 1937, 7, 28, 16, 14, 24))
+    // Before the cutoff. Year should be 2000
+    (TimestampTC("y-MM-dd HH:mm:ss", "00-02-29 16:14:23", false, true, false,
+        2000, 2, 29, 16, 14, 23))
+    // After the cutoff but 02/29/1900 is invalid
+    (TimestampTC("y-MM-dd HH:mm:ss", "00-02-29 16:14:24", false, true))
     // Test short month token
     (TimestampTC("yyyy-M-dd", "2013-11-13", false, true, false, 2013, 11, 13))
     (TimestampTC("yyyy-M-dd", "2013-1-13", false, true, false, 2013, 1, 13))
@@ -491,7 +518,7 @@ TEST(TimestampTest, Basic) {
     (TimestampTC("yyyy-MM-d", "2013-11-3", false, true, false, 2013, 11, 3))
     // Test short all date tokens
     (TimestampTC("y-M-d", "2013-11-13", false, true, false, 2013, 11, 13))
-    (TimestampTC("y-M-d", "13-1-3", false, true, false, 2013, 1, 3))
+    (TimestampTC("y-M-d", "13-1-3", false, true, false, 1913, 1, 3))
     // Test short hour token
     (TimestampTC("H:mm:ss", "14:24:34", false, false, true, 0, 0, 0, 14, 24, 34))
     (TimestampTC("H:mm:ss", "4:24:34", false, false, true, 0, 0, 0, 4, 24, 34))