You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mj...@apache.org on 2017/09/08 16:50:49 UTC
[3/3] incubator-impala git commit: IMPALA-5867: Fix bugs parsing
2-digit year
IMPALA-5867: Fix bugs parsing 2-digit year
This patch fixes several bugs parsing 1 or 2-digit year formats.
Existing code is broken in several ways:
1. With 1 or 2-digit year format and month/day missing, ParseDateTime()
throws an uncaught exception.
2. If now() is 02/29 in a leap year but (now() - 80 years) isn't,
DateTimeFormatContext::SetCenturyBreak() throws an uncaught
exception.
3. If the year parsed is 02/29 in a leap year but it isn't a leap year
100 years ago, TimestampParser::Parse() will consider the date as
invalid though it isn't.
This patch fixes above bugs and adds a few test cases in
be/src/runtime/timestamp-test.cc
The behaviors after change is:
1. A date without month or day is considered invalid. This is a
pre-existing difference from Hive, which defaults missing month/day
to 01/01.
2. Century break would be set to 02/28 80 years ago.
3. If parsed date is 00/02/29 but 1900/02/29 does not exist, treat
it as 03/01 when comparing to century break.
Change-Id: Ia4f430caea88b6c33f8050a1984ee0ee32ecb0a1
Reviewed-on: http://gerrit.cloudera.org:8080/7910
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2fbdc8e3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2fbdc8e3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2fbdc8e3
Branch: refs/heads/master
Commit: 2fbdc8e37e4cb0a3b3408e90b5a972d778fea7eb
Parents: ac68913
Author: Tianyi Wang <tw...@cloudera.com>
Authored: Wed Aug 30 14:14:52 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Sep 8 03:05:11 2017 +0000
----------------------------------------------------------------------
be/src/runtime/timestamp-parse-util.cc | 95 +++++++++++++++++------------
be/src/runtime/timestamp-parse-util.h | 10 +++
be/src/runtime/timestamp-test.cc | 49 +++++++++++----
3 files changed, 104 insertions(+), 50 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2fbdc8e3/be/src/runtime/timestamp-parse-util.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc
index 9b9e8c8..e64d904 100644
--- a/be/src/runtime/timestamp-parse-util.cc
+++ b/be/src/runtime/timestamp-parse-util.cc
@@ -29,8 +29,10 @@ namespace assign = boost::assign;
using boost::unordered_map;
using boost::gregorian::date;
using boost::gregorian::date_duration;
+using boost::gregorian::gregorian_calendar;
using boost::posix_time::hours;
using boost::posix_time::not_a_date_time;
+using boost::posix_time::ptime;
using boost::posix_time::time_duration;
namespace impala {
@@ -45,6 +47,8 @@ struct DateTimeParseResult {
int second;
int32_t fraction;
boost::posix_time::time_duration tz_offset;
+ // Whether to realign the year for 2-digit year format
+ bool realign_year;
DateTimeParseResult()
: year(0),
@@ -54,14 +58,22 @@ struct DateTimeParseResult {
minute(0),
second(0),
fraction(0),
- tz_offset(0,0,0,0) {
+ tz_offset(0,0,0,0),
+ realign_year(false) {
}
};
void DateTimeFormatContext::SetCenturyBreak(const TimestampValue &now) {
- const date& now_date = now.date();
- century_break_ptime = boost::posix_time::ptime(
- date(now_date.year() - 80, now_date.month(), now_date.day()), now.time());
+ auto& now_date = now.date();
+ // If the century break is at an invalid 02/29, set it to 02/28 for consistency with
+ // Hive.
+ if (now_date.month() == 2 && now_date.day() == 29 &&
+ !gregorian_calendar::is_leap_year(now_date.year() - 80)) {
+ century_break_ptime = ptime(date(now_date.year() - 80, 2, 28), now.time());
+ } else {
+ century_break_ptime = ptime(
+ date(now_date.year() - 80, now_date.month(), now_date.day()), now.time());
+ }
}
bool TimestampParser::initialized_ = false;
@@ -301,6 +313,32 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
}
}
+date TimestampParser::RealignYear(const DateTimeParseResult& dt_result,
+ const DateTimeFormatContext& dt_ctx, int day_offset, const time_duration& t) {
+ DCHECK(!dt_ctx.century_break_ptime.is_special());
+ // Let the century start at AABB and the year parsed be YY, this gives us AAYY.
+ int year = dt_result.year + (dt_ctx.century_break_ptime.date().year() / 100) * 100;
+ date unshifted_date;
+ // The potential actual date (02/29 in unshifted year + 100 years) might be valid
+ // even if unshifted date is not, so try to make unshifted date valid by adding 1 day.
+ // This makes the behavior closer to Hive.
+ if (dt_result.month == 2 && dt_result.day == 29 &&
+ !gregorian_calendar::is_leap_year(year)) {
+ unshifted_date = date(year, 3, 1);
+ } else {
+ unshifted_date = date(year, dt_result.month, dt_result.day);
+ }
+ unshifted_date += date_duration(day_offset);
+ // Advance 100 years if parsed time is before the century break.
+ // For example if the century breaks at 1937 but dt_result->year = 1936,
+ // the correct year would be 2036.
+ if (ptime(unshifted_date, t) < dt_ctx.century_break_ptime) {
+ return date(year + 100, dt_result.month, dt_result.day) + date_duration(day_offset);
+ } else {
+ return date(year, dt_result.month, dt_result.day) + date_duration(day_offset);
+ }
+}
+
bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContext& dt_ctx,
date* d, time_duration* t) {
DCHECK(TimestampParser::initialized_);
@@ -330,25 +368,23 @@ bool TimestampParser::Parse(const char* str, int len, const DateTimeFormatContex
*t = time_duration(0, 0, 0, 0);
}
if (dt_ctx.has_date_toks) {
- bool is_valid_date = true;
try {
DCHECK(-1 <= day_offset && day_offset <= 1);
- if ((dt_result.year == 1400 && dt_result.month == 1 && dt_result.day == 1 &&
- day_offset == -1) ||
- (dt_result.year == 9999 && dt_result.month == 12 && dt_result.day == 31 &&
- day_offset == 1)) {
- // Have to check lower/upper bound explicitly.
- // Tried date::is_not_a_date_time() but it doesn't complain value is out of range
- // for "'1400-01-01' - 1 day" and "'9999-12-31' + 1 day".
- is_valid_date = false;
+ if (dt_result.realign_year) {
+ *d = RealignYear(dt_result, dt_ctx, day_offset, *t);
} else {
- *d = date(dt_result.year, dt_result.month, dt_result.day);
- *d += date_duration(day_offset);
+ *d = date(dt_result.year, dt_result.month, dt_result.day)
+ + date_duration(day_offset);
+ }
+ // Have to check year lower/upper bound [1400, 9999] here because
+ // operator + (date, date_duration) won't throw an exception even if the result is
+ // out-of-range.
+ if (d->year() < 1400 || d->year() > 9999) {
+ // Calling year() on out-of-range date throws an exception itself. This branch is
+ // to describe the checking logic but is never taken.
+ DCHECK(false);
}
} catch (boost::exception&) {
- is_valid_date = false;
- }
- if (!is_valid_date) {
VLOG_ROW << "Invalid date: " << dt_result.year << "-" << dt_result.month << "-"
<< dt_result.day;
*d = date();
@@ -428,8 +464,6 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len,
// Keep track of the number of characters we need to shift token positions by.
// Variable-length tokens will result in values > 0;
int shift_len = 0;
- // Whether to realign the year for 2-digit year format
- bool realign_year = false;
for (const DateTimeFormatToken& tok: dt_ctx.toks) {
const char* tok_val = str + tok.pos + shift_len;
if (tok.type == SEPARATOR) {
@@ -449,10 +483,10 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len,
case YEAR: {
dt_result->year = StringParser::StringToInt<int>(tok_val, tok_len, &status);
if (UNLIKELY(StringParser::PARSE_SUCCESS != status)) return false;
- if (UNLIKELY(dt_result->year < 1 || dt_result->year > 9999)) return false;
+ if (UNLIKELY(dt_result->year < 0 || dt_result->year > 9999)) return false;
// Year in "Y" and "YY" format should be in the interval
// [current time - 80 years, current time + 20 years)
- if (tok_len <= 2) realign_year = true;
+ if (tok_len <= 2) dt_result->realign_year = true;
break;
}
case MONTH_IN_YEAR: {
@@ -546,23 +580,6 @@ bool TimestampParser::ParseDateTime(const char* str, int str_len,
default: DCHECK(false) << "Unknown date/time format token";
}
}
- // Hive uses Java's SimpleDateFormat to parse timestamp:
- // In SimpleDateFormat, the century for 2-digit-year breaks at current_time - 80 years.
- // https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html
- if (realign_year) {
- DCHECK(!dt_ctx.century_break_ptime.is_special());
- // Let the century start at AABB and the year parsed be YY, this gives us AAYY.
- dt_result->year += (dt_ctx.century_break_ptime.date().year() / 100) * 100;
- date parsed_date(dt_result->year, dt_result->month, dt_result->day);
- time_duration parsed_time(dt_result->hour, dt_result->minute, dt_result->second,
- dt_result->fraction);
- // Advance 100 years if parsed time is before the century break
- // For example if the century breaks at 1937 but dt_result->year = 1936,
- // the correct year would be 2036.
- if (boost::posix_time::ptime(parsed_date, parsed_time) < dt_ctx.century_break_ptime) {
- dt_result->year += 100;
- }
- }
return true;
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2fbdc8e3/be/src/runtime/timestamp-parse-util.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.h b/be/src/runtime/timestamp-parse-util.h
index b68cc85..bbcc03f 100644
--- a/be/src/runtime/timestamp-parse-util.h
+++ b/be/src/runtime/timestamp-parse-util.h
@@ -219,6 +219,16 @@ class TimestampParser {
static bool ParseDateTime(const char* str, int str_len,
const DateTimeFormatContext& dt_ctx, DateTimeParseResult* dt_result);
+ /// Helper function finding the correct century for 1 or 2 digit year according to
+ /// century break. Throws bad_year, bad_day_of_month, or bad_day_month if the date is
+ /// invalid. The century break behavior is copied from Java SimpleDateFormat in order to
+ /// be consistent with Hive.
+ /// In SimpleDateFormat, the century for 2-digit-year breaks at current_time - 80 years.
+ /// https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html
+ static boost::gregorian::date RealignYear(const DateTimeParseResult& dt_result,
+ const DateTimeFormatContext& dt_ctx, int day_offset,
+ const boost::posix_time::time_duration& t);
+
/// Check if the string is a TimeZone offset token.
/// Valid offset token format are 'hh:mm', 'hhmm', 'hh'.
static bool IsValidTZOffset(const char* str_begin, const char* str_end);
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2fbdc8e3/be/src/runtime/timestamp-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-test.cc b/be/src/runtime/timestamp-test.cc
index 5f3e0e6..b8919cc 100644
--- a/be/src/runtime/timestamp-test.cc
+++ b/be/src/runtime/timestamp-test.cc
@@ -231,7 +231,8 @@ void TestTimestampTokens(vector<TimestampToken>* toks, int year, int month,
TEST(TimestampTest, Basic) {
// Fix current time to determine the behavior parsing 2-digit year format
- TimestampValue now(date(2017, 7, 28), time_duration(16, 14, 24));
+ // Set it to 03/01 to test 02/29 edge cases.
+ TimestampValue now(date(1980, 3, 1), time_duration(16, 14, 24));
char s1[] = "2012-01-20 01:10:01";
char s2[] = "1990-10-20 10:10:10.123456789 ";
@@ -426,6 +427,19 @@ TEST(TimestampTest, Basic) {
}
// Test parsing/formatting of complex date/time formats
vector<TimestampTC> test_cases = boost::assign::list_of
+ // Test year upper/lower bound
+ (TimestampTC("yyyy-MM-dd HH:mm:ss", "1400-01-01 00:00:00",
+ false, true, false, 1400, 1, 1))
+ (TimestampTC("yyyy-MM-dd HH:mm:ss", "1399-12-31 23:59:59",
+ false, true))
+ (TimestampTC("yyyy-MM-dd HH:mm:ss", "9999-12-31 23:59:59",
+ false, true, false, 9999, 12, 31, 23, 59, 59))
+ (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "1400-01-01 01:00:00 +01", false, true, false,
+ 1400, 1, 1, 0, 0, 0))
+ (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "1400-01-01 01:00:00 +02", false, true))
+ (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "9999-12-31 22:00:00 -01", false, true, false,
+ 9999, 12, 31, 23, 0, 0))
+ (TimestampTC("yyyy-MM-dd HH:mm:ss +hh", "9999-12-31 22:00:00 -02", false, true))
// Test case on literal short months
(TimestampTC("yyyy-MMM-dd", "2013-OCT-01", false, true, false, 2013, 10, 1))
// Test case on literal short months
@@ -470,19 +484,32 @@ TEST(TimestampTest, Basic) {
(TimestampTC("MMdd", "1201", false, true))
// Test missing month
(TimestampTC("yyyydd", "201301", false, true))
- // Test missing month
- (TimestampTC("yyyymm", "201301", false, true))
+ (TimestampTC("yydd", "1301", false, true))
+ // Test missing day
+ (TimestampTC("yyyyMM", "201301", false, true))
+ (TimestampTC("yyMM", "8512", false, true))
+ // Test missing month and day
+ (TimestampTC("yyyy", "2013", false, true))
+ (TimestampTC("yy", "13", false, true))
// Test short year token
(TimestampTC("y-MM-dd", "2013-11-13", false, true, false, 2013, 11, 13))
- (TimestampTC("y-MM-dd", "13-11-13", false, true, false, 2013, 11, 13))
+ (TimestampTC("y-MM-dd", "13-11-13", false, true, false, 1913, 11, 13))
// Test 2-digit year format
- (TimestampTC("yy-MM-dd", "37-07-28", false, true, false, 2037, 7, 28))
- (TimestampTC("yy-MM-dd", "37-07-29", false, true, false, 1937, 7, 29))
+ (TimestampTC("yy-MM-dd", "17-08-31", false, true, false, 1917, 8, 31))
+ (TimestampTC("yy-MM-dd", "99-08-31", false, true, false, 1999, 8, 31))
+ // Test 02/29 edge cases of 2-digit year format
+ (TimestampTC("yy-MM-dd", "00-02-28", false, true, false, 2000, 2, 28))
+ (TimestampTC("yy-MM-dd", "00-02-29", false, true, false, 2000, 2, 29))
+ (TimestampTC("yy-MM-dd", "00-03-01", false, true, false, 2000, 3, 1))
+ (TimestampTC("yy-MM-dd", "00-03-02", false, true, false, 1900, 3, 2))
+ (TimestampTC("yy-MM-dd", "04-02-29", false, true, false, 1904, 2, 29))
+ (TimestampTC("yy-MM-dd", "99-02-29", false, true))
// Test 1-digit year format with time to show the exact boundary
- (TimestampTC("y-MM-dd HH:mm:ss", "37-07-28 16:14:23", false, true, false,
- 2037, 7, 28, 16, 14, 23))
- (TimestampTC("y-MM-dd HH:mm:ss", "37-07-28 16:14:24", false, true, false,
- 1937, 7, 28, 16, 14, 24))
+ // Before the cutoff. Year should be 2000
+ (TimestampTC("y-MM-dd HH:mm:ss", "00-02-29 16:14:23", false, true, false,
+ 2000, 2, 29, 16, 14, 23))
+ // After the cutoff but 02/29/1900 is invalid
+ (TimestampTC("y-MM-dd HH:mm:ss", "00-02-29 16:14:24", false, true))
// Test short month token
(TimestampTC("yyyy-M-dd", "2013-11-13", false, true, false, 2013, 11, 13))
(TimestampTC("yyyy-M-dd", "2013-1-13", false, true, false, 2013, 1, 13))
@@ -491,7 +518,7 @@ TEST(TimestampTest, Basic) {
(TimestampTC("yyyy-MM-d", "2013-11-3", false, true, false, 2013, 11, 3))
// Test short all date tokens
(TimestampTC("y-M-d", "2013-11-13", false, true, false, 2013, 11, 13))
- (TimestampTC("y-M-d", "13-1-3", false, true, false, 2013, 1, 3))
+ (TimestampTC("y-M-d", "13-1-3", false, true, false, 1913, 1, 3))
// Test short hour token
(TimestampTC("H:mm:ss", "14:24:34", false, false, true, 0, 0, 0, 14, 24, 34))
(TimestampTC("H:mm:ss", "4:24:34", false, false, true, 0, 0, 0, 4, 24, 34))