You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/03/13 22:20:22 UTC
[1/3] impala git commit: IMPALA-6638: Reduce file handle cache lock
contention
Repository: impala
Updated Branches:
refs/heads/master 8283081bf -> 0d7787fe4
IMPALA-6638: Reduce file handle cache lock contention
FileHandleCache::OpenFileHandle() currently holds the
lock while opening a file handle. This lengthens the
duration holding the lock considerably, causing
contention when a lot of file handles are being
opened (i.e. when the cache is cold).
This changes FileHandleCache::OpenFileHandle() drops
the lock while opening the file handle, then
reacquires it to add the file handle to the cache.
When running a simple select on a table with 46801
Parquet files, this fixes a small performance overhead
for the cold cache case compared to having the
cache off. All results are averaged over 5 runs:
File handle cache off: 7.19s
File handle cache cold (no patch): 7.92s
File handle cache cold (with patch): 7.20s
When running a select that accesses and aggregates
4 columns from the same Parquet table (thus requiring
more scan ranges for the same file), the fix reduces
contention, particularly for the case with 8 IO threads
per disk:
1 IO thread per disk:
Without patch: 9.59s
With patch: 8.11s
8 IO threads per disk:
Without patch: 14.2s
With patch: 8.17s
The patch has no impact on the performance when the
cache is hot.
Change-Id: I4c695b21ca556e9c73c703c0c891e64939271c8d
Reviewed-on: http://gerrit.cloudera.org:8080/9576
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/3839826e
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/3839826e
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/3839826e
Branch: refs/heads/master
Commit: 3839826e69e9dff2062464f7eb1bdfed66ca7702
Parents: 8283081
Author: Joe McDonnell <jo...@cloudera.com>
Authored: Sat Mar 10 15:09:58 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Mar 13 02:40:07 2018 +0000
----------------------------------------------------------------------
be/src/runtime/io/handle-cache.inline.h | 61 +++++++++++++++++-----------
1 file changed, 37 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/3839826e/be/src/runtime/io/handle-cache.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/handle-cache.inline.h b/be/src/runtime/io/handle-cache.inline.h
index f0cced0..99b106c 100644
--- a/be/src/runtime/io/handle-cache.inline.h
+++ b/be/src/runtime/io/handle-cache.inline.h
@@ -86,16 +86,22 @@ CachedHdfsFileHandle* FileHandleCache::GetFileHandle(
// Hash the key and get appropriate partition
int index = HashUtil::Hash(fname->data(), fname->size(), 0) % cache_partitions_.size();
FileHandleCachePartition& p = cache_partitions_[index];
- boost::lock_guard<SpinLock> g(p.lock);
- pair<typename MapType::iterator, typename MapType::iterator> range =
- p.cache.equal_range(*fname);
// If this requires a new handle, skip to the creation codepath. Otherwise,
// find an unused entry with the same mtime
- FileHandleEntry* ret_elem = nullptr;
if (!require_new_handle) {
+ boost::lock_guard<SpinLock> g(p.lock);
+ pair<typename MapType::iterator, typename MapType::iterator> range =
+ p.cache.equal_range(*fname);
+
+ // When picking a cached entry, always follow the ordering of the map and
+ // pick earlier entries first. This allows excessive entries for a file
+ // to age out. For example, if there are three entries for a file and only
+ // one is used at a time, only the first will be used and the other two
+ // can age out.
while (range.first != range.second) {
FileHandleEntry* elem = &range.first->second;
+ DCHECK(elem->fh.get() != nullptr);
if (!elem->in_use && elem->fh->mtime() == mtime) {
// This element is currently in the lru_list, which means that lru_entry must
// be an iterator pointing into the lru_list.
@@ -104,35 +110,42 @@ CachedHdfsFileHandle* FileHandleCache::GetFileHandle(
// the lru_list by resetting its iterator to point to the end of the list.
p.lru_list.erase(elem->lru_entry);
elem->lru_entry = p.lru_list.end();
- ret_elem = elem;
*cache_hit = true;
- break;
+ elem->in_use = true;
+ return elem->fh.get();
}
++range.first;
}
}
// There was no entry that was free or caller asked for a new handle
- if (!ret_elem) {
- *cache_hit = false;
- // Create a new entry and move it into the map
- CachedHdfsFileHandle* new_fh = new CachedHdfsFileHandle(fs, fname->data(), mtime);
- if (!new_fh->ok()) {
- delete new_fh;
- return nullptr;
- }
- FileHandleEntry entry(new_fh, p.lru_list);
- typename MapType::iterator new_it = p.cache.emplace_hint(range.second,
- *fname, std::move(entry));
- ret_elem = &new_it->second;
- ++p.size;
- if (p.size > p.capacity) EvictHandles(p);
+ // Opening a file handle requires talking to the NameNode, so construct
+ // the file handle without holding the lock to reduce contention.
+ *cache_hit = false;
+ // Create a new file handle
+ CachedHdfsFileHandle* new_fh = new CachedHdfsFileHandle(fs, fname->data(), mtime);
+ if (!new_fh->ok()) {
+ delete new_fh;
+ return nullptr;
}
- DCHECK(ret_elem->fh.get() != nullptr);
- DCHECK(!ret_elem->in_use);
- ret_elem->in_use = true;
- return ret_elem->fh.get();
+ // Get the lock and create/move the new entry into the map
+ // This entry is new and will be immediately used. Place it as the first entry
+ // for this file in the multimap. The ordering is largely unimportant if all the
+ // existing entries are in use. However, if require_new_handle is true, there may be
+ // unused entries, so it would make more sense to insert the new entry at the front.
+ boost::lock_guard<SpinLock> g(p.lock);
+ pair<typename MapType::iterator, typename MapType::iterator> range =
+ p.cache.equal_range(*fname);
+ FileHandleEntry entry(new_fh, p.lru_list);
+ typename MapType::iterator new_it = p.cache.emplace_hint(range.first,
+ *fname, std::move(entry));
+ ++p.size;
+ if (p.size > p.capacity) EvictHandles(p);
+ FileHandleEntry* new_elem = &new_it->second;
+ DCHECK(!new_elem->in_use);
+ new_elem->in_use = true;
+ return new_elem->fh.get();
}
void FileHandleCache::ReleaseFileHandle(std::string* fname,
[2/3] impala git commit: IMPALA-6635: Add DECIMAL type to Kudu
predicates
Posted by ta...@apache.org.
IMPALA-6635: Add DECIMAL type to Kudu predicates
This patch enables pushing scan predicates on
DECIMAL columns down to Kudu.
Testing:
- Added Planner decimal predicate test to kudu.test
- Added Planner decimal in-list test to kudu-selectivity.test
Change-Id: I2569a9e1d58f1c58884d58633d46348364888ed7
Reviewed-on: http://gerrit.cloudera.org:8080/9578
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6d8ce640
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6d8ce640
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6d8ce640
Branch: refs/heads/master
Commit: 6d8ce64020ac8a103f03814c2903f6c5835886c0
Parents: 3839826
Author: Grant Henke <gh...@cloudera.com>
Authored: Fri Mar 9 15:12:43 2018 -0600
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Mar 13 20:31:43 2018 +0000
----------------------------------------------------------------------
.../java/org/apache/impala/planner/KuduScanNode.java | 6 ++++++
.../queries/PlannerTest/kudu-selectivity.test | 14 ++++++++++++++
.../functional-planner/queries/PlannerTest/kudu.test | 15 +++++++++++++++
3 files changed, 35 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/6d8ce640/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
index a02b49b..f60d519 100644
--- a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
@@ -406,6 +406,11 @@ public class KuduScanNode extends ScanNode {
}
break;
}
+ case DECIMAL: {
+ kuduPredicate = KuduPredicate.newComparisonPredicate(column, op,
+ ((NumericLiteral)literal).getValue());
+ break;
+ }
default: break;
}
if (kuduPredicate == null) return false;
@@ -505,6 +510,7 @@ public class KuduScanNode extends ScanNode {
}
break;
}
+ case DECIMAL: return ((NumericLiteral) e).getValue();
default:
Preconditions.checkState(false,
"Unsupported Kudu type considered for predicate: %s", e.getType().toSql());
http://git-wip-us.apache.org/repos/asf/impala/blob/6d8ce640/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test
index e6c8422..cf3014f 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-selectivity.test
@@ -121,6 +121,7 @@ Per-Host Resources: mem-estimate=0B mem-reservation=0B
mem-estimate=0B mem-reservation=0B
tuple-ids=0 row-size=124B cardinality=3
====
+# Test alltypes in-list predicates.
select * from functional_kudu.alltypes where
-- predicates that can be pushed
tinyint_col in (1, 2) and
@@ -156,6 +157,19 @@ Per-Host Resources: mem-estimate=0B mem-reservation=0B
mem-estimate=0B mem-reservation=0B
tuple-ids=0 row-size=97B cardinality=5
====
+# Test decimal in-list predicates.
+select * from functional_kudu.decimal_tbl where d1 in (1234, 12345);
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+Per-Host Resources: mem-estimate=0B mem-reservation=0B
+ PLAN-ROOT SINK
+ | mem-estimate=0B mem-reservation=0B
+ |
+ 00:SCAN KUDU [functional_kudu.decimal_tbl]
+ kudu predicates: d1 IN (1234, 12345)
+ mem-estimate=0B mem-reservation=0B
+ tuple-ids=0 row-size=56B cardinality=2
+====
select * from functional_kudu.alltypes where
tinyint_col is not null and
smallint_col is null and
http://git-wip-us.apache.org/repos/asf/impala/blob/6d8ce640/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test b/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
index 354a6fa..57a08dc 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
@@ -535,3 +535,18 @@ INSERT INTO KUDU [functional_kudu.jointbl]
00:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
+# Decimal predicate.
+select * from functional_kudu.decimal_tbl where d4 = 0.123456789;
+---- PLAN
+PLAN-ROOT SINK
+|
+00:SCAN KUDU [functional_kudu.decimal_tbl]
+ kudu predicates: d4 = 0.123456789
+---- DISTRIBUTEDPLAN
+PLAN-ROOT SINK
+|
+01:EXCHANGE [UNPARTITIONED]
+|
+00:SCAN KUDU [functional_kudu.decimal_tbl]
+ kudu predicates: d4 = 0.123456789
+====
\ No newline at end of file
[3/3] impala git commit: IMPALA-5315: Cast to timestamp fails for
YYYY-M-D format
Posted by ta...@apache.org.
IMPALA-5315: Cast to timestamp fails for YYYY-M-D format
This change allows casting of a string in 'lazy' date/time
format to timestamp. The supported lazy date formats are:
yyyy-[M]M-[d]d
yyyy-[M]M-[d]d [H]H:[m]m:[s]s[.SSSSSSSSS]
[H]H:[m]m:[s]s[.SSSSSSSSS]
We will incur a SCAN performance penalty (approximately 1/2
TotalReadThroughput) when the string is in one of these
lazy date/time format.
Testing:
Benchmarked the performance consequence by executing this SQL on
a private build over 3.8 billion rows:
select min(cast (time_string as timestamp)) from private.impala_5315
Added tests for valid and invalid date/time format strings
in expr-test.cc to be inline with existing tests for CAST() function.
Added end-to-end tests into exprs.test and
select-lazy-timestamp.test to exercise the new function within
the context of a query.
Added tests to exercise the leading and trailing white space trimming
behaviour in default and lazy date/time string format (IMPALA-6630).
Change-Id: Ib9a184a09d7e7783f04d47588537612c2ecec28f
Reviewed-on: http://gerrit.cloudera.org:8080/7009
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/0d7787fe
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/0d7787fe
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/0d7787fe
Branch: refs/heads/master
Commit: 0d7787fe4df1ab2bb8231b0ee0912e3cf2787f9e
Parents: 6d8ce64
Author: Vincent Tran <vt...@cloudera.com>
Authored: Sat May 27 03:02:19 2017 -0400
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Mar 13 22:10:18 2018 +0000
----------------------------------------------------------------------
be/src/exprs/expr-test.cc | 117 ++++++++++++-
be/src/runtime/timestamp-parse-util.cc | 168 ++++++++++++++++++-
be/src/runtime/timestamp-parse-util.h | 29 ++++
testdata/data/lazy_timestamp.csv | 13 ++
.../queries/QueryTest/exprs.test | 36 ++++
.../QueryTest/select-lazy-timestamp.test | 20 +++
tests/query_test/test_scanners.py | 22 +++
7 files changed, 398 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/0d7787fe/be/src/exprs/expr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index e02ff6a..bd25328 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -3051,6 +3051,121 @@ TEST_F(ExprTest, CastExprs) {
TestStringValue("cast(cast(cast('2012-01-01 09:10:11.123456789' as timestamp) as"
" timestamp) as string)", "2012-01-01 09:10:11.123456789");
+ // Test casting of lazy date and/or time format string to timestamp
+ TestTimestampValue(
+ "cast('2001-1-2' as timestamp)", TimestampValue::Parse("2001-01-02 00:00:00"));
+ TestTimestampValue(
+ "cast('2001-01-3' as timestamp)", TimestampValue::Parse("2001-01-03 00:00:00"));
+ TestTimestampValue(
+ "cast('2001-1-21' as timestamp)", TimestampValue::Parse("2001-01-21 00:00:00"));
+ TestTimestampValue("cast('2001-1-21 12:5:30' as timestamp)",
+ TimestampValue::Parse("2001-01-21 12:05:30"));
+ TestTimestampValue("cast('2001-1-21 13:5:05' as timestamp)",
+ TimestampValue::Parse("2001-01-21 13:05:05"));
+ TestTimestampValue("cast('2001-1-21 1:2:3' as timestamp)",
+ TimestampValue::Parse("2001-01-21 01:02:03"));
+ TestTimestampValue("cast('2001-1-21 1:5:31.12345' as timestamp)",
+ TimestampValue::Parse("2001-01-21 01:05:31.123450000"));
+ TestTimestampValue("cast('2001-1-21 1:5:31.12345678910111213' as timestamp)",
+ TimestampValue::Parse("2001-01-21 01:05:31.123456789"));
+ TestTimestampValue(
+ "cast('1:05:1.12' as timestamp)", TimestampValue::Parse("01:05:01.120000000"));
+ TestTimestampValue("cast('1:05:1' as timestamp)", TimestampValue::Parse("01:05:01"));
+ TestTimestampValue("cast(' 2001-01-9 1:05:1 ' as timestamp)",
+ TimestampValue::Parse("2001-01-09 01:05:01"));
+ TestIsNull("cast('2001-1-21 11:2:3' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-6' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('01-1-21' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-1-21 12:5:3 AM' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1:05:31.123456foo' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('10/feb/10' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-foo1-2bar' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909/1-/2' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-1-2 12:32:1.111bar' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:32:1.111.111.2' as timestamp)", TYPE_TIMESTAMP);
+
+ // Test various ways of truncating a "lazy" format to produce an invalid timestamp.
+ TestIsNull("cast('1909-10-2 12:32:1.' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:32:11.' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:32:11. ' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:32:' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:32: ' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 1:32:' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 1:2:' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 1:2' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 1:2 ' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12 ' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 2' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10- ' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909' as timestamp)", TYPE_TIMESTAMP);
+
+ // Test missing number from format.
+ TestIsNull("cast('1909-10-2 12:32:.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12::1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 :32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10- 12:32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909--2 12:32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('-10-2 12:32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+
+ // Test duplicate separators - should return NULL because not a valid format.
+ TestIsNull("cast('1909--10-2 12:32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10--2 12:32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12::32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:32::1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:32:1..9999' as timestamp)", TYPE_TIMESTAMP);
+
+ // Test numbers with too many digits in date/time - should return NULL because not a
+ // valid timestamp.
+ TestIsNull("cast('19097-10-2 12:32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-107-2 12:32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-277 12:32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 127:32:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:327:1.9999' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('1909-10-2 12:32:177.9999' as timestamp)", TYPE_TIMESTAMP);
+
+ // IMPALA-6630: Test whitespace trimming mechanism when cast from string to timestamp
+ TestTimestampValue("cast(' \t\r\n 2001-01-09 01:05:01.123456789 \t\r\n' as timestamp)",
+ TimestampValue::Parse("2001-01-09 01:05:01.123456789"));
+ TestTimestampValue("cast(' \t\r\n 2001-01-09T01:05:01.123456789 \t\r\n' as timestamp)",
+ TimestampValue::Parse("2001-01-09 01:05:01.123456789"));
+ TestTimestampValue("cast(' \t\r\n 2001-01-09 01:05:01 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("2001-01-09 01:05:01"));
+ TestTimestampValue("cast(' \t\r\n 2001-01-09T01:05:01 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("2001-01-09 01:05:01"));
+ TestTimestampValue("cast(' \t\r\n 2001-01-09 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("2001-01-09"));
+ TestTimestampValue("cast(' \t\r\n 01:05:01 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("01:05:01"));
+ TestTimestampValue("cast(' \t\r\n 01:05:01.123456789 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("01:05:01.123456789"));
+ TestTimestampValue("cast(' \t\r\n 2001-1-9 1:5:1 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("2001-01-09 01:05:01"));
+ TestTimestampValue("cast(' \t\r\n 2001-1-9 1:5:1.12345678 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("2001-01-09 01:05:01.123456780"));
+ TestTimestampValue("cast(' \t\r\n 1:5:1 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("01:05:01"));
+ TestTimestampValue("cast(' \t\r\n 1:5:1.12345678 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("01:05:01.123456780"));
+ TestTimestampValue("cast(' \t\r\n 2001-1-9 \t\r\n ' as timestamp)",
+ TimestampValue::Parse("2001-01-09"));
+
+ // Test invalid whitespace locations in strings to be casted to timestamp
+ TestIsNull(
+ "cast(' \t\r\n 2001-01-09 01:05:01 \t\r\n ' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-01-09 01:05:01' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-01-09\t01:05:01' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-01-09\r01:05:01' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-01-09\n01:05:01' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-1-9 1:5:1' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-1-9\t1:5:1' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-1-9\r1:5:1' as timestamp)", TYPE_TIMESTAMP);
+ TestIsNull("cast('2001-1-9\n1:5:1' as timestamp)", TYPE_TIMESTAMP);
+
// IMPALA-3163: Test precise conversion from Decimal to Timestamp.
TestTimestampValue("cast(cast(1457473016.1230 as decimal(17,4)) as timestamp)",
TimestampValue::Parse("2016-03-08 21:36:56.123000000", 29));
@@ -5984,7 +6099,7 @@ TEST_F(ExprTest, TimestampFunctions) {
TestIsNull("timestamp_cmp('','1966-05-04 15:33:45')", TYPE_INT);
TestIsNull("timestamp_cmp(NULL,'1966-05-04 15:33:45')", TYPE_INT);
// Invalid timestamp test case
- TestIsNull("timestamp_cmp('1966-5-4 5:33:45','1966-5-4 15:33:45')", TYPE_INT);
+ TestIsNull("timestamp_cmp('1966-5-4 50:33:45','1966-5-4 15:33:45')", TYPE_INT);
TestValue("int_months_between('1967-07-19','1966-06-04')", TYPE_INT, 13);
TestValue("int_months_between('1966-06-04 16:34:45','1967-07-19 15:33:46')",
http://git-wip-us.apache.org/repos/asf/impala/blob/0d7787fe/be/src/runtime/timestamp-parse-util.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc
index e64d904..c444214 100644
--- a/be/src/runtime/timestamp-parse-util.cc
+++ b/be/src/runtime/timestamp-parse-util.cc
@@ -17,6 +17,8 @@
#include "runtime/timestamp-parse-util.h"
+#include <algorithm>
+
#include <boost/assign/list_of.hpp>
#include <boost/date_time/gregorian/gregorian.hpp>
#include <boost/unordered_map.hpp>
@@ -214,8 +216,150 @@ bool TimestampParser::ParseFormatTokens(DateTimeFormatContext* dt_ctx) {
return dt_ctx->has_date_toks || dt_ctx->has_time_toks;
}
+const char* TimestampParser::ParseDigitToken(const char* str, const char* str_end) {
+ const char* tok_end = str;
+ while (tok_end < str_end) {
+ if (!isdigit(*tok_end)) return tok_end;
+ ++tok_end;
+ }
+ return tok_end;
+}
+
+const char* TimestampParser::ParseSeparatorToken(
+ const char* str, const char* str_end, const char sep) {
+ const char* tok_end = str;
+ while (tok_end < str_end) {
+ if (*tok_end != sep) return tok_end;
+ ++tok_end;
+ }
+ return tok_end;
+}
+
+bool TimestampParser::ParseFormatTokensByStr(DateTimeFormatContext* dt_ctx) {
+ DCHECK(dt_ctx != NULL);
+ DCHECK(dt_ctx->fmt != NULL);
+ DCHECK_GT(dt_ctx->fmt_len, 0);
+ DCHECK_EQ(dt_ctx->toks.size(), 0);
+ const char* str_begin = dt_ctx->fmt;
+ const char* str_end = str_begin + dt_ctx->fmt_len;
+ const char* str = str_begin;
+ const char* tok_end;
+
+ // Parse the 4-digit year
+ tok_end = ParseDigitToken(str, str_end);
+ if (tok_end - str == 4) {
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(YEAR, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Check for the date separator '-'
+ tok_end = ParseSeparatorToken(str, str_end, '-');
+ if (tok_end - str != 1) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Parse the 1 or 2 digit month.
+ tok_end = ParseDigitToken(str, str_end);
+ if (tok_end - str != 1 && tok_end - str != 2) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(MONTH_IN_YEAR, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Check for the date separator '-'
+ tok_end = ParseSeparatorToken(str, str_end, '-');
+ if (tok_end - str != 1) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Parse the 1 or 2 digit day in month
+ tok_end = ParseDigitToken(str, str_end);
+ if (tok_end - str != 1 && tok_end - str != 2) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(DAY_IN_MONTH, str - str_begin, tok_end - str, str));
+ str = tok_end;
+ dt_ctx->has_date_toks = true;
+
+ // If the string ends here, we only have a date component
+ if (str == str_end) return true;
+
+ // Check for the space between date and time component
+ tok_end = ParseSeparatorToken(str, str_end, ' ');
+ if (tok_end - str != 1) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Invalid format if date-time separator is not followed by more digits
+ if (str > str_end) return false;
+ tok_end = ParseDigitToken(str, str_end);
+ }
+
+ // Parse the 1 or 2 digit hour
+ if (tok_end - str != 1 && tok_end - str != 2) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(HOUR_IN_DAY, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Check for the time component separator ':'
+ tok_end = ParseSeparatorToken(str, str_end, ':');
+ if (tok_end - str != 1) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Parse the 1 or 2 digit minute
+ tok_end = ParseDigitToken(str, str_end);
+ if (tok_end - str != 1 && tok_end - str != 2) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(MINUTE_IN_HOUR, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Check for the time component separator ':'
+ tok_end = ParseSeparatorToken(str, str_end, ':');
+ if (tok_end - str != 1) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Parse the 1 or 2 digit second
+ tok_end = ParseDigitToken(str, str_end);
+ if (tok_end - str != 1 && tok_end - str != 2) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(SECOND_IN_MINUTE, str - str_begin, tok_end - str, str));
+ str = tok_end;
+ dt_ctx->has_time_toks = true;
+
+ // There is more to parse, there maybe a fractional component.
+ if (str < str_end) {
+ tok_end = ParseSeparatorToken(str, str_end, '.');
+ if (tok_end - str != 1) return false;
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(SEPARATOR, str - str_begin, tok_end - str, str));
+ str = tok_end;
+
+ // Invalid format when there is no fractional component following '.'
+ if (str > str_end) return false;
+
+ // Parse the fractional component.
+ // Like the non-lazy path, this will parse up to 9 fractional digits
+ tok_end = ParseDigitToken(str, str_end);
+ int num_digits = std::min<int>(9, tok_end - str);
+ dt_ctx->toks.push_back(
+ DateTimeFormatToken(FRACTION, str - str_begin, num_digits, str));
+ str = tok_end;
+
+ // Invalid format if there is more to parse after the fractional component
+ if (str < str_end) return false;
+ }
+ return true;
+}
+
bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
boost::posix_time::time_duration* t) {
+ int lazy_len;
+
DCHECK(TimestampParser::initialized_);
DCHECK(d != NULL);
DCHECK(t != NULL);
@@ -248,6 +392,7 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
}
}
+ lazy_len = len;
// Only process what we have to.
if (len > DEFAULT_DATE_TIME_FMT_LEN) len = DEFAULT_DATE_TIME_FMT_LEN;
// Determine the default formatting context that's required for parsing.
@@ -278,7 +423,7 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
// There is likely a fractional component that's below the expected 9 chars.
// We will need to work out which default context to use that corresponds to
// the fractional length in the string.
- if (LIKELY(len > DEFAULT_SHORT_DATE_TIME_FMT_LEN)) {
+ if (LIKELY(len > DEFAULT_SHORT_DATE_TIME_FMT_LEN) && LIKELY(str[19] == '.')) {
switch (str[10]) {
case ' ': {
dt_ctx =
@@ -295,7 +440,7 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
break;
}
}
- } else if (str[2] == ':') {
+ } else if (str[2] == ':' && str[5] == ':' && isdigit(str[7])) {
if (len > DEFAULT_TIME_FRAC_FMT_LEN) len = DEFAULT_TIME_FRAC_FMT_LEN;
if (len > DEFAULT_TIME_FMT_LEN && str[8] == '.') {
dt_ctx = &DEFAULT_TIME_FRAC_CTX[len - DEFAULT_TIME_FMT_LEN - 1];
@@ -304,12 +449,23 @@ bool TimestampParser::Parse(const char* str, int len, boost::gregorian::date* d,
}
}
}
- if (LIKELY(dt_ctx != NULL)) {
+
+ // Generating context lazily as a fall back if default formats fail.
+ // ParseFormatTokenByStr() does not require a template format string.
+ if (dt_ctx != nullptr) {
return Parse(str, len, *dt_ctx, d, t);
} else {
- *d = boost::gregorian::date();
- *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time);
- return false;
+ DateTimeFormatContext lazy_ctx;
+ lazy_ctx.Reset(str, lazy_len);
+ if (ParseFormatTokensByStr(&lazy_ctx)) {
+ dt_ctx = &lazy_ctx;
+ len = lazy_len;
+ return Parse(str, len, *dt_ctx, d, t);
+ } else {
+ *d = boost::gregorian::date();
+ *t = boost::posix_time::time_duration(boost::posix_time::not_a_date_time);
+ return false;
+ }
}
}
http://git-wip-us.apache.org/repos/asf/impala/blob/0d7787fe/be/src/runtime/timestamp-parse-util.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-parse-util.h b/be/src/runtime/timestamp-parse-util.h
index bbcc03f..bccf0b7 100644
--- a/be/src/runtime/timestamp-parse-util.h
+++ b/be/src/runtime/timestamp-parse-util.h
@@ -177,6 +177,35 @@ class TimestampParser {
/// Return true if the parse was successful.
static bool ParseFormatTokens(DateTimeFormatContext* dt_ctx);
+ // Parse out the next digit token from the date/time string by checking for contiguous
+ // digit characters and return a pointer to the end of that token.
+ // str -- pointer to the string to be parsed
+ // str_end -- the pointer to the end of the string to be parsed
+ // Returns the pointer within the string to the end of the valid digit token.
+ static const char* ParseDigitToken(const char* str, const char* str_end);
+
+ // Parse out the next separator token from the date/time string against an expected
+ // character.
+ // str -- pointer to the string to be parsed
+ // str_end -- the pointer to the end of the string to be parsed
+ // sep -- the separator char to compare the token to
+ // Returns the pointer within the string to the end of the valid separator token.
+ static const char* ParseSeparatorToken(
+ const char* str, const char* str_end, const char sep);
+
+ /// Parse the date/time string to generate the DateTimeFormatToken required by
+ /// DateTimeFormatContext. Similar to ParseFormatTokens() this function will take the
+ /// string and length, then heuristically determine whether the value contains date
+ // tokens, time tokens, or both. Unlike ParseFormatTokens, it does not require the
+ // template format string.
+ /// str -- valid pointer to the string to parse
+ /// len -- length of the string to parse (must be > 0)
+ /// dt_ctx -- date/time format context (must contain valid tokens)
+ /// d -- the date value where the results of the parsing will be placed
+ /// t -- the time value where the results of the parsing will be placed
+ /// Returns true if the date/time was successfully parsed.
+ static bool ParseFormatTokensByStr(DateTimeFormatContext* dt_ctx);
+
/// Parse a default date/time string. The default timestamp format is:
/// yyyy-MM-dd HH:mm:ss.SSSSSSSSS or yyyy-MM-ddTHH:mm:ss.SSSSSSSSS. Either just the
/// date or just the time may be specified. All components are required in either the
http://git-wip-us.apache.org/repos/asf/impala/blob/0d7787fe/testdata/data/lazy_timestamp.csv
----------------------------------------------------------------------
diff --git a/testdata/data/lazy_timestamp.csv b/testdata/data/lazy_timestamp.csv
new file mode 100644
index 0000000..d44db1d
--- /dev/null
+++ b/testdata/data/lazy_timestamp.csv
@@ -0,0 +1,13 @@
+2001-1-2
+2001-1-02
+2001-01-2
+1:6:8
+01:6:8
+1:06:8
+1:6:08
+1:6:8.123456789101112
+1:6:8.123456789
+1:6:8.12345
+2001-1-2 1:6:8
+2001-1-2 1:6:8.123456
+2001-1-2 1:6:8.123456789101112
http://git-wip-us.apache.org/repos/asf/impala/blob/0d7787fe/testdata/workloads/functional-query/queries/QueryTest/exprs.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
index b3d0ca8..b6909c1 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -2959,3 +2959,39 @@ from functional.alltypes where id = 7
---- TYPES
BIGINT , BIGINT , BIGINT , BIGINT , BIGINT , BIGINT , BIGINT , BIGINT , BIGINT , BIGINT , BIGINT , BIGINT
====
+---- QUERY
+# IMPALA-5315: Test support for non zero-padded date/time strings cast as timestamp
+select cast('2001-1-21 12:5:30' as timestamp)
+---- RESULTS
+2001-01-21 12:05:30
+---- TYPES
+timestamp
+====
+---- QUERY
+select cast('2001-1-2 1:5:3.123' as timestamp)
+---- RESULTS
+2001-01-02 01:05:03.123000000
+---- TYPES
+timestamp
+====
+---- QUERY
+select cast('1:5:3' as timestamp)
+---- RESULTS
+01:05:03
+---- TYPES
+timestamp
+====
+---- QUERY
+select cast('1:5:3.1234567' as timestamp)
+---- RESULTS
+01:05:03.123456700
+---- TYPES
+timestamp
+====
+---- QUERY
+select cast('2001-1-2' as timestamp)
+---- RESULTS
+2001-01-02 00:00:00
+---- TYPES
+timestamp
+====
http://git-wip-us.apache.org/repos/asf/impala/blob/0d7787fe/testdata/workloads/functional-query/queries/QueryTest/select-lazy-timestamp.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/select-lazy-timestamp.test b/testdata/workloads/functional-query/queries/QueryTest/select-lazy-timestamp.test
new file mode 100644
index 0000000..8258072
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/select-lazy-timestamp.test
@@ -0,0 +1,20 @@
+====
+---- QUERY
+select ts from lazy_ts
+---- RESULTS: VERIFY_IS_EQUAL_SORTED
+2001-01-02 00:00:00
+2001-01-02 00:00:00
+2001-01-02 00:00:00
+01:06:08
+01:06:08
+01:06:08
+01:06:08
+01:06:08.123456789
+01:06:08.123456789
+01:06:08.123450000
+2001-01-02 01:06:08
+2001-01-02 01:06:08.123456000
+2001-01-02 01:06:08.123456789
+---- TYPES
+timestamp
+====
http://git-wip-us.apache.org/repos/asf/impala/blob/0d7787fe/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index a67b793..a9ba5b8 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -855,3 +855,25 @@ class TestScanTruncatedFiles(ImpalaTestSuite):
result = self.execute_query("select count(*) from %s" % fq_tbl_name)
assert(len(result.data) == 1)
assert(result.data[0] == str(num_rows))
+
+class TestUncompressedText(ImpalaTestSuite):
+ @classmethod
+ def get_workload(cls):
+ return 'functional-query'
+
+ @classmethod
+ def add_test_dimensions(cls):
+ super(TestUncompressedText, cls).add_test_dimensions()
+ cls.ImpalaTestMatrix.add_constraint(lambda v:
+ v.get_value('table_format').file_format == 'text' and
+ v.get_value('table_format').compression_codec == 'none')
+
+ # IMPALA-5315: Test support for date/time in unpadded format
+ def test_scan_lazy_timestamp(self, vector, unique_database):
+ self.client.execute(("""CREATE TABLE {0}.lazy_ts (ts TIMESTAMP)""").format
+ (unique_database))
+ tbl_loc = get_fs_path("/test-warehouse/%s.db/%s" % (unique_database,
+ "lazy_ts"))
+ check_call(['hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] +
+ "/testdata/data/lazy_timestamp.csv", tbl_loc])
+ self.run_test_case('QueryTest/select-lazy-timestamp', vector, unique_database)