You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/06/05 16:06:53 UTC

[impala] 02/06: IMPALA-8562: Data cache should skip insertion of uncacheable ScanRanges

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit e573b5502d4a93623dd1375e8e8febf9647c98db
Author: Michael Ho <kw...@cloudera.com>
AuthorDate: Thu May 16 17:40:17 2019 -0700

    IMPALA-8562: Data cache should skip insertion of uncacheable ScanRanges
    
    As shown in IMPALA-8561, there are some paths in the code which
    create uncacheable ScanRanges. These uncacheable ScanRanges have
    mtime of -1. 'mtime' is used for differentiating versions of files
    with the same names. An mtime == -1 means the cache entry could
    potentially be from any versions of a file with the same name.
    
    This change skips lookup or insertion of ScanRange with negative
    mtime, file offset or buffer length.
    
    Testing done: Added targeted test cases in data-cache-test
    
    Change-Id: I2294833b075a2ddcae956d9fdb04f3e85adb0391
    Reviewed-on: http://gerrit.cloudera.org:8080/13369
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/io/data-cache-test.cc | 18 ++++++++++++++++++
 be/src/runtime/io/data-cache.cc      | 14 ++++++++++++++
 be/src/runtime/io/data-cache.h       | 24 +++++++++++++-----------
 be/src/runtime/io/request-ranges.h   |  2 ++
 4 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/be/src/runtime/io/data-cache-test.cc b/be/src/runtime/io/data-cache-test.cc
index 9b13918..15d995c 100644
--- a/be/src/runtime/io/data-cache-test.cc
+++ b/be/src/runtime/io/data-cache-test.cc
@@ -22,6 +22,7 @@
 
 #include "gutil/strings/join.h"
 #include "runtime/io/data-cache.h"
+#include "runtime/io/request-ranges.h"
 #include "runtime/test-env.h"
 #include "service/fe-support.h"
 #include "testutil/gtest-util.h"
@@ -262,6 +263,23 @@ TEST_F(DataCacheTest, TestBasics) {
 
   // Check that an insertion larger than the cache size will fail.
   ASSERT_FALSE(cache.Store(FNAME, MTIME, 0, test_buffer(), cache_size * 2));
+
+  // Test with uncacheable 'mtime' to make sure the entry is not stored.
+  ASSERT_FALSE(cache.Store(FNAME, BufferOpts::NEVER_CACHE, 0, test_buffer(),
+      TEMP_BUFFER_SIZE));
+  ASSERT_EQ(0, cache.Lookup(FNAME, BufferOpts::NEVER_CACHE, 0, TEMP_BUFFER_SIZE, buffer));
+
+  // Test with bad 'mtime' to make sure the entry is not stored.
+  ASSERT_FALSE(cache.Store(FNAME, -1000, 0, test_buffer(), TEMP_BUFFER_SIZE));
+  ASSERT_EQ(0, cache.Lookup(FNAME, -1000, 0, TEMP_BUFFER_SIZE, buffer));
+
+  // Test with bad 'offset' to make sure the entry is not stored.
+  ASSERT_FALSE(cache.Store(FNAME, MTIME, -2000, test_buffer(), TEMP_BUFFER_SIZE));
+  ASSERT_EQ(0, cache.Lookup(FNAME, MTIME, -2000, TEMP_BUFFER_SIZE, buffer));
+
+  // Test with bad 'buffer_len' to make sure the entry is not stored.
+  ASSERT_FALSE(cache.Store(FNAME, MTIME, 0, test_buffer(), -5000));
+  ASSERT_EQ(0, cache.Lookup(FNAME, MTIME, 0, -5000, buffer));
 }
 
 // Tests backing file rotation by setting FLAGS_data_cache_file_max_size_bytes to be 1/4
diff --git a/be/src/runtime/io/data-cache.cc b/be/src/runtime/io/data-cache.cc
index b81d30a..70b7aeb 100644
--- a/be/src/runtime/io/data-cache.cc
+++ b/be/src/runtime/io/data-cache.cc
@@ -287,6 +287,8 @@ struct DataCache::CacheKey {
  public:
   explicit CacheKey(const string& filename, int64_t mtime, int64_t offset)
     : key_(filename.size() + sizeof(mtime) + sizeof(offset)) {
+    DCHECK_GE(mtime, 0);
+    DCHECK_GE(offset, 0);
     key_.append(filename);
     key_.append(&mtime, sizeof(mtime));
     key_.append(&offset, sizeof(offset));
@@ -658,6 +660,12 @@ void DataCache::ReleaseResources() {
 int64_t DataCache::Lookup(const string& filename, int64_t mtime, int64_t offset,
     int64_t bytes_to_read, uint8_t* buffer) {
   DCHECK(!partitions_.empty());
+  // Bail out early for uncacheable ranges or invalid requests.
+  if (mtime < 0 || offset < 0 || bytes_to_read < 0) {
+    VLOG(3) << Substitute("Skipping lookup of invalid entry $0 mtime: $1 offset: $2 "
+         "bytes_to_read: $3", filename, mtime, offset, bytes_to_read);
+    return 0;
+  }
 
   // Construct a cache key. The cache key is also hashed to compute the partition index.
   const CacheKey key(filename, mtime, offset);
@@ -676,6 +684,12 @@ int64_t DataCache::Lookup(const string& filename, int64_t mtime, int64_t offset,
 bool DataCache::Store(const string& filename, int64_t mtime, int64_t offset,
     const uint8_t* buffer, int64_t buffer_len) {
   DCHECK(!partitions_.empty());
+  // Bail out early for uncacheable ranges or invalid requests.
+  if (mtime < 0 || offset < 0 || buffer_len < 0) {
+    VLOG(3) << Substitute("Skipping insertion of invalid entry $0 mtime: $1 offset: $2 "
+         "buffer_len: $3", filename, mtime, offset, buffer_len);
+    return false;
+  }
 
   // Construct a cache key. The cache key is also hashed to compute the partition index.
   const CacheKey key(filename, mtime, offset);
diff --git a/be/src/runtime/io/data-cache.h b/be/src/runtime/io/data-cache.h
index b1bf99d..6a290d9 100644
--- a/be/src/runtime/io/data-cache.h
+++ b/be/src/runtime/io/data-cache.h
@@ -45,17 +45,19 @@ class Cache;
 /// Each partition has a meta-data cache which tracks the mappings of cache keys to
 /// the locations of the cached data. A cache key is a tuple of (file's name, file's
 /// modification time, file offset) and a cache entry is a tuple of (backing file,
-/// offset in the backing file, length of the cached data, optional checksum). Each
-/// partition stores its set of cached data in backing files created on local storage.
-/// When inserting new data into the cache, the data is appended to the current backing
-/// file in use. The storage consumption of each cache entry counts towards the quota of
-/// that partition. When a partition reaches its capacity, the least recently used data
-/// in that partition is evicted. Evicted data is removed from the underlying storage by
-/// punching holes in the backing file it's stored in. As a backing file reaches a certain
-/// size (e.g. 4TB), new data will stop being appended to it and a new file will be
-/// created instead. Note that due to hole punching, the backing file is actually sparse.
-/// For instance, a backing file may look like the following after some insertion and
-/// eviction. All the holes in file consume no storage space at all.
+/// offset in the backing file, length of the cached data, optional checksum). The
+/// file's modification time is used for distinguishing between different versions of
+/// a file with a given name. Each partition stores its set of cached data in backing
+/// files created on local storage. When inserting new data into the cache, the data is
+/// appended to the current backing file in use. The storage consumption of each cache
+/// entry counts towards the quota of that partition. When a partition reaches its
+/// capacity, the least recently used data in that partition is evicted. Evicted data is
+/// removed from the underlying storage by punching holes in the backing file it's stored
+/// in. As a backing file reaches a certain size (e.g. 4TB), new data will stop being
+/// appended to it and a new file will be created instead. Note that due to hole punching,
+/// the backing file is actually sparse. For instance, a backing file may look like the
+/// following after some insertion and eviction. All the holes in file consume no storage
+/// space at all.
 ///
 /// 0                                                                             1GB
 /// +----------+----------+----------+-----------------+---------+---------+-------+
diff --git a/be/src/runtime/io/request-ranges.h b/be/src/runtime/io/request-ranges.h
index 29d105d..5f4053c 100644
--- a/be/src/runtime/io/request-ranges.h
+++ b/be/src/runtime/io/request-ranges.h
@@ -23,6 +23,7 @@
 #include <functional>
 
 #include <boost/thread/mutex.hpp>
+#include <gtest/gtest_prod.h> // for FRIEND_TEST
 
 #include "common/atomic.h"
 #include "common/hdfs.h"
@@ -198,6 +199,7 @@ struct BufferOpts {
  private:
   friend class ScanRange;
   friend class HdfsFileReader;
+  FRIEND_TEST(DataCacheTest, TestBasics);
 
   BufferOpts(
       bool try_cache, int64_t mtime, uint8_t* client_buffer, int64_t client_buffer_len)