Posted to commits@impala.apache.org by ta...@apache.org on 2018/02/27 21:45:37 UTC

[2/5] impala git commit: IMPALA-6530: Track time spent opening HDFS file handles

IMPALA-6530: Track time spent opening HDFS file handles

When the HDFS NameNode is overloaded, opening file
handles can be a significant source of query execution
time. Currently, there is no statistic to track this
time at the HDFS scan node level.

This introduces a statistic "TotalRawHdfsOpenFileTime(*)"
to track the time spent in hdfsOpenFile(). Here is
an example of this statistic populated for the query
"select * from functional_parquet.widetable_1000_cols",
which is dominated by file handle opening time:
- CachedFileHandlesHitCount: 0 (0)
- CachedFileHandlesMissCount: 1.00K (1001)
...
- ScannerThreadsTotalWallClockTime: 980.432ms
  - MaterializeTupleTime(*): 1.759ms
  - ScannerThreadsSysTime: 4.000ms
  - ScannerThreadsUserTime: 56.000ms
- TotalRawHdfsOpenFileTime(*): 894.285ms
- TotalRawHdfsReadTime(*): 25.188ms

To make TotalRawHdfsReadTime mutually exclusive
with TotalRawHdfsOpenFileTime, the read timer
moves from DiskIoMgr::ReadRange() to inside the
ScanRange::Read() function. This lets it exclude
the portion of ScanRange::Read() that gets a file
handle from the file handle cache.
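
A minimal sketch of the pattern, using simplified stand-ins
for Impala's counter and ScopedTimer classes rather than the
real ones: the read timer runs for the duration of the read
loop but is stopped around a file handle reopen, so that time
is charged to the open-file timer instead.

#include <chrono>
#include <cstdint>

// Simplified stand-ins for Impala's counter and ScopedTimer classes.
struct Counter { int64_t total_ns = 0; };

class ScopedTimer {
 public:
  explicit ScopedTimer(Counter* counter) : counter_(counter) { Start(); }
  ~ScopedTimer() { Stop(); }

  void Start() {
    if (running_) return;
    start_ = std::chrono::steady_clock::now();
    running_ = true;
  }

  // Stop() accumulates the elapsed time into the counter; the timer can
  // be restarted later, as the read path does around a handle reopen.
  void Stop() {
    if (!running_) return;
    counter_->total_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::steady_clock::now() - start_).count();
    running_ = false;
  }

 private:
  Counter* counter_;
  std::chrono::steady_clock::time_point start_;
  bool running_ = false;
};

// Hypothetical read path showing how reopen time is excluded from read time.
void ReadWithRetry(Counter* read_time, Counter* open_time) {
  ScopedTimer read_timer(read_time);      // feeds TotalRawHdfsReadTime(*)
  bool read_failed = true;                // pretend a cached handle went stale
  if (read_failed) {
    read_timer.Stop();                    // exclude the reopen from read time
    {
      ScopedTimer open_timer(open_time);  // feeds TotalRawHdfsOpenFileTime(*)
      // ... reopen the file handle here ...
    }
    read_timer.Start();                   // resume timing the retried read
  }
  // ... the read loop itself would run here under read_timer ...
}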

Change-Id: Ia560af2d9b12f158e8811900a7b9d98f8e760858
Reviewed-on: http://gerrit.cloudera.org:8080/9370
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/83ac412c
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/83ac412c
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/83ac412c

Branch: refs/heads/master
Commit: 83ac412cc1a5a702ae08d234fc813db0ba42db62
Parents: 93e7a72
Author: Joe McDonnell <jo...@cloudera.com>
Authored: Thu Feb 15 16:08:21 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Feb 27 07:40:47 2018 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-scan-node-base.cc  |   2 +
 be/src/exec/scan-node.cc            |   1 +
 be/src/exec/scan-node.h             |   3 +
 be/src/runtime/io/disk-io-mgr.cc    |   8 +-
 be/src/runtime/io/disk-io-mgr.h     |  11 ++-
 be/src/runtime/io/request-context.h |   7 ++
 be/src/runtime/io/scan-range.cc     | 126 +++++++++++++++++--------------
 be/src/util/impalad-metrics.cc      |   6 ++
 be/src/util/impalad-metrics.h       |   4 +
 common/thrift/metrics.json          |  10 +++
 10 files changed, 113 insertions(+), 65 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/be/src/exec/hdfs-scan-node-base.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node-base.cc b/be/src/exec/hdfs-scan-node-base.cc
index b6a51f6..89f3859 100644
--- a/be/src/exec/hdfs-scan-node-base.cc
+++ b/be/src/exec/hdfs-scan-node-base.cc
@@ -355,6 +355,7 @@ Status HdfsScanNodeBase::Open(RuntimeState* state) {
   // TODO: Revisit counters and move the counters specific to multi-threaded scans
   // into HdfsScanNode.
   read_timer_ = ADD_TIMER(runtime_profile(), TOTAL_HDFS_READ_TIMER);
+  open_file_timer_ = ADD_TIMER(runtime_profile(), TOTAL_HDFS_OPEN_FILE_TIMER);
   per_read_thread_throughput_counter_ = runtime_profile()->AddDerivedCounter(
       PER_READ_THREAD_THROUGHPUT_COUNTER, TUnit::BYTES_PER_SECOND,
       bind<int64_t>(&RuntimeProfile::UnitsPerSecond, bytes_read_counter_, read_timer_));
@@ -371,6 +372,7 @@ Status HdfsScanNodeBase::Open(RuntimeState* state) {
 
   reader_context_->set_bytes_read_counter(bytes_read_counter());
   reader_context_->set_read_timer(read_timer());
+  reader_context_->set_open_file_timer(open_file_timer());
   reader_context_->set_active_read_thread_counter(&active_hdfs_read_thread_counter_);
   reader_context_->set_disks_accessed_bitmap(&disks_accessed_bitmap_);
 

http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/be/src/exec/scan-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/scan-node.cc b/be/src/exec/scan-node.cc
index 27726f8..8a18a0c 100644
--- a/be/src/exec/scan-node.cc
+++ b/be/src/exec/scan-node.cc
@@ -38,6 +38,7 @@ const string ScanNode::BYTES_READ_COUNTER = "BytesRead";
 const string ScanNode::ROWS_READ_COUNTER = "RowsRead";
 const string ScanNode::COLLECTION_ITEMS_READ_COUNTER = "CollectionItemsRead";
 const string ScanNode::TOTAL_HDFS_READ_TIMER = "TotalRawHdfsReadTime(*)";
+const string ScanNode::TOTAL_HDFS_OPEN_FILE_TIMER = "TotalRawHdfsOpenFileTime(*)";
 const string ScanNode::TOTAL_HBASE_READ_TIMER = "TotalRawHBaseReadTime(*)";
 const string ScanNode::TOTAL_THROUGHPUT_COUNTER = "TotalReadThroughput";
 const string ScanNode::MATERIALIZE_TUPLE_TIMER = "MaterializeTupleTime(*)";

http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/be/src/exec/scan-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/scan-node.h b/be/src/exec/scan-node.h
index 0976e27..6e137af 100644
--- a/be/src/exec/scan-node.h
+++ b/be/src/exec/scan-node.h
@@ -108,6 +108,7 @@ class ScanNode : public ExecNode {
     return collection_items_read_counter_;
   }
   RuntimeProfile::Counter* read_timer() const { return read_timer_; }
+  RuntimeProfile::Counter* open_file_timer() const { return open_file_timer_; }
   RuntimeProfile::Counter* total_throughput_counter() const {
     return total_throughput_counter_;
   }
@@ -135,6 +136,7 @@ class ScanNode : public ExecNode {
   static const std::string ROWS_READ_COUNTER;
   static const std::string COLLECTION_ITEMS_READ_COUNTER;
   static const std::string TOTAL_HDFS_READ_TIMER;
+  static const std::string TOTAL_HDFS_OPEN_FILE_TIMER;
   static const std::string TOTAL_HBASE_READ_TIMER;
   static const std::string TOTAL_THROUGHPUT_COUNTER;
   static const std::string PER_READ_THREAD_THROUGHPUT_COUNTER;
@@ -168,6 +170,7 @@ class ScanNode : public ExecNode {
   /// [(2, [(3)]), (4, [])] this counter will be 3: (2, [(3)]), (3) and (4, [])
   RuntimeProfile::Counter* collection_items_read_counter_;
   RuntimeProfile::Counter* read_timer_; // total read time
+  RuntimeProfile::Counter* open_file_timer_; // total time spent opening file handles
   /// Wall based aggregate read throughput [bytes/sec]
   RuntimeProfile::Counter* total_throughput_counter_;
   /// Per thread read throughput [bytes/sec]

http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/be/src/runtime/io/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr.cc b/be/src/runtime/io/disk-io-mgr.cc
index 0ac3669..07e02b4 100644
--- a/be/src/runtime/io/disk-io-mgr.cc
+++ b/be/src/runtime/io/disk-io-mgr.cc
@@ -740,8 +740,6 @@ void DiskIoMgr::ReadRange(
     // Update counters.
     COUNTER_ADD_IF_NOT_NULL(reader->active_read_thread_counter_, 1L);
     COUNTER_BITOR_IF_NOT_NULL(reader->disks_accessed_bitmap_, 1LL << disk_queue->disk_id);
-    SCOPED_TIMER(&read_timer_);
-    SCOPED_TIMER(reader->read_timer_);
 
     read_status = range->Read(buffer_desc->buffer_, buffer_desc->buffer_len_,
         &buffer_desc->len_, &buffer_desc->eosr_);
@@ -841,6 +839,7 @@ int DiskIoMgr::AssignQueue(const char* file, int disk_id, bool expected_local) {
 
 ExclusiveHdfsFileHandle* DiskIoMgr::GetExclusiveHdfsFileHandle(const hdfsFS& fs,
     std::string* fname, int64_t mtime, RequestContext *reader) {
+  SCOPED_TIMER(reader->open_file_timer_);
   ExclusiveHdfsFileHandle* fid = new ExclusiveHdfsFileHandle(fs, fname->data(), mtime);
   if (!fid->ok()) {
     VLOG_FILE << "Opening the file " << fname << " failed.";
@@ -864,6 +863,7 @@ void DiskIoMgr::ReleaseExclusiveHdfsFileHandle(ExclusiveHdfsFileHandle* fid) {
 CachedHdfsFileHandle* DiskIoMgr::GetCachedHdfsFileHandle(const hdfsFS& fs,
     std::string* fname, int64_t mtime, RequestContext *reader) {
   bool cache_hit;
+  SCOPED_TIMER(reader->open_file_timer_);
   CachedHdfsFileHandle* fh = file_handle_cache_.GetFileHandle(fs, fname, mtime, false,
       &cache_hit);
   if (fh == nullptr) return nullptr;
@@ -887,8 +887,10 @@ void DiskIoMgr::ReleaseCachedHdfsFileHandle(std::string* fname,
 }
 
 Status DiskIoMgr::ReopenCachedHdfsFileHandle(const hdfsFS& fs, std::string* fname,
-    int64_t mtime, CachedHdfsFileHandle** fid) {
+    int64_t mtime, RequestContext* reader, CachedHdfsFileHandle** fid) {
   bool cache_hit;
+  SCOPED_TIMER(reader->open_file_timer_);
+  ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_REOPENED->Increment(1L);
   file_handle_cache_.ReleaseFileHandle(fname, *fid, true);
   // The old handle has been destroyed, so *fid must be overwritten before returning.
   *fid = file_handle_cache_.GetFileHandle(fs, fname, mtime, true,

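For intuition, here is a minimal sketch of the accounting the
hunk above wires up, with hypothetical types in place of
Impala's FileHandleCache API: lookups record hits and misses,
and a forced reopen destroys the suspect handle, opens a fresh
one, and bumps a reopened counter analogous to
impala-server.io.mgr.cached-file-handles-reopened.

#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for CachedHdfsFileHandle.
struct FileHandle { std::string path; };

class HandleCache {
 public:
  // Returns a handle for 'path'; '*cache_hit' reports whether an existing
  // handle was reused. A miss takes the slow path (an actual file open),
  // which is what the new open-file timer measures.
  std::shared_ptr<FileHandle> Get(const std::string& path, bool require_new,
                                  bool* cache_hit) {
    if (!require_new) {
      auto it = cache_.find(path);
      if (it != cache_.end()) {
        *cache_hit = true;
        ++hits_;
        return it->second;
      }
    }
    *cache_hit = false;
    ++misses_;
    auto handle = std::make_shared<FileHandle>(FileHandle{path});
    cache_[path] = handle;
    return handle;
  }

  // Destroys a suspect handle and opens a fresh one, as the read retry
  // path does after an error on a borrowed handle; counts the reopen.
  std::shared_ptr<FileHandle> Reopen(const std::string& path) {
    ++reopened_;
    cache_.erase(path);
    bool cache_hit;
    return Get(path, /*require_new=*/true, &cache_hit);
  }

  int64_t hits_ = 0;
  int64_t misses_ = 0;
  int64_t reopened_ = 0;

 private:
  std::unordered_map<std::string, std::shared_ptr<FileHandle>> cache_;
};
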
http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/be/src/runtime/io/disk-io-mgr.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr.h b/be/src/runtime/io/disk-io-mgr.h
index d246e95..760d0e9 100644
--- a/be/src/runtime/io/disk-io-mgr.h
+++ b/be/src/runtime/io/disk-io-mgr.h
@@ -343,7 +343,8 @@ class DiskIoMgr : public CacheLineAligned {
   bool Validate() const;
 
   /// Given a FS handle, name and last modified time of the file, construct a new
-  /// ExclusiveHdfsFileHandle. In the case of an error, returns nullptr.
+  /// ExclusiveHdfsFileHandle. This records the time spent opening the handle in
+  /// 'reader' and counts this as a cache miss. In the case of an error, returns nullptr.
   ExclusiveHdfsFileHandle* GetExclusiveHdfsFileHandle(const hdfsFS& fs,
       std::string* fname, int64_t mtime, RequestContext* reader);
 
@@ -351,7 +352,8 @@ class DiskIoMgr : public CacheLineAligned {
   void ReleaseExclusiveHdfsFileHandle(ExclusiveHdfsFileHandle* fid);
 
   /// Given a FS handle, name and last modified time of the file, gets a
-  /// CachedHdfsFileHandle from the file handle cache. On success, records statistics
+  /// CachedHdfsFileHandle from the file handle cache. Records the time spent
+  /// opening the handle in 'reader'. On success, records statistics
   /// about whether this was a cache hit or miss in the 'reader' as well as at the
   /// system level. In case of an error returns nullptr.
   CachedHdfsFileHandle* GetCachedHdfsFileHandle(const hdfsFS& fs,
@@ -361,9 +363,10 @@ class DiskIoMgr : public CacheLineAligned {
   void ReleaseCachedHdfsFileHandle(std::string* fname, CachedHdfsFileHandle* fid);
 
   /// Reopens a file handle by destroying the file handle and getting a fresh
-  /// file handle from the cache. Returns an error if the file could not be reopened.
+  /// file handle from the cache. Records the time spent reopening the handle
+  /// in 'reader'. Returns an error if the file could not be reopened.
   Status ReopenCachedHdfsFileHandle(const hdfsFS& fs, std::string* fname, int64_t mtime,
-      CachedHdfsFileHandle** fid);
+      RequestContext* reader, CachedHdfsFileHandle** fid);
 
   /// "Disk" queue offsets for remote accesses.  Offset 0 corresponds to
   /// disk ID (i.e. disk_queue_ index) of num_local_disks().

http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/be/src/runtime/io/request-context.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/request-context.h b/be/src/runtime/io/request-context.h
index 24fd0fc..b028596 100644
--- a/be/src/runtime/io/request-context.h
+++ b/be/src/runtime/io/request-context.h
@@ -128,6 +128,10 @@ class RequestContext {
 
   void set_read_timer(RuntimeProfile::Counter* read_timer) { read_timer_ = read_timer; }
 
+  void set_open_file_timer(RuntimeProfile::Counter* open_file_timer) {
+    open_file_timer_ = open_file_timer;
+  }
+
   void set_active_read_thread_counter(
       RuntimeProfile::Counter* active_read_thread_counter) {
    active_read_thread_counter_ = active_read_thread_counter;
@@ -245,6 +249,9 @@ class RequestContext {
   /// Total time spent in hdfs reading
   RuntimeProfile::Counter* read_timer_ = nullptr;
 
+  /// Total time spent opening hdfs file handles
+  RuntimeProfile::Counter* open_file_timer_ = nullptr;
+
   /// Number of active read threads
   RuntimeProfile::Counter* active_read_thread_counter_ = nullptr;
 

http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/be/src/runtime/io/scan-range.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/scan-range.cc b/be/src/runtime/io/scan-range.cc
index 9c2110c..4f7c38b 100644
--- a/be/src/runtime/io/scan-range.cc
+++ b/be/src/runtime/io/scan-range.cc
@@ -503,73 +503,83 @@ Status ScanRange::Read(
 
     int64_t max_chunk_size = MaxReadChunkSize();
     Status status = Status::OK();
-    while (*bytes_read < bytes_to_read) {
-      int chunk_size = min(bytes_to_read - *bytes_read, max_chunk_size);
-      DCHECK_GE(chunk_size, 0);
-      // The hdfsRead() length argument is an int.
-      DCHECK_LE(chunk_size, numeric_limits<int>::max());
-      int current_bytes_read = -1;
-      // bytes_read_ is only updated after the while loop
-      int64_t position_in_file = offset_ + bytes_read_ + *bytes_read;
-      int num_retries = 0;
-      while (true) {
-        status = Status::OK();
-        // For file handles from the cache, any of the below file operations may fail
-        // due to a bad file handle. In each case, record the error, but allow for a
-        // retry to fix it.
-        if (FLAGS_use_hdfs_pread) {
-          current_bytes_read = hdfsPread(fs_, hdfs_file, position_in_file,
-              buffer + *bytes_read, chunk_size);
-          if (current_bytes_read == -1) {
-            status = Status(TErrorCode::DISK_IO_ERROR,
-                GetHdfsErrorMsg("Error reading from HDFS file: ", file_));
-          }
-        } else {
-          // If the file handle is borrowed, it may not be at the appropriate
-          // location. Seek to the appropriate location.
-          bool seek_failed = false;
-          if (borrowed_hdfs_fh != nullptr) {
-            if (hdfsSeek(fs_, hdfs_file, position_in_file) != 0) {
-              status = Status(TErrorCode::DISK_IO_ERROR, Substitute("Error seeking to $0 "
-                  " in file: $1: $2", position_in_file, file_, GetHdfsErrorMsg("")));
-              seek_failed = true;
-            }
-          }
-          if (!seek_failed) {
-            current_bytes_read = hdfsRead(fs_, hdfs_file, buffer + *bytes_read,
-                chunk_size);
+    {
+      ScopedTimer<MonotonicStopWatch> io_mgr_read_timer(&io_mgr_->read_timer_);
+      ScopedTimer<MonotonicStopWatch> req_context_read_timer(reader_->read_timer_);
+      while (*bytes_read < bytes_to_read) {
+        int chunk_size = min(bytes_to_read - *bytes_read, max_chunk_size);
+        DCHECK_GE(chunk_size, 0);
+        // The hdfsRead() length argument is an int.
+        DCHECK_LE(chunk_size, numeric_limits<int>::max());
+        int current_bytes_read = -1;
+        // bytes_read_ is only updated after the while loop
+        int64_t position_in_file = offset_ + bytes_read_ + *bytes_read;
+        int num_retries = 0;
+        while (true) {
+          status = Status::OK();
+          // For file handles from the cache, any of the below file operations may fail
+          // due to a bad file handle. In each case, record the error, but allow for a
+          // retry to fix it.
+          if (FLAGS_use_hdfs_pread) {
+            current_bytes_read = hdfsPread(fs_, hdfs_file, position_in_file,
+                buffer + *bytes_read, chunk_size);
             if (current_bytes_read == -1) {
               status = Status(TErrorCode::DISK_IO_ERROR,
                   GetHdfsErrorMsg("Error reading from HDFS file: ", file_));
             }
+          } else {
+            // If the file handle is borrowed, it may not be at the appropriate
+            // location. Seek to the appropriate location.
+            bool seek_failed = false;
+            if (borrowed_hdfs_fh != nullptr) {
+              if (hdfsSeek(fs_, hdfs_file, position_in_file) != 0) {
+                status = Status(TErrorCode::DISK_IO_ERROR,
+                  Substitute("Error seeking to $0 in file: $1: $2", position_in_file,
+                      file_, GetHdfsErrorMsg("")));
+                seek_failed = true;
+              }
+            }
+            if (!seek_failed) {
+              current_bytes_read = hdfsRead(fs_, hdfs_file, buffer + *bytes_read,
+                  chunk_size);
+              if (current_bytes_read == -1) {
+                status = Status(TErrorCode::DISK_IO_ERROR,
+                    GetHdfsErrorMsg("Error reading from HDFS file: ", file_));
+              }
+            }
           }
-        }
 
-        // Do not retry:
-        // - if read was successful (current_bytes_read != -1)
-        // - or if already retried once
-        // - or if this not using a borrowed file handle
-        DCHECK_LE(num_retries, 1);
-        if (current_bytes_read != -1 || borrowed_hdfs_fh == nullptr ||
-            num_retries == 1) {
+          // Do not retry:
+          // - if read was successful (current_bytes_read != -1)
+          // - or if already retried once
+          // - or if this is not using a borrowed file handle
+          DCHECK_LE(num_retries, 1);
+          if (current_bytes_read != -1 || borrowed_hdfs_fh == nullptr ||
+              num_retries == 1) {
+            break;
+          }
+          // The error may be due to a bad file handle. Reopen the file handle and retry.
+          // Exclude this time from the read timers.
+          io_mgr_read_timer.Stop();
+          req_context_read_timer.Stop();
+          ++num_retries;
+          RETURN_IF_ERROR(io_mgr_->ReopenCachedHdfsFileHandle(fs_, file_string(),
+              mtime(), reader_, &borrowed_hdfs_fh));
+          hdfs_file = borrowed_hdfs_fh->file();
+          io_mgr_read_timer.Start();
+          req_context_read_timer.Start();
+        }
+        if (!status.ok()) break;
+        if (current_bytes_read == 0) {
+          // No more bytes in the file. The scan range went past the end.
+          *eosr = true;
           break;
         }
-        // The error may be due to a bad file handle. Reopen the file handle and retry.
-        ++num_retries;
-        RETURN_IF_ERROR(io_mgr_->ReopenCachedHdfsFileHandle(fs_, file_string(),
-            mtime(), &borrowed_hdfs_fh));
-        hdfs_file = borrowed_hdfs_fh->file();
-      }
-      if (!status.ok()) break;
-      if (current_bytes_read == 0) {
-        // No more bytes in the file. The scan range went past the end.
-        *eosr = true;
-        break;
-      }
-      *bytes_read += current_bytes_read;
+        *bytes_read += current_bytes_read;
 
-      // Collect and accumulate statistics
-      GetHdfsStatistics(hdfs_file);
+        // Collect and accumulate statistics
+        GetHdfsStatistics(hdfs_file);
+      }
     }
 
     if (borrowed_hdfs_fh != nullptr) {

http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/be/src/util/impalad-metrics.cc
----------------------------------------------------------------------
diff --git a/be/src/util/impalad-metrics.cc b/be/src/util/impalad-metrics.cc
index 32320d8..815e4af 100644
--- a/be/src/util/impalad-metrics.cc
+++ b/be/src/util/impalad-metrics.cc
@@ -66,6 +66,8 @@ const char* ImpaladMetricKeys::IO_MGR_CACHED_FILE_HANDLES_HIT_COUNT =
     "impala-server.io.mgr.cached-file-handles-hit-count";
 const char* ImpaladMetricKeys::IO_MGR_CACHED_FILE_HANDLES_MISS_COUNT =
     "impala-server.io.mgr.cached-file-handles-miss-count";
+const char* ImpaladMetricKeys::IO_MGR_CACHED_FILE_HANDLES_REOPENED =
+    "impala-server.io.mgr.cached-file-handles-reopened";
 const char* ImpaladMetricKeys::CATALOG_NUM_DBS =
     "catalog.num-databases";
 const char* ImpaladMetricKeys::CATALOG_NUM_TABLES =
@@ -117,6 +119,7 @@ IntCounter* ImpaladMetrics::IO_MGR_LOCAL_BYTES_READ = NULL;
 IntCounter* ImpaladMetrics::IO_MGR_SHORT_CIRCUIT_BYTES_READ = NULL;
 IntCounter* ImpaladMetrics::IO_MGR_CACHED_BYTES_READ = NULL;
 IntCounter* ImpaladMetrics::IO_MGR_BYTES_WRITTEN = NULL;
+IntCounter* ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_REOPENED = NULL;
 IntCounter* ImpaladMetrics::HEDGED_READ_OPS = NULL;
 IntCounter* ImpaladMetrics::HEDGED_READ_OPS_WIN = NULL;
 
@@ -215,6 +218,9 @@ void ImpaladMetrics::CreateMetrics(MetricGroup* m) {
   IO_MGR_CACHED_FILE_HANDLES_MISS_COUNT = m->AddGauge(
       ImpaladMetricKeys::IO_MGR_CACHED_FILE_HANDLES_MISS_COUNT, 0);
 
+  IO_MGR_CACHED_FILE_HANDLES_REOPENED = m->AddCounter(
+      ImpaladMetricKeys::IO_MGR_CACHED_FILE_HANDLES_REOPENED, 0);
+
   IO_MGR_BYTES_READ = m->AddCounter(ImpaladMetricKeys::IO_MGR_BYTES_READ, 0);
   IO_MGR_LOCAL_BYTES_READ = m->AddCounter(
       ImpaladMetricKeys::IO_MGR_LOCAL_BYTES_READ, 0);

http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/be/src/util/impalad-metrics.h
----------------------------------------------------------------------
diff --git a/be/src/util/impalad-metrics.h b/be/src/util/impalad-metrics.h
index a62c4c6..7de7aa8 100644
--- a/be/src/util/impalad-metrics.h
+++ b/be/src/util/impalad-metrics.h
@@ -97,6 +97,9 @@ class ImpaladMetricKeys {
   /// Number of cache misses for cached HDFS file handles
   static const char* IO_MGR_CACHED_FILE_HANDLES_MISS_COUNT;
 
+  /// Number of cached file handles that hit an error and were reopened
+  static const char* IO_MGR_CACHED_FILE_HANDLES_REOPENED;
+
   /// Number of DBs in the catalog
   static const char* CATALOG_NUM_DBS;
 
@@ -174,6 +177,7 @@ class ImpaladMetrics {
   static IntCounter* IO_MGR_CACHED_BYTES_READ;
   static IntCounter* IO_MGR_SHORT_CIRCUIT_BYTES_READ;
   static IntCounter* IO_MGR_BYTES_WRITTEN;
+  static IntCounter* IO_MGR_CACHED_FILE_HANDLES_REOPENED;
   static IntCounter* HEDGED_READ_OPS;
   static IntCounter* HEDGED_READ_OPS_WIN;
 

http://git-wip-us.apache.org/repos/asf/impala/blob/83ac412c/common/thrift/metrics.json
----------------------------------------------------------------------
diff --git a/common/thrift/metrics.json b/common/thrift/metrics.json
index 6328cd4..cce34d7 100644
--- a/common/thrift/metrics.json
+++ b/common/thrift/metrics.json
@@ -1575,6 +1575,16 @@
     "key": "impala-server.io.mgr.cached-file-handles-miss-count"
   },
   {
+    "description": "Number of cached HDFS file handles reopened",
+    "contexts": [
+      "IMPALAD"
+    ],
+    "label": "HDFS cached file handles reopened",
+    "units": "NONE",
+    "kind": "COUNTER",
+    "key": "impala-server.io.mgr.cached-file-handles-reopened"
+  },
+  {
     "description": "The number of active scratch directories for spilling to disk.",
     "contexts": [
       "IMPALAD"