You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by to...@apache.org on 2017/11/01 23:42:29 UTC
[4/4] kudu git commit: compaction_policy: out-of-line inessential fields in RowSetInfo

compaction_policy: out-of-line inessential fields in RowSetInfo

Based on analysis using toplev.py from pmu-tools[1] I found that the
compaction algorithm was memory-bandwidth bound for the benchmarks in
compaction_policy-test.

The core algorithms of the compaction policy only access a subset of
the fields in the object, meaning that some fields are accessed far
more frequently than others. This can also be illustrated by building
with -fsanitize=cache-efficiency-frag which generates the following
report for the RowSetInfo struct:

==8130==   # 0: offset = 0,	 size = 8,	 count = 20000,	 type = %"class.kudu::tablet::RowSet"*
==8130==   # 1: offset = 8,	 size = 4,	 count = 39998,	 type = i32
==8130==   # 2: offset = 12,	 size = 4,	 count = 107698980,	 type = i32
==8130==   # 3: offset = 16,	 size = 1,	 count = 39998,	 type = i8
==8130==   # 4: offset = 24,	 size = 32,	 count = 210001,	 type = %"class.std::__cxx11::basic_string" = type { %"struct.std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider", i64, %union.anon }
==8130==   # 5: offset = 56,	 size = 32,	 count = 210001,	 type = %"class.std::__cxx11::basic_string" = type { %"struct.std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider", i64, %union.anon }
==8130==   # 6: offset = 88,	 size = 8,	 count = 300940510,	 type = double
==8130==   # 7: offset = 96,	 size = 8,	 count = 200930507,	 type = double
==8130==   # 8: offset = 104,	 size = 8,	 count = 89700226,	 type = double

Here we can see that the second i32 (size_mb) as well as the three
doubles (cdf_min, cdf_max, and density) are accessed orders of
magnitude more than the other fields.

An easy fix for this is to out-of-line the other fields into an ancillary
struct elsewhere on the heap, and store just a pointer to the struct.
This ensures that the frequently-accessed fields fit into a smaller
memory footprint and thus involve far fewer memory accesses.

I benchmarked using both the 'TinyOverlap' case and the 'Ycsb' case,
measuring CPU cache misses at each level along with total runtime.

TinyOverlap case
------------------

Before:
 Performance counter stats for 'build/latest/bin/compaction_policy-test --gtest_filter=*TinyOver*' (10 runs):

        55,773,441      mem_load_retired.l1_miss                                      ( +-  0.91% )  (66.20%)
     1,200,071,033      mem_load_retired.l1_hit                                       ( +-  0.19% )  (66.51%)
        44,324,227      mem_load_retired.l2_miss                                      ( +-  1.21% )  (67.02%)
        11,525,953      mem_load_retired.l2_hit                                       ( +-  3.75% )  (67.34%)
            42,298      mem_load_retired.l3_miss                                      ( +-  8.56% )  (67.07%)
        44,461,423      mem_load_retired.l3_hit                                       ( +-  1.16% )  (66.44%)

       0.399781701 seconds time elapsed                                          ( +-  0.84% )
After:
 Performance counter stats for 'build/latest/bin/compaction_policy-test --gtest_filter=*TinyOver*' (10 runs):

        24,864,656      mem_load_retired.l1_miss                                      ( +-  1.37% )  (66.51%)
     1,261,166,288      mem_load_retired.l1_hit                                       ( +-  0.13% )  (66.73%)
         5,694,166      mem_load_retired.l2_miss                                      ( +-  4.37% )  (66.75%)
        18,221,204      mem_load_retired.l2_hit                                       ( +-  1.13% )  (66.97%)
            28,574      mem_load_retired.l3_miss                                      ( +- 16.39% )  (67.10%)
         5,829,783      mem_load_retired.l3_hit                                       ( +-  5.19% )  (66.63%)

       0.360860837 seconds time elapsed                                          ( +-  0.35% )

Summary of improvements:
  - ~10% improvement in wall time
  - L1 cache: 45% reduction in misses
  - L2 cache: 93% reduction in misses
  - L3 cache: 68% reduction in misses

YCSB test case
---------------

Before:
 Performance counter stats for 'build/latest/bin/compaction_policy-test --gtest_filter=*Ycsb*' (10 runs):

        97,654,120      mem_load_retired.l1_miss                                      ( +-  1.13% )  (66.41%)
     2,205,803,278      mem_load_retired.l1_hit                                       ( +-  0.13% )  (66.57%)
        80,632,313      mem_load_retired.l2_miss                                      ( +-  1.68% )  (66.73%)
        16,742,677      mem_load_retired.l2_hit                                       ( +-  3.77% )  (66.99%)
            60,874      mem_load_retired.l3_miss                                      ( +-  5.64% )  (67.02%)
        82,209,502      mem_load_retired.l3_hit                                       ( +-  1.08% )  (66.61%)

       0.747246686 seconds time elapsed                                          ( +-  0.49% )

After:
 Performance counter stats for 'build/latest/bin/compaction_policy-test --gtest_filter=*Ycsb*' (10 runs):

        39,568,838      mem_load_retired.l1_miss                                      ( +-  0.94% )  (66.49%)
     2,328,627,974      mem_load_retired.l1_hit                                       ( +-  0.12% )  (66.67%)
         6,673,404      mem_load_retired.l2_miss                                      ( +-  3.58% )  (66.76%)
        31,882,992      mem_load_retired.l2_hit                                       ( +-  1.11% )  (66.95%)
            59,963      mem_load_retired.l3_miss                                      ( +- 10.28% )  (66.95%)
         6,378,458      mem_load_retired.l3_hit                                       ( +-  3.19% )  (66.58%)

       0.676610184 seconds time elapsed                                          ( +-  0.44% )

Summary:
  - ~10% improvement in wall time
  - L1 cache: 59% reduction in misses
  - L2 cache: 92% reduction in misses
  - L3 cache: 1% reduction (apparently this workload always fit well
    in L3)

As this is the third in a set of patches for optimizations that were
concentrating on the "TinyOverlap" test case, I also verified
before/after of the "YCSB" compaction test case since the beginning of
series.  This also showed about a 14% improvement:

ycsb at start:
 Performance counter stats for 'build/latest/bin/compaction_policy-test --gtest_filter=*Ycsb*' (10 runs):

        795.924340      task-clock (msec)         #    0.998 CPUs utilized            ( +-  0.53% )
                27      context-switches          #    0.034 K/sec                    ( +- 60.21% )
                 0      cpu-migrations            #    0.000 K/sec                    ( +-100.00% )
             2,672      page-faults               #    0.003 M/sec                    ( +-  1.36% )
     2,998,040,105      cycles                    #    3.767 GHz                      ( +-  0.35% )
     7,457,801,099      instructions              #    2.49  insn per cycle           ( +-  0.00% )
     1,212,318,353      branches                  # 1523.158 M/sec                    ( +-  0.00% )
         9,741,331      branch-misses             #    0.80% of all branches          ( +-  0.03% )

       0.797407974 seconds time elapsed                                          ( +-  0.58% )

ycsb at end:
 Performance counter stats for 'build/latest/bin/compaction_policy-test --gtest_filter=*Ycsb*' (10 runs):

        681.047998      task-clock (msec)         #    0.999 CPUs utilized            ( +-  0.52% )
                 2      context-switches          #    0.003 K/sec                    ( +- 46.55% )
                 0      cpu-migrations            #    0.000 K/sec
             2,474      page-faults               #    0.004 M/sec                    ( +-  1.22% )
     2,581,500,560      cycles                    #    3.790 GHz                      ( +-  0.42% )
     7,267,792,155      instructions              #    2.82  insn per cycle           ( +-  0.00% )
     1,210,494,552      branches                  # 1777.400 M/sec                    ( +-  0.00% )
         9,790,631      branch-misses             #    0.81% of all branches          ( +-  0.02% )

       0.681425636 seconds time elapsed                                          ( +-  0.52% )

[1] https://github.com/andikleen/pmu-tools/wiki/toplev-manual

Change-Id: I148b8c976577cbe0020ed887f3bc4c7a2cb2c3ae
Reviewed-on: http://gerrit.cloudera.org:8080/8446
Tested-by: Kudu Jenkins
Reviewed-by: Adar Dembo <ad...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/9dd64e1b
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/9dd64e1b
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/9dd64e1b

Branch: refs/heads/master
Commit: 9dd64e1b3a3ab1a732fb6e0ababe66858cfbfa86
Parents: 810b960
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Nov 1 12:45:22 2017 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Wed Nov 1 23:41:57 2017 +0000

----------------------------------------------------------------------
 src/kudu/tablet/rowset_info.cc | 23 ++++++++++---------
 src/kudu/tablet/rowset_info.h  | 46 ++++++++++++++++++++++---------------
 src/kudu/tablet/rowset_tree.cc |  7 +++++-
 3 files changed, 46 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/9dd64e1b/src/kudu/tablet/rowset_info.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/rowset_info.cc b/src/kudu/tablet/rowset_info.cc
index fbe02a0..4c12780 100644
--- a/src/kudu/tablet/rowset_info.cc
+++ b/src/kudu/tablet/rowset_info.cc
@@ -257,12 +257,13 @@ void RowSetInfo::CollectOrdered(const RowSetTree& tree,
 }
 
 RowSetInfo::RowSetInfo(RowSet* rs, double init_cdf)
-  : rowset_(rs),
-    size_bytes_(rs->OnDiskBaseDataSizeWithRedos()),
-    size_mb_(std::max(implicit_cast<int>(size_bytes_ / 1024 / 1024), kMinSizeMb)),
-    cdf_min_key_(init_cdf),
-    cdf_max_key_(init_cdf) {
-  has_bounds_ = rs->GetBounds(&min_key_, &max_key_).ok();
+    : cdf_min_key_(init_cdf),
+      cdf_max_key_(init_cdf),
+      extra_(new ExtraData()) {
+  extra_->rowset = rs;
+  extra_->size_bytes = rs->OnDiskBaseDataSizeWithRedos();
+  extra_->has_bounds = rs->GetBounds(&extra_->min_key, &extra_->max_key).ok();
+  size_mb_ = std::max(implicit_cast<int>(extra_->size_bytes / 1024 / 1024), kMinSizeMb);
 }
 
 void RowSetInfo::FinalizeCDFVector(vector<RowSetInfo>* vec,
@@ -270,7 +271,7 @@ void RowSetInfo::FinalizeCDFVector(vector<RowSetInfo>* vec,
   if (quot == 0) return;
   for (RowSetInfo& cdf_rs : *vec) {
     CHECK_GT(cdf_rs.size_mb_, 0) << "Expected file size to be at least 1MB "
-                                 << "for RowSet " << cdf_rs.rowset_->ToString()
+                                 << "for RowSet " << cdf_rs.rowset()->ToString()
                                  << ", was " << cdf_rs.size_bytes()
                                  << " bytes.";
     cdf_rs.cdf_min_key_ /= quot;
@@ -282,12 +283,12 @@ void RowSetInfo::FinalizeCDFVector(vector<RowSetInfo>* vec,
 
 string RowSetInfo::ToString() const {
   string ret;
-  ret.append(rowset_->ToString());
+  ret.append(rowset()->ToString());
   StringAppendF(&ret, "(% 3dM) [%.04f, %.04f]", size_mb_,
                 cdf_min_key_, cdf_max_key_);
-  if (has_bounds_) {
-    ret.append(" [").append(KUDU_REDACT(Slice(min_key_).ToDebugString()));
-    ret.append(",").append(KUDU_REDACT(Slice(max_key_).ToDebugString()));
+  if (extra_->has_bounds) {
+    ret.append(" [").append(KUDU_REDACT(Slice(extra_->min_key).ToDebugString()));
+    ret.append(",").append(KUDU_REDACT(Slice(extra_->max_key).ToDebugString()));
     ret.append("]");
   }
   return ret;

http://git-wip-us.apache.org/repos/asf/kudu/blob/9dd64e1b/src/kudu/tablet/rowset_info.h
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/rowset_info.h b/src/kudu/tablet/rowset_info.h
index 96d1147..6cc5588 100644
--- a/src/kudu/tablet/rowset_info.h
+++ b/src/kudu/tablet/rowset_info.h
@@ -17,6 +17,8 @@
 #ifndef KUDU_TABLET_ROWSET_INFO_H_
 #define KUDU_TABLET_ROWSET_INFO_H_
 
+#include "kudu/gutil/ref_counted.h"
+
 #include <string>
 #include <vector>
 
@@ -41,7 +43,7 @@ class RowSetInfo {
                              std::vector<RowSetInfo>* min_key,
                              std::vector<RowSetInfo>* max_key);
 
-  int size_bytes() const { return size_bytes_; }
+  int size_bytes() const { return extra_->size_bytes; }
   int size_mb() const { return size_mb_; }
 
   // Return the value of the CDF at the minimum key of this candidate.
@@ -49,9 +51,9 @@ class RowSetInfo {
   // Return the value of the CDF at the maximum key of this candidate.
   double cdf_max_key() const { return cdf_max_key_; }
 
-  bool has_bounds() const { return has_bounds_; }
-  const std::string& min_key() const { return min_key_; }
-  const std::string& max_key() const { return max_key_; }
+  bool has_bounds() const { return extra_->has_bounds;  }
+  const std::string& min_key() const { return extra_->min_key; }
+  const std::string& max_key() const { return extra_->max_key; }
 
   // Return the "width" of the candidate rowset.
   //
@@ -64,7 +66,7 @@ class RowSetInfo {
 
   double density() const { return density_; }
 
-  RowSet* rowset() const { return rowset_; }
+  RowSet* rowset() const { return extra_->rowset; }
 
   std::string ToString() const;
 
@@ -81,24 +83,32 @@ class RowSetInfo {
   static void FinalizeCDFVector(std::vector<RowSetInfo>* vec,
                                 double quot);
 
-  RowSet* const rowset_;
-
-  // Cached version of rowset_->OnDiskBaseDataSize().
-  const int size_bytes_;
-
   // The size in MB, already clamped so that all rowsets have size at least
   // 1MB. This is cached to avoid the branch during the selection hot path.
-  const int size_mb_;
-
-  // True if the RowSet has known bounds.
-  // MemRowSets in particular do not.
-  bool has_bounds_;
-
-  // The bounds, if known.
-  std::string min_key_, max_key_;
+  int size_mb_;
 
   double cdf_min_key_, cdf_max_key_;
   double density_;
+
+  // We move these out of the RowSetInfo object because the std::strings are relatively
+  // large objects, and we'd like the RowSetInfos to be as small as possible so that
+  // the algorithm can fit mostly in CPU cache. The string bounds themselves are rarely
+  // accessed in the hot part of the algorithm so it's worth out-of-lining them.
+  //
+  // These are ref-counted so that RowSetInfo is copyable.
+  struct ExtraData : public RefCounted<ExtraData> {
+    // Cached version of rowset_->OnDiskBaseDataSize().
+    int size_bytes;
+
+    // True if the RowSet has known bounds.
+    // MemRowSets in particular do not.
+    bool has_bounds;
+    std::string min_key, max_key;
+
+    // The original RowSet that this RowSetInfo was constructed from.
+    RowSet* rowset;
+  };
+  const scoped_refptr<ExtraData> extra_;
 };
 
 } // namespace tablet

http://git-wip-us.apache.org/repos/asf/kudu/blob/9dd64e1b/src/kudu/tablet/rowset_tree.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/rowset_tree.cc b/src/kudu/tablet/rowset_tree.cc
index 073877a..829477f 100644
--- a/src/kudu/tablet/rowset_tree.cc
+++ b/src/kudu/tablet/rowset_tree.cc
@@ -66,9 +66,14 @@ struct QueryStruct {
 
 // Entry for use in the interval tree.
 struct RowSetWithBounds {
-  RowSet *rowset;
   string min_key;
   string max_key;
+
+  // NOTE: the ordering of struct fields here is purposeful: we access
+  // min_key and max_key frequently, so putting them first in the struct
+  // ensures they fill a single 64-byte cache line (each is 32 bytes).
+  // The 'rowset' pointer is accessed comparitively rarely.
+  RowSet *rowset;
 };
 
 // Traits struct for IntervalTree.