You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2020/08/11 01:23:43 UTC
[kudu] branch master updated: [benchmarks] added a few 'perf stat' metrics

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 1c5a04f  [benchmarks] added a few 'perf stat' metrics
1c5a04f is described below

commit 1c5a04f7774f348b3c9fc4fd39de98c858c87c5f
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Mon Aug 3 14:29:54 2020 -0700

    [benchmarks] added a few 'perf stat' metrics
    
    Added an ability to run most of the micro-benchmarks under 'perf stat'
    and record the following counters:
      * instructions per cycle
      * CPU utilization
      * branch misses
    
    These Performance Monitoring Counters (PMCs) are relatively stable
    in terms of 'run to run' with the same binary at the same machine
    (under 5% pct deviation).  They provide an extra coverage in addition
    to the application-specific metrics directly output by the tests.  The
    introduced PMCs show how effective the code is in using available
    hardware resources, whether the code is instruction or memory bound,
    etc. For example, see [1] and [2] how to interpret values
    of the instructions-per-cycle (IPC) performance counters.
    
    It seems these new performance metrics cannot be used for release-gating
    decisions, but they are useful in tracking improvements and regressions
    in the code, especially when updating common sub-components like
    third-party libraries, etc.
    
    For each of the mentioned performance counters, a graph is generated
    with data points for all the tests run under 'perf stat'.
    
    As of now, the following existing tests are not run under `perf stat`
    and corresponding TODOs are added:
      * tests based on external test mini-cluster (need to track performance
        of spawn kudu-tserver and kudu-master processes, but that's not
        available yet, need to implement that)
      * full_stack-insert-scan-test (custom-tailored --perf_stat_scan
        provision are not working for some reason, need fix that)
    
    To disable the collection of these performance counters, set
    RUN_PERF_STAT to 0 (it's set to 1 by default).
    
    [1] http://www.brendangregg.com/blog/2017-05-09/cpu-utilization-is-wrong.html
    [2] http://www.brendangregg.com/blog/2017-05-04/the-pmcs-of-ec2.html
    
    Change-Id: Ib742ffbaabca96c2ac455481313348da2133ff6e
    Reviewed-on: http://gerrit.cloudera.org:8080/16284
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Tested-by: Alexey Serbin <as...@cloudera.com>
---
 src/kudu/scripts/benchmarks.sh | 362 +++++++++++++++++++++++++++--------------
 1 file changed, 241 insertions(+), 121 deletions(-)

diff --git a/src/kudu/scripts/benchmarks.sh b/src/kudu/scripts/benchmarks.sh
index 6154c7d..5a16d3a 100755
--- a/src/kudu/scripts/benchmarks.sh
+++ b/src/kudu/scripts/benchmarks.sh
@@ -43,7 +43,7 @@ LOCAL_STATS_BASE="local-stats"
 NUM_MT_TABLET_TESTS=5
 MT_TABLET_TEST=mt-tablet-test
 RPC_BENCH_TEST=RpcBenchBenchmark
-CBTREE_TEST=cbtree-test
+CBTREE_TEST=ConcurrentBTreeScan
 BLOOM_TEST=BloomfileBenchmark
 MT_BLOOM_TEST=MultithreadedBloomfileBenchmark
 WIRE_PROTOCOL_TEST=WireProtocolBenchmark
@@ -80,6 +80,17 @@ HTML_FILE="benchmarks.html"
 # Most tests will run this many times.
 NUM_SAMPLES=${NUM_SAMPLES:-10}
 
+# Whether to run the benchmarks under 'perf stat', recording and plotting
+# the statistics collected by 'perf stat'.  The 'perf stat' has the
+# least overhead among all sub-commands of the 'perf' suite (e.g., see
+# http://www.brendangregg.com/perf.html).
+RUN_PERF_STAT=${RUN_PERF_STAT:-1}
+if [ $RUN_PERF_STAT -ne 0 -o "$RUN_PERF_STAT" = x"yes" ]; then
+  PERF_STAT="perf stat"
+else
+  PERF_STAT=""
+fi
+
 ################################################################
 # Global variables
 ################################################################
@@ -103,6 +114,24 @@ usage_and_die() {
   exit 1
 }
 
+perf_stat_enabled() {
+  if [ -z "$PERF_STAT" ]; then
+    return 1
+  fi
+  return 0
+}
+
+# Ensure the necessary performance tools are present.
+check_tools() {
+  if perf_stat_enabled; then
+    # Check for 'perf' assuming the 'stat' sub-tool is always available.
+    if ! which perf > /dev/null; then
+      return 1
+    fi
+  fi
+  return 0
+}
+
 ensure_cpu_scaling() {
   $BASE_DIR/src/kudu/scripts/ensure_cpu_scaling.sh "$@"
 }
@@ -143,7 +172,7 @@ write_img_plot() {
   local y_axis_label="$3"
   # Rscript fails when there's only a header, so just skip
   if [ `wc -l $input_file | cut -d ' ' -f1` -gt 1 ]; then
-    Rscript jobs_runtime.R $input_file $test_name "$y_axis_label"
+    Rscript jobs_runtime.R "$input_file" "$test_name" "$y_axis_label"
   fi
 }
 
@@ -191,8 +220,10 @@ run_benchmarks() {
   mkdir -p "$LOGDIR"
   mkdir -p "$OUTDIR"
 
+  export KUDU_ALLOW_SLOW_TESTS=true
+
   # run all of the variations of mt-tablet-test
-  ./build/latest/bin/mt-tablet-test \
+  $PERF_STAT ./build/latest/bin/mt-tablet-test \
     --gtest_filter=\*DoTestAllAtOnce\* \
     --num_counter_threads=0 \
     --tablet_test_flush_threshold_mb=32 \
@@ -204,20 +235,26 @@ run_benchmarks() {
 
   # run rpc-bench test 5 times. 10 seconds per run
   for i in $(seq 1 $NUM_SAMPLES); do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/rpc-bench \
-      --gtest_filter=*BenchmarkCalls &> $LOGDIR/$RPC_BENCH_TEST$i.log
+    $PERF_STAT ./build/latest/bin/rpc-bench \
+      --gtest_filter=*BenchmarkCalls \
+      &> $LOGDIR/$RPC_BENCH_TEST$i.log
   done
 
   # run cbtree-test 5 times. 20 seconds per run
   for i in $(seq 1 $NUM_SAMPLES); do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/cbtree-test \
-      --gtest_filter=TestCBTree.TestScanPerformance &> $LOGDIR/${CBTREE_TEST}$i.log
+    $PERF_STAT ./build/latest/bin/cbtree-test \
+      --gtest_filter=TestCBTree.TestScanPerformance \
+      &> $LOGDIR/${CBTREE_TEST}$i.log
   done
 
   # run bloomfile-test 5 times. ~3.3 seconds per run
   for i in $(seq 1 $NUM_SAMPLES); do
-    ./build/latest/bin/bloomfile-test --benchmark_queries=10000000 --bloom_size_bytes=32768 \
-      --n_keys=100000 --gtest_filter=*Benchmark &> $LOGDIR/$BLOOM_TEST$i.log
+    $PERF_STAT ./build/latest/bin/bloomfile-test \
+      --gtest_filter=*Benchmark \
+      --benchmark_queries=10000000 \
+      --bloom_size_bytes=32768 \
+      --n_keys=100000 \
+      &> $LOGDIR/$BLOOM_TEST$i.log
   done
 
   # run mt-bloomfile-test 5 times. 20-30 seconds per run.
@@ -225,46 +262,55 @@ run_benchmarks() {
   # set the cache capacity to be only 100M to ensure that the cache
   # churns a lot.
   for i in $(seq 1 $NUM_SAMPLES); do
-    ./build/latest/bin/mt-bloomfile-test --benchmark_queries=2000000 --bloom_size_bytes=4096 \
-      --n_keys=150000000 --block_cache_capacity_mb=100 &> $LOGDIR/$MT_BLOOM_TEST$i.log
+    $PERF_STAT ./build/latest/bin/mt-bloomfile-test \
+      --benchmark_queries=2000000 --bloom_size_bytes=4096 \
+      --n_keys=150000000 --block_cache_capacity_mb=100 \
+      &> $LOGDIR/$MT_BLOOM_TEST$i.log
   done
 
   # run wire_protocol-test 5 times.
   #
   # We run the non-null 10-column benchmark, selecting all rows.
   for i in $(seq 1 $NUM_SAMPLES); do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/wire_protocol-test --gtest_filter=*Benchmark/10_int64_non_null_sel_100pct \
+    $PERF_STAT ./build/latest/bin/wire_protocol-test \
+      --gtest_filter=*Benchmark/10_int64_non_null_sel_100pct \
       &> $LOGDIR/$WIRE_PROTOCOL_TEST$i.log
   done
 
   # run compaction-test 5 times, 6 seconds each
   for i in $(seq 1 $NUM_SAMPLES); do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/compaction-test \
-      --gtest_filter=TestCompaction.BenchmarkMerge* &> $LOGDIR/${COMPACT_MERGE_BENCH}$i.log
+    $PERF_STAT ./build/latest/bin/compaction-test \
+      --gtest_filter=TestCompaction.BenchmarkMerge* \
+      &> $LOGDIR/${COMPACT_MERGE_BENCH}$i.log
   done
 
   # run memrowset benchmark 5 times, ~10 seconds per run
   for i in $(seq 1 $NUM_SAMPLES) ; do
-    ./build/latest/bin/memrowset-test --roundtrip_num_rows=10000000 \
-        --gtest_filter=\*InsertCount\* &> $LOGDIR/${MEMROWSET_BENCH}$i.log
+    $PERF_STAT ./build/latest/bin/memrowset-test \
+      --roundtrip_num_rows=10000000 \
+      --gtest_filter=\*InsertCount\* \
+      &> $LOGDIR/${MEMROWSET_BENCH}$i.log
   done
 
   # Run single-threaded TS insert latency benchmark, 5-6 seconds per run
   for i in $(seq 1 $NUM_SAMPLES) ; do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/tablet_server-test \
+    $PERF_STAT ./build/latest/bin/tablet_server-test \
       --gtest_filter=*MicroBench* \
       --single_threaded_insert_latency_bench_warmup_rows=1000 \
-      --single_threaded_insert_latency_bench_insert_rows=10000 &> $LOGDIR/${TS_INSERT_LATENCY}$i.log
+      --single_threaded_insert_latency_bench_insert_rows=10000 \
+      &> $LOGDIR/${TS_INSERT_LATENCY}$i.log
   done
 
   # Run multi-threaded TS insert benchmark
   for i in $(seq 1 $NUM_SAMPLES) ; do
-    KUDU_ALLOW_SLOW_TESTS=1 build/latest/bin/tablet_server-stress-test \
-      --num_inserts_per_thread=30000 -runtime_secs=0 &> $LOGDIR/${TS_8THREAD_BENCH}$i.log
+    $PERF_STAT ./build/latest/bin/tablet_server-stress-test \
+      --num_inserts_per_thread=30000 -runtime_secs=0 \
+      &> $LOGDIR/${TS_8THREAD_BENCH}$i.log
   done
 
   # Run full stack scan/insert test using MRS only, ~26s each
   for i in $(seq 1 $NUM_SAMPLES) ; do
+    # TODO(aserbin): fix --perf_stat_scan and enable stats collection
     ./build/latest/bin/full_stack-insert-scan-test \
       --gtest_filter=FullStackInsertScanTest.MRSOnlyStressTest \
       --concurrent_inserts=50 \
@@ -275,6 +321,7 @@ run_benchmarks() {
 
   # Run full stack scan/insert test with disk, ~50s each
   for i in $(seq 1 $NUM_SAMPLES) ; do
+    # TODO(aserbin): fix --perf_stat_scan and enable stats collection
     ./build/latest/bin/full_stack-insert-scan-test \
       --gtest_filter=FullStackInsertScanTest.WithDiskStressTest \
       --concurrent_inserts=50 \
@@ -286,6 +333,8 @@ run_benchmarks() {
   # Run dense storage node test, ~150s each.
   #
   # Needs to run as root for measure_startup_drop_caches.
+  # TODO(aserbin): add performance counter collection once it's possible to run
+  #                external minicluster's components under 'perf stat'
   for i in $(seq 1 $NUM_SAMPLES) ; do
     sudo ./build/latest/bin/dense_node-itest \
       --num_seconds=60 \
@@ -299,7 +348,7 @@ run_benchmarks() {
   # Run BlockingQueue concurrency test with 3 writers and 3 readers,
   # (i.e. symmetric in number of readers/writers), no non-blocking writers.
   for i in $(seq 1 $NUM_SAMPLES) ; do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/blocking_queue-test \
+    $PERF_STAT ./build/latest/bin/blocking_queue-test \
       --gtest_filter=BlockingQueueMultiThreadPerfTest.RequestRates \
       --num_blocking_writers=3 \
       --num_blocking_readers=3 \
@@ -310,7 +359,7 @@ run_benchmarks() {
   # Run BlockingQueue concurrency test with 3 writers
   # (2 blocking, 1 non-blocking) and 1 reader.
   for i in $(seq 1 $NUM_SAMPLES) ; do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/blocking_queue-test \
+    $PERF_STAT ./build/latest/bin/blocking_queue-test \
       --gtest_filter=BlockingQueueMultiThreadPerfTest.RequestRates \
       --num_blocking_writers=2 \
       --num_blocking_readers=1 \
@@ -320,14 +369,14 @@ run_benchmarks() {
 
   # Run ConcurrentGetTableSchemaTest.RPC with and without authz.
   for i in $(seq 1 $NUM_SAMPLES) ; do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/master-test \
+    $PERF_STAT ./build/latest/bin/master-test \
       --gtest_filter='*/ConcurrentGetTableSchemaTest.Rpc/*' \
       &> $LOGDIR/${GET_TABLE_SCHEMA_RPC}$i.log
   done
 
   # Run ConcurrentGetTableSchemaTest.DirectMethodCall with and without authz.
   for i in $(seq 1 $NUM_SAMPLES) ; do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/master-test \
+    $PERF_STAT ./build/latest/bin/master-test \
       --gtest_filter='*/ConcurrentGetTableSchemaTest.DirectMethodCall/*' \
       &> $LOGDIR/${GET_TABLE_SCHEMA_DIRECT_CALL}$i.log
   done
@@ -335,35 +384,52 @@ run_benchmarks() {
   # Run GetTableLocationsBenchmark, with and without cache.
   for capacity_mb in 0 32 ; do
     for i in $(seq 1 $NUM_SAMPLES) ; do
-      KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/table_locations-itest \
+      $PERF_STAT ./build/latest/bin/table_locations-itest \
         --gtest_filter=TableLocationsTest.GetTableLocationsBenchmark \
         --rpc_num_service_threads=8 \
         --benchmark_num_threads=12 \
         --table_locations_cache_capacity_mb=$capacity_mb \
-        &> $LOGDIR/${GET_TABLE_LOCATIONS_RPC}_${capacity_mb}_$i.log
+        &> $LOGDIR/${GET_TABLE_LOCATIONS_RPC}_${capacity_mb}$i.log
     done
   done
 
   # Run GetTableLocationsBenchmarkFunctionCall, with and without cache.
   for capacity_mb in 0 32 ; do
     for i in $(seq 1 $NUM_SAMPLES) ; do
-      KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/table_locations-itest \
+      $PERF_STAT ./build/latest/bin/table_locations-itest \
         --gtest_filter=TableLocationsTest.GetTableLocationsBenchmarkFunctionCall \
         --benchmark_num_threads=12 \
         --table_locations_cache_capacity_mb=$capacity_mb \
-        &> $LOGDIR/${GET_TABLE_LOCATIONS_DIRECT_CALL}_${capacity_mb}_$i.log
+        &> $LOGDIR/${GET_TABLE_LOCATIONS_DIRECT_CALL}_${capacity_mb}$i.log
     done
   done
 
   # Run SameTabletConcurrentWritesTest.InsertsOnly with 16 inserter threads.
   for i in $(seq 1 $NUM_SAMPLES) ; do
-    KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/same_tablet_concurrent_writes-itest \
+    $PERF_STAT ./build/latest/bin/same_tablet_concurrent_writes-itest \
       --gtest_filter='SameTabletConcurrentWritesTest.InsertsOnly' \
       --num_inserter_threads=16 \
       &> $LOGDIR/${SAME_TABLET_CONCURRENT_WRITES}$i.log
   done
 }
 
+parse_and_record_perf_stats() {
+  if ! perf_stat_enabled; then
+    return
+  fi
+  local build_id=$1
+  local test_id=$2
+  local iter=$3
+  local log=$4
+
+  local ins_per_cycle=$(grep -o '# .* ins.* per cycle' $log | awk '{print $2}')
+  record_result $build_id perf_stat_ins_per_cycle_${test_id} $iter $ins_per_cycle
+  local cpu_util=$(grep -o '# .* CPUs utilized' $log | awk '{print $2}')
+  record_result $build_id perf_stat_cpu_util_${test_id} $iter $cpu_util
+  local branch_misses=$(grep -o '# .* of all branches' $log | awk '{print $2}' | sed 's/%//g')
+  record_result $build_id perf_stat_branch_misses_${test_id} $iter $branch_misses
+}
+
 parse_and_record_all_results() {
   local BUILD_IDENTIFIER="$1"
 
@@ -388,56 +454,68 @@ parse_and_record_all_results() {
     timing=`sed -n "${linenumber}p" $LOGDIR/${MT_TABLET_TEST_TIMINGS}.txt`
     record_result $BUILD_IDENTIFIER MultiThreadedTabletTest_$i 1 `echo $timing / 1000 | bc -l`
   done
+  parse_and_record_perf_stats $BUILD_IDENTIFIER $MT_TABLET_TEST 1 $LOGDIR/${MT_TABLET_TEST}.log
 
   # parse out the real time from: "Time spent Insert 10000000 keys: real 16.438s user 16.164s  sys 0.229s"
   for i in $(seq 1 $NUM_SAMPLES); do
-    real=`grep "Time spent Insert" $LOGDIR/${CBTREE_TEST}$i.log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ConcurrentBTreeScanInsert $i $real
-  done
+    local id=$CBTREE_TEST
+    local log=$LOGDIR/${id}${i}.log
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
 
-  for i in $(seq 1 $NUM_SAMPLES); do
-    real=`grep "not frozen" $LOGDIR/${CBTREE_TEST}$i.log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ConcurrentBTreeScanNotFrozen $i $real
-  done
+    real=`grep "Time spent Insert" $log | ./parse_real_out.sh`
+    record_result $BUILD_IDENTIFIER ${id}Insert $i $real
 
-  for i in $(seq 1 $NUM_SAMPLES); do
-    real=`grep "(frozen" $LOGDIR/${CBTREE_TEST}$i.log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ConcurrentBTreeScanFrozen $i $real
+    real=`grep "not frozen" $log | ./parse_real_out.sh`
+    record_result $BUILD_IDENTIFIER ${id}NotFrozen $i $real
+
+    real=`grep "(frozen" $log | ./parse_real_out.sh`
+    record_result $BUILD_IDENTIFIER ${id}Frozen $i $real
   done
 
   # parse out the real time from "Time spent with overlap: real 0.557s user 0.546s sys 0.010s"
   for i in $(seq 1 $NUM_SAMPLES); do
-    real=`grep "with overlap" $LOGDIR/${COMPACT_MERGE_BENCH}$i.log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ${COMPACT_MERGE_BENCH}${WITH_OVERLAP} $i $real
-  done
+    local id=$COMPACT_MERGE_BENCH
+    local log=$LOGDIR/${id}${i}.log
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
 
-  for i in $(seq 1 $NUM_SAMPLES); do
-    real=`grep "without overlap" $LOGDIR/${COMPACT_MERGE_BENCH}$i.log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ${COMPACT_MERGE_BENCH}${NO_OVERLAP} $i $real
+    real=`grep "with overlap" $log | ./parse_real_out.sh`
+    record_result $BUILD_IDENTIFIER ${id}${WITH_OVERLAP} $i $real
+
+    real=`grep "without overlap" $log | ./parse_real_out.sh`
+    record_result $BUILD_IDENTIFIER ${id}${NO_OVERLAP} $i $real
   done
 
   # parse out time from MRS benchmarks
   for i in $(seq 1 $NUM_SAMPLES); do
-    log=$LOGDIR/${MEMROWSET_BENCH}$i.log
+    local id=$MEMROWSET_BENCH
+    local log=$LOGDIR/${id}${i}.log
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
+
     real=`grep "Time spent Inserting" $log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ${MEMROWSET_BENCH}${INSERT} $i $real
+    record_result $BUILD_IDENTIFIER ${id}${INSERT} $i $real
     real=`grep "Time spent Scanning rows where none" $log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ${MEMROWSET_BENCH}${SCAN_NONE_COMMITTED} $i $real
+    record_result $BUILD_IDENTIFIER ${id}${SCAN_NONE_COMMITTED} $i $real
     real=`grep "Time spent Scanning rows where all" $log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ${MEMROWSET_BENCH}${SCAN_ALL_COMMITTED} $i $real
+    record_result $BUILD_IDENTIFIER ${id}${SCAN_ALL_COMMITTED} $i $real
   done
 
   # Parse out the real time from: "Time spent Running 10000000 queries: real 3.281s  user 3.273s sys 0.000s"
   for i in $(seq 1 $NUM_SAMPLES); do
-    real=`grep "Time spent Running" $LOGDIR/$BLOOM_TEST$i.log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER $BLOOM_TEST $i $real
+    local id=$BLOOM_TEST
+    local log=$LOGDIR/${id}${i}.log
+    local real=`grep "Time spent Running" $log | ./parse_real_out.sh`
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
+    record_result $BUILD_IDENTIFIER $id $i $real
   done
 
   # Parse out the real time from: "Time spent Running 2000000 queries: real 28.193s user 26.903s sys 1.032s"
   # Many threads output their value, we keep the last;
   for i in $(seq 1 $NUM_SAMPLES); do
-    real=`grep "Time spent Running" $LOGDIR/$MT_BLOOM_TEST$i.log | tail -n1 | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER $MT_BLOOM_TEST $i $real
+    local id=$MT_BLOOM_TEST
+    local log=$LOGDIR/${id}${i}.log
+    local real=`grep "Time spent Running" $log | tail -n1 | ./parse_real_out.sh`
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
+    record_result $BUILD_IDENTIFIER $id $i $real
   done
 
   # Parse out row-wise rate of cycles/cell from:
@@ -445,73 +523,88 @@ parse_and_record_all_results() {
   # Parse out the columnar rate of cycles/cell from:
   #   "Converting 10_int64_non_null to PB (method columnar) row select rate 1: 1.313369 cycles/cell"
   for i in $(seq 1 $NUM_SAMPLES); do
-    real_rowwise=`grep "Converting.*to PB (method row-wise)" $LOGDIR/$WIRE_PROTOCOL_TEST$i.log | sed 's|^.*: \([[:digit:].]*\) cycles/cell$|\1|'`
-    record_result $BUILD_IDENTIFIER $WIRE_PROTOCOL_TEST $i $real_rowwise
+    local id=$WIRE_PROTOCOL_TEST
+    local log=$LOGDIR/${id}${i}.log
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
+
+    local real_rowwise=`grep "Converting.*to PB (method row-wise)" $log | sed 's|^.*: \([[:digit:].]*\) cycles/cell$|\1|'`
+    record_result $BUILD_IDENTIFIER $id $i $real_rowwise
 
-    real_colwise=`grep "Converting.*to PB (method columnar)" $LOGDIR/$WIRE_PROTOCOL_TEST$i.log | sed 's|^.*: \([[:digit:].]*\) cycles/cell$|\1|'`
-    record_result $BUILD_IDENTIFIER ${WIRE_PROTOCOL_TEST}_columnar $i $real_colwise
+    local real_colwise=`grep "Converting.*to PB (method columnar)" $log | sed 's|^.*: \([[:digit:].]*\) cycles/cell$|\1|'`
+    record_result $BUILD_IDENTIFIER ${id}_columnar $i $real_colwise
   done
 
   # parse the rate out of: "I1009 15:00:30.023576 27043 rpc-bench.cc:108] Reqs/sec:         84404.4"
   for i in $(seq 1 $NUM_SAMPLES); do
-    rate=`grep Reqs $LOGDIR/$RPC_BENCH_TEST$i.log | cut -d ":" -f 5 | tr -d ' '`
-    record_result $BUILD_IDENTIFIER $RPC_BENCH_TEST $i $rate
+    local id=$RPC_BENCH_TEST
+    local log=$LOGDIR/${id}${i}.log
+    local rate=`grep Reqs $log | cut -d ":" -f 5 | tr -d ' '`
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
+    record_result $BUILD_IDENTIFIER $id $i $rate
   done
 
   # parse latency numbers from single-threaded tserver benchmark
   for i in $(seq 1 $NUM_SAMPLES); do
+    local id=$TS_INSERT_LATENCY
+    local log=$LOGDIR/${id}${i}.log
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
     for metric in min mean percentile_95 percentile_99 percentile_99_9 ; do
-      val=$(grep "\"$metric\": " $LOGDIR/${TS_INSERT_LATENCY}$i.log | awk '{print $2}' | sed -e 's/,//')
-      record_result $BUILD_IDENTIFIER ${TS_INSERT_LATENCY}_$metric $i $val
+      val=$(grep "\"$metric\": " $log | awk '{print $2}' | sed -e 's/,//')
+      record_result $BUILD_IDENTIFIER ${id}_${metric} $i $val
     done
   done
 
   # parse latency and throughput numbers from multi-threaded tserver benchmark
   for i in $(seq 1 $NUM_SAMPLES); do
-    local log=$LOGDIR/${TS_8THREAD_BENCH}$i.log
+    local id=$TS_8THREAD_BENCH
+    local log=$LOGDIR/${id}${i}.log
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
     for metric in min mean percentile_95 percentile_99 percentile_99_9 ; do
       val=$(grep "\"$metric\": " $log | awk '{print $2}' | sed -e 's/,//')
-      record_result $BUILD_IDENTIFIER ${TS_8THREAD_BENCH}_${metric}_latency $i $val
+      record_result $BUILD_IDENTIFIER ${id}_${metric}_latency $i $val
     done
     rate=$(grep -o 'Throughput.*' $log | awk '{print $2}')
-    record_result $BUILD_IDENTIFIER ${TS_8THREAD_BENCH}_throughput $i $rate
+    record_result $BUILD_IDENTIFIER ${id}_throughput $i $rate
     rate=$(grep -o 'CPU efficiency.*' $log | awk '{print $3}')
-    record_result $BUILD_IDENTIFIER ${TS_8THREAD_BENCH}_cpu_efficiency $i $rate
+    record_result $BUILD_IDENTIFIER ${id}_cpu_efficiency $i $rate
   done
 
   # parse scan timings for scans and inserts with MRS only
   for i in $(seq 1 $NUM_SAMPLES); do
-    local log=$LOGDIR/${FS_SCANINSERT_MRS}$i.log
+    local id=$FS_SCANINSERT_MRS
+    local log=$LOGDIR/${id}${i}.log
     insert=`grep "Time spent concurrent inserts" $log | ./parse_real_out.sh`
     scan_full=`grep "Time spent full schema scan" $log | ./parse_real_out.sh`
     scan_str=`grep "Time spent String projection" $log | ./parse_real_out.sh`
     scan_int32=`grep "Time spent Int32 projection" $log | ./parse_real_out.sh`
     scan_int64=`grep "Time spent Int64 projection" $log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_insert $i $insert
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_scan_full $i $scan_full
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_scan_str $i $scan_str
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_scan_int32 $i $scan_int32
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_scan_int64 $i $scan_int64
+    record_result $BUILD_IDENTIFIER ${id}_insert $i $insert
+    record_result $BUILD_IDENTIFIER ${id}_scan_full $i $scan_full
+    record_result $BUILD_IDENTIFIER ${id}_scan_str $i $scan_str
+    record_result $BUILD_IDENTIFIER ${id}_scan_int32 $i $scan_int32
+    record_result $BUILD_IDENTIFIER ${id}_scan_int64 $i $scan_int64
   done
 
   # parse scan timings for scans and inserts with disk
   for i in $(seq 1 $NUM_SAMPLES); do
-    local log=$LOGDIR/${FS_SCANINSERT_DISK}$i.log
+    local id=$FS_SCANINSERT_DISK
+    local log=$LOGDIR/${id}${i}.log
     insert=`grep "Time spent concurrent inserts" $log | ./parse_real_out.sh`
     scan_full=`grep "Time spent full schema scan" $log | ./parse_real_out.sh`
     scan_str=`grep "Time spent String projection" $log | ./parse_real_out.sh`
     scan_int32=`grep "Time spent Int32 projection" $log | ./parse_real_out.sh`
     scan_int64=`grep "Time spent Int64 projection" $log | ./parse_real_out.sh`
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_insert $i $insert
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_scan_full $i $scan_full
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_scan_str $i $scan_str
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_scan_int32 $i $scan_int32
-    record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_scan_int64 $i $scan_int64
+    record_result $BUILD_IDENTIFIER ${id}_insert $i $insert
+    record_result $BUILD_IDENTIFIER ${id}_scan_full $i $scan_full
+    record_result $BUILD_IDENTIFIER ${id}_scan_str $i $scan_str
+    record_result $BUILD_IDENTIFIER ${id}_scan_int32 $i $scan_int32
+    record_result $BUILD_IDENTIFIER ${id}_scan_int64 $i $scan_int64
   done
 
   # Parse timings and thread count
   for i in $(seq 1 $NUM_SAMPLES); do
-    local log=$LOGDIR/${DENSE_NODE_ITEST}$i.log
+    local id=$DENSE_NODE_ITEST
+    local log=$LOGDIR/${id}${i}.log
     num_threads=`grep threads_running $log | cut -d ":" -f 5 | tr -d ' '`
     num_blocks=`grep log_block_manager_blocks_under_management $log | cut -d ":" -f 5 | tr -d ' '`
     num_bytes=`grep log_block_manager_bytes_under_management $log | cut -d ":" -f 5 | tr -d ' '`
@@ -528,65 +621,68 @@ parse_and_record_all_results() {
     time_restarting_tserver=`grep "Time spent restarting tserver" $log | ./parse_real_out.sh`
     time_bootstrapping_tablets=`grep "Time spent bootstrapping tablets" $log | ./parse_real_out.sh`
     set -e
-    record_result $BUILD_IDENTIFIER ${DENSE_NODE_ITEST}_num_threads $i $num_threads
-    record_result $BUILD_IDENTIFIER ${DENSE_NODE_ITEST}_num_blocks $i $num_blocks
-    record_result $BUILD_IDENTIFIER ${DENSE_NODE_ITEST}_num_bytes $i $num_bytes
-    record_result $BUILD_IDENTIFIER ${DENSE_NODE_ITEST}_num_containers $i $num_containers
-    record_result $BUILD_IDENTIFIER ${DENSE_NODE_ITEST}_num_full_containers $i $num_full_containers
+    record_result $BUILD_IDENTIFIER ${id}_num_threads $i $num_threads
+    record_result $BUILD_IDENTIFIER ${id}_num_blocks $i $num_blocks
+    record_result $BUILD_IDENTIFIER ${id}_num_bytes $i $num_bytes
+    record_result $BUILD_IDENTIFIER ${id}_num_containers $i $num_containers
+    record_result $BUILD_IDENTIFIER ${id}_num_full_containers $i $num_full_containers
     if [ -n "$time_restarting_tserver" ]; then
-      record_result $BUILD_IDENTIFIER ${DENSE_NODE_ITEST}_time_restarting_tserver $i $time_restarting_tserver
+      record_result $BUILD_IDENTIFIER ${id}_time_restarting_tserver $i $time_restarting_tserver
     fi
     if [ -n "$time_bootstrapping_tablets" ]; then
-      record_result $BUILD_IDENTIFIER ${DENSE_NODE_ITEST}_time_bootstrapping_tablets $i $time_bootstrapping_tablets
+      record_result $BUILD_IDENTIFIER ${id}_time_bootstrapping_tablets $i $time_bootstrapping_tablets
     fi
 
   done
 
   # Parse out total call rate and record the results for BlockingQueue test.
-  for i in $(seq 1 $NUM_SAMPLES); do
-    local log=$LOGDIR/${BLOCKING_QUEUE_SYMMETRIC_TEST}$i.log
-    rate=$(grep -o 'total rate: .* calls/sec' $log | awk '{print $3}')
-    record_result $BUILD_IDENTIFIER ${BLOCKING_QUEUE_SYMMETRIC_TEST}_total_call_rate $i $rate
-
-    local log=$LOGDIR/${BLOCKING_QUEUE_NON_SYMMETRIC_TEST}$i.log
-    rate=$(grep -o 'total rate: .* calls/sec' $log | awk '{print $3}')
-    record_result $BUILD_IDENTIFIER ${BLOCKING_QUEUE_NON_SYMMETRIC_TEST}_total_call_rate $i $rate
+  for id in $BLOCKING_QUEUE_SYMMETRIC_TEST $BLOCKING_QUEUE_NON_SYMMETRIC_TEST; do
+    for i in $(seq 1 $NUM_SAMPLES); do
+      local log=$LOGDIR/${id}${i}.log
+      parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
+      local rate=$(grep -o 'total rate: .* calls/sec' $log | awk '{print $3}')
+      record_result $BUILD_IDENTIFIER ${id}_total_call_rate $i $rate
+    done
   done
 
   # Parse out request rate and record the results for ConcurrentGetTableSchemaTest test.
-  for i in $(seq 1 $NUM_SAMPLES); do
-    local log=$LOGDIR/${GET_TABLE_SCHEMA_RPC}$i.log
-    rate=$(grep -o 'GetTableSchema RPC: .* req/sec (authz disabled)' $log | awk '{print $3}')
-    record_result $BUILD_IDENTIFIER ${GET_TABLE_SCHEMA_RPC}_no_authz_req_rate $i $rate
-    rate=$(grep -o 'GetTableSchema RPC: .* req/sec (authz enabled)' $log | awk '{print $3}')
-    record_result $BUILD_IDENTIFIER ${GET_TABLE_SCHEMA_RPC}_authz_req_rate $i $rate
-
-    local log=$LOGDIR/${GET_TABLE_SCHEMA_DIRECT_CALL}$i.log
-    rate=$(grep -o 'GetTableSchema function: .* req/sec (authz disabled)' $log | awk '{print $3}')
-    record_result $BUILD_IDENTIFIER ${GET_TABLE_SCHEMA_DIRECT_CALL}_no_authz_req_rate $i $rate
-    rate=$(grep -o 'GetTableSchema function: .* req/sec (authz enabled)' $log | awk '{print $3}')
-    record_result $BUILD_IDENTIFIER ${GET_TABLE_SCHEMA_DIRECT_CALL}_authz_req_rate $i $rate
+  for id in $GET_TABLE_SCHEMA_RPC $GET_TABLE_SCHEMA_DIRECT_CALL; do
+    for i in $(seq 1 $NUM_SAMPLES); do
+      local log=$LOGDIR/${id}${i}.log
+      parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
+      rate=$(grep -o 'GetTableSchema .*: .* req/sec (authz disabled)' $log | awk '{print $3}')
+      record_result $BUILD_IDENTIFIER ${id}_no_authz_req_rate $i $rate
+      rate=$(grep -o 'GetTableSchema .*: .* req/sec (authz enabled)' $log | awk '{print $3}')
+      record_result $BUILD_IDENTIFIER ${id}_authz_req_rate $i $rate
+    done
   done
 
   # Parse out request rate and record the results for GetTableLocationsBenchmark{FunctionCall}.
   for capacity_mb in 0 32 ; do
     for i in $(seq 1 $NUM_SAMPLES); do
-      local log=$LOGDIR/${GET_TABLE_LOCATIONS_RPC}_${capacity_mb}_${i}.log
+      local id=${GET_TABLE_LOCATIONS_RPC}_${capacity_mb}
+      local log=$LOGDIR/${id}${i}.log
+      parse_and_record_perf_stats $BUILD_IDENTIFIER ${id} $i $log
       rate=$(grep -o 'GetTableLocations RPC: .* req/sec' $log | awk '{print $3}')
-      record_result $BUILD_IDENTIFIER ${GET_TABLE_LOCATIONS_RPC}_${capacity_mb}_req_rate $i $rate
+      record_result $BUILD_IDENTIFIER ${id}_req_rate $i $rate
 
-      local log=$LOGDIR/${GET_TABLE_LOCATIONS_DIRECT_CALL}_${capacity_mb}_${i}.log
+      local id=${GET_TABLE_LOCATIONS_DIRECT_CALL}_${capacity_mb}
+      local log=$LOGDIR/${id}${i}.log
+      parse_and_record_perf_stats $BUILD_IDENTIFIER ${id} $i $log
       rate=$(grep -o 'GetTableLocations function call: .* req/sec' $log | awk '{print $4}')
-      record_result $BUILD_IDENTIFIER ${GET_TABLE_LOCATIONS_DIRECT_CALL}_${capacity_mb}_req_rate $i $rate
+      record_result $BUILD_IDENTIFIER ${id}_req_rate $i $rate
     done
   done
 
   for i in $(seq 1 $NUM_SAMPLES); do
-    local log=$LOGDIR/${SAME_TABLET_CONCURRENT_WRITES}$i.log
+    local id=$SAME_TABLET_CONCURRENT_WRITES
+    local log=$LOGDIR/${id}${i}.log
+    parse_and_record_perf_stats $BUILD_IDENTIFIER $id $i $log
+
     rate=$(grep -o 'write RPC request rate: .* req/sec' $log | awk '{print $5}')
+    record_result $BUILD_IDENTIFIER ${id}_req_rate $i $rate
     overflows=$(grep -o 'total count of RPC queue overflows: .*' $log | awk '{print $7}')
-    record_result $BUILD_IDENTIFIER ${SAME_TABLET_CONCURRENT_WRITES}_req_rate $i $rate
-    record_result $BUILD_IDENTIFIER ${SAME_TABLET_CONCURRENT_WRITES}_overflows $i $overflows
+    record_result $BUILD_IDENTIFIER ${id}_overflows $i $overflows
   done
 
   popd
@@ -629,9 +725,26 @@ load_and_generate_plot() {
   if [ -z "$y_axis_label" ]; then
     local y_axis_label="runtime (sec)"
   fi
-  local fpath=${OUTDIR}/${plot_name}.tsv
-  load_stats "$test_name" > $fpath
-  write_img_plot $fpath $plot_name "$y_axis_label"
+  local plot_file=${OUTDIR}/${plot_name}.tsv
+  load_stats "$test_name" > $plot_file
+  write_img_plot $plot_file $plot_name "$y_axis_label"
+}
+
+load_and_generate_perf_stat_plots() {
+  local id="perf_stat_ins_per_cycle"
+  local plot_file=${OUTDIR}/${id}.tsv
+  load_stats "${id}_%" > $plot_file
+  write_img_plot $plot_file "perf-stat-instructions-per-CPU-cycle" "instructions per cycle"
+
+  local id="perf_stat_cpu_util"
+  local plot_file=${OUTDIR}/${id}.tsv
+  load_stats "${id}_%" > $plot_file
+  write_img_plot $plot_file "perf-stat-cpu-utilization" "CPUs utilized"
+
+  local id="perf_stat_branch_misses"
+  local plot_file=${OUTDIR}/${id}.tsv
+  load_stats "${id}_%" > $plot_file
+  write_img_plot $plot_file "perf-stat-branch-misses" "% of all branches"
 }
 
 load_stats_and_generate_plots() {
@@ -641,9 +754,9 @@ load_stats_and_generate_plots() {
 
   load_and_generate_plot "%MultiThreadedTabletTest%" mt-tablet-test-runtime
 
-  load_and_generate_plot ConcurrentBTreeScanInsert cb-tree-insert
-  load_and_generate_plot ConcurrentBTreeScanNotFrozen cb-ctree-not-frozen
-  load_and_generate_plot ConcurrentBTreeScanFrozen cb-ctree-frozen
+  load_and_generate_plot ${CBTREE_TEST}Insert cb-tree-insert
+  load_and_generate_plot ${CBTREE_TEST}NotFrozen cb-ctree-not-frozen
+  load_and_generate_plot ${CBTREE_TEST}Frozen cb-ctree-frozen
 
   load_and_generate_plot "${COMPACT_MERGE_BENCH}%" compact-merge-bench
 
@@ -682,8 +795,8 @@ load_stats_and_generate_plots() {
   load_and_generate_plot "${GET_TABLE_SCHEMA_DIRECT_CALL}_authz_req_rate" get-table-schema-dc-authz "function call rate (req/sec)"
   load_and_generate_plot "${GET_TABLE_SCHEMA_DIRECT_CALL}_no_authz_req_rate" get-table-schema-dc-no-authz "function call rate (req/sec)"
 
-  load_and_generate_plot "${GET_TABLE_LOCATIONS_RPC}%_req_rate" get-table-locations-rpc "RPC rate (req/sec)"
-  load_and_generate_plot "${GET_TABLE_LOCATIONS_DIRECT_CALL}%_req_rate" get-table-locations-dc "function call rate (req/sec)"
+  load_and_generate_plot "${GET_TABLE_LOCATIONS_RPC}%_req_rate" get-table-locations-rpc
+  load_and_generate_plot "${GET_TABLE_LOCATIONS_DIRECT_CALL}%_req_rate" get-table-locations-dc
 
   load_and_generate_plot "${SAME_TABLET_CONCURRENT_WRITES}_req_rate" same-tablet-concurrent-writes-rate "RPC rate (req/sec)"
   load_and_generate_plot "${SAME_TABLET_CONCURRENT_WRITES}_overflows" same-tablet-concurrent-writes-overflows "RPC queue overflows (total count)"
@@ -695,6 +808,9 @@ load_stats_and_generate_plots() {
     write_mttablet_img_plots $OUTDIR/test$i.tsv test$i || true
   done
 
+  # Generate performance counters plots for all tests run under 'perf stat'.
+  load_and_generate_perf_stat_plots
+
   if [ "${BENCHMARK_MODE}" = "${MODE_JENKINS}" ]; then
     ################################################################
     # Plot the separately-recorded TPCH and YCSB graphs as well
@@ -796,11 +912,15 @@ OUTDIR="$BASE_DIR/$OUT_DIR_NAME"
 
 # Ensure we are in KUDU_HOME
 if [ ! -f "$BASE_DIR/LICENSE.txt" ]; then
-  set +x
   echo "Error: must run from top of Kudu source tree"
   usage_and_die
 fi
 
+if ! check_tools; then
+  echo "Error: could not find necessary tools to run benchmarks"
+  exit 1
+fi
+
 # Set up environment.
 ulimit -m $[3000*1000]
 ulimit -c unlimited   # gather core dumps