Posted to commits@kudu.apache.org by gr...@apache.org on 2020/08/26 21:20:06 UTC

[kudu] branch master updated (fc9b614 -> 5e328d3)

This is an automated email from the ASF dual-hosted git repository.

granthenke pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git.


    from fc9b614  Add .clang-format file
     new 4b3e884  KUDU-2844 (3/3): avoid copying plain/dict strings to RowBlock Arena
     new bcb2b2e  [docs] fix wrong usage of command in administration.adoc
     new 1ffef3e  [docker] Generate all tags in a single build command
     new 141dfab  [build] Adjust JAVA_HOME candidates
     new 5e328d3  [docker] Support building and pushing multi-arch images

The 5 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 build-support/java-home-candidates.txt             |  34 +++--
 docker/docker-build.py                             | 145 +++++++++++----------
 docs/administration.adoc                           |   2 +-
 src/kudu/cfile/binary_dict_block.cc                |  23 ++--
 src/kudu/cfile/binary_plain_block.cc               |  28 ++--
 src/kudu/cfile/binary_plain_block.h                |   6 +-
 src/kudu/cfile/block_handle.h                      |   4 +-
 src/kudu/common/rowblock_memory.h                  |  36 ++++-
 .../full_stack-insert-scan-test.cc                 |  28 +++-
 9 files changed, 201 insertions(+), 105 deletions(-)


[kudu] 03/05: [docker] Generate all tags in a single build command

Posted by gr...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

granthenke pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 1ffef3e5faca254c43799773caa7abe6d47d9645
Author: Grant Henke <gr...@apache.org>
AuthorDate: Tue Aug 4 08:58:14 2020 -0500

    [docker] Generate all tags in a single build command
    
    This patch adjusts the docker-build.py script to generate all the
    tags upfront and use them in the `docker build` command instead
    of adding them one-by-one afterwards. This is a preliminary step
    that simplifies adding ARM support to the build script.
    
    Change-Id: Iccb0b2ec02940457487852f048d80a42a39e5fb4
    Reviewed-on: http://gerrit.cloudera.org:8080/16298
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
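
As a rough sketch of the flow this patch moves to (simplified stand-ins rather
than the script's real helpers, and hypothetical tag values), all tags for a
target are computed first and then attached to a single `docker build`
invocation, removing the per-tag `docker tag` calls the old flow issued:

  # Minimal sketch only; build_with_tags and the tag strings are illustrative,
  # not the actual docker-build.py helpers.
  import subprocess

  def build_with_tags(target, tags, context_dir, dry_run=True):
      # Assemble one `docker build` command that carries every tag, instead of
      # running `docker tag` once per extra tag after the build finishes.
      cmd = 'docker build'
      cmd += ' --target %s' % target
      cmd += ''.join(' --tag %s' % tag for tag in tags)
      cmd += ' %s' % context_dir
      print(cmd)
      if not dry_run:
          subprocess.check_output(cmd, shell=True)

  # Hypothetical tag list for one target; in the script it comes from build_tags().
  build_with_tags('kudu',
                  ['apache/kudu:kudu-1.13.0-ubuntu',
                   'apache/kudu:kudu-1.13.0',
                   'apache/kudu:kudu-latest-ubuntu',
                   'apache/kudu:kudu-latest'],
                  '.')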
---
 docker/docker-build.py | 94 ++++++++++++++++++++++----------------------------
 1 file changed, 42 insertions(+), 52 deletions(-)

diff --git a/docker/docker-build.py b/docker/docker-build.py
index 045c116..5da3847 100755
--- a/docker/docker-build.py
+++ b/docker/docker-build.py
@@ -201,6 +201,43 @@ def build_args(base, version, vcs_ref, cache_from):
     args += ' --cache-from %s' % cache_from
   return args
 
+def build_tags(opts, base, version, vcs_ref, target):
+  os_tag = get_os_tag(base)
+  version_tag = get_version_tag(version, vcs_ref)
+
+  tags = []
+  full_tag = get_full_tag(opts.repository, target, version_tag, os_tag)
+  # Don't tag hash versions unless tag_hash is true.
+  if is_release_version(version) or opts.tag_hash:
+    tags.append(full_tag)
+  # If this is the default OS, also tag it without the OS-specific tag.
+  if base == DEFAULT_OS:
+    default_os_tag = get_full_tag(opts.repository, target, version_tag, '')
+    # Don't tag hash versions unless tag_hash is true.
+    if is_release_version(version) or opts.tag_hash:
+      tags.append(default_os_tag)
+
+  # Add the minor version tag if this is a release version.
+  if is_release_version(version):
+    minor_version = get_minor_version(version)
+    minor_tag = get_full_tag(opts.repository, target, minor_version, os_tag)
+    tags.append(minor_tag)
+    # Add the default OS tag.
+    if base == DEFAULT_OS:
+      minor_default_os_tag = get_full_tag(opts.repository, target, minor_version, '')
+      tags.append(minor_default_os_tag)
+
+  # Add the latest version tags.
+  if not opts.skip_latest:
+    latest_tag = get_full_tag(opts.repository, target, 'latest', os_tag)
+    tags.append(latest_tag)
+    # Add the default OS tag.
+    if base == DEFAULT_OS:
+      latest_default_os_tag = get_full_tag(opts.repository, target, 'latest', '')
+      tags.append(latest_default_os_tag)
+
+  return tags
+
 def verify_docker_resources(opts):
   cpus = int(subprocess.check_output(['docker', 'system', 'info',
                                       '--format', '{{.NCPU}}'])
@@ -238,7 +275,6 @@ def main():
   version = read_version()
   vcs_ref = read_vcs_ref()
   print('Version: %s (%s)' % (version, vcs_ref))
-  version_tag = get_version_tag(version, vcs_ref)
 
   bases = unique_bases(opts)
   targets = unique_targets(opts)
@@ -248,65 +284,19 @@ def main():
   tags = [] # Keep track of the tags for publishing at the end.
   for base in bases:
     print('Building targets for %s...' % base)
-    os_tag = get_os_tag(base)
 
     for target in targets:
+      target_tags = build_tags(opts, base, version, vcs_ref, target)
+      # Build the target.
       print('Building %s target...' % target)
-      full_tag = get_full_tag(opts.repository, target, version_tag, os_tag)
-      print(full_tag)
-
-      # Build the target and tag with the full tag.
       docker_build_cmd = 'docker build'
       docker_build_cmd += build_args(base, version, vcs_ref, opts.cache_from)
       docker_build_cmd += ' --file %s' % os.path.join(ROOT, 'docker', 'Dockerfile')
-      docker_build_cmd += ' --target %s --tag %s' % (target, full_tag)
+      docker_build_cmd += ' --target %s' % target
+      docker_build_cmd += ''.join(' --tag %s' % tag for tag in target_tags)
       docker_build_cmd += ' %s' % ROOT
       run_command(docker_build_cmd, opts)
-      tags.append(full_tag)
-
-      # If this is the default OS, also tag it without the OS-specific tag.
-      if base == DEFAULT_OS:
-        default_os_tag = get_full_tag(opts.repository, target, version_tag, '')
-        default_os_cmd = 'docker tag %s %s' % (full_tag, default_os_tag)
-        run_command(default_os_cmd, opts)
-        tags.append(default_os_tag)
-
-      # Add the minor version tag if this is a release version.
-      if is_release_version(version):
-        minor_version = get_minor_version(version)
-        minor_tag = get_full_tag(opts.repository, target, minor_version, os_tag)
-        minor_cmd = 'docker tag %s %s' % (full_tag, minor_tag)
-        run_command(minor_cmd, opts)
-        tags.append(minor_tag)
-
-        # Add the default OS tag.
-        if base == DEFAULT_OS:
-          minor_default_os_tag = get_full_tag(opts.repository, target, minor_version, '')
-          minor_default_os_cmd = 'docker tag %s %s' % (full_tag, minor_default_os_tag)
-          run_command(minor_default_os_cmd, opts)
-          tags.append(minor_default_os_tag)
-
-      # Add the latest version tags.
-      if not opts.skip_latest:
-        latest_tag = get_full_tag(opts.repository, target, 'latest', os_tag)
-        latest_cmd = 'docker tag %s %s' % (full_tag, latest_tag)
-        run_command(latest_cmd, opts)
-        tags.append(latest_tag)
-
-        # Add the default OS tag.
-        if base == DEFAULT_OS:
-          latest_default_os_tag = get_full_tag(opts.repository, target, 'latest', '')
-          latest_default_os_cmd = 'docker tag %s %s' % (full_tag, latest_default_os_tag)
-          run_command(latest_default_os_cmd, opts)
-          tags.append(latest_default_os_tag)
-
-      # Remove the hash tags if the aren't wanted.
-      if not opts.tag_hash and not is_release_version(version):
-        print('Removing hash based Docker tags...')
-        hash_tag_pattern = '%s:*%s*' % (opts.repository, vcs_ref)
-        rmi_command = 'docker rmi $(docker images -q "%s" --format "{{.Repository}}:{{.Tag}}")'\
-                      % (hash_tag_pattern, )
-        run_command(rmi_command, opts)
+      tags.extend(target_tags)
 
   if opts.publish:
     print('Publishing Docker images...')


[kudu] 04/05: [build] Adjust JAVA_HOME candidates

Posted by gr...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

granthenke pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 141dfab4f65f9f4083ee5d62d4033a1ade9f232b
Author: Grant Henke <gr...@apache.org>
AuthorDate: Mon Aug 24 15:07:03 2020 -0500

    [build] Adjust JAVA_HOME candidates
    
    This patch removes the Java 7 candidates, since Java 7 is no
    longer supported, and adds Java 11 candidates along with
    a few more generic candidates.

    The updated version of bigtop-detect-javahome, the original
    inspiration for this file, was used as a reference when updating it.
    https://github.com/apache/bigtop/blob/master/bigtop-packages/src/common/bigtop-utils/bigtop-detect-javahome
    
    Change-Id: Ie51cde4153be23e8f33fcc061531c2c022791b16
    Reviewed-on: http://gerrit.cloudera.org:8080/16354
    Reviewed-by: Grant Henke <gr...@apache.org>
    Tested-by: Grant Henke <gr...@apache.org>
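
For background, a candidates file like this is normally consumed by walking it
top to bottom and taking the first path that exists on the machine as
JAVA_HOME; the priority-order comment in the diff below relies on that. The
sketch below is an assumption about that consumer, not the actual
build-support script:

  # Illustrative only: pick the first existing candidate from the list.
  import os

  def detect_java_home(candidates_file='build-support/java-home-candidates.txt'):
      # Skip blank lines and whole-line comments (the file forbids trailing
      # comments), then return the first candidate present on this machine.
      with open(candidates_file) as f:
          for line in f:
              candidate = line.strip()
              if not candidate or candidate.startswith('#'):
                  continue
              if os.path.exists(candidate):
                  return candidate
      return None

  # Example usage: java_home = detect_java_home()  # None if nothing matches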
---
 build-support/java-home-candidates.txt | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/build-support/java-home-candidates.txt b/build-support/java-home-candidates.txt
index 7a08118..b214c4e 100644
--- a/build-support/java-home-candidates.txt
+++ b/build-support/java-home-candidates.txt
@@ -16,8 +16,23 @@
 # under the License.
 
 # This file list the locations to look for java candidates in priority order.
+# The priority order is newer to older versions and specific to generic versions.
 # Note: Trailing comments are not allowed.
 
+# Oracle JDK 11 Candidates
+/usr/java/jdk-11
+/usr/lib/jvm/jdk-11
+/usr/lib/jvm/java-11-oracle
+
+# OpenJDK 11 Candidates
+/usr/lib/jvm/java-11-openjdk-arm64
+/usr/lib/jvm/java-11-openjdk-amd64
+/usr/lib/jvm/java-11-openjdk-ppc64el
+/usr/lib/jvm/java-11
+/usr/java/jdk-11
+/usr/lib/jvm/jdk-11
+/usr/lib64/jvm/jdk-11
+
 # Oracle JDK 8 Candidates
 /usr/java/jdk1.8
 /usr/java/jre1.8
@@ -27,28 +42,19 @@
 /usr/lib/jdk8-latest
 
 # OpenJDK 8 Candidates
+/usr/lib/jvm/java-8-openjdk-arm64
 /usr/lib/jvm/java-1.8.0-openjdk-amd64
 /usr/lib/jvm/java-1.8.0-openjdk-ppc64el
 /usr/lib/jvm/java-1.8.0-openjdk
 /usr/lib64/jvm/java-1.8.0-openjdk-1.8.0
 
-# Oracle JDK 7 Candidates
-/usr/java/jdk1.7
-/usr/java/jre1.7
-/usr/lib/jvm/j2sdk1.7-oracle
-/usr/lib/jvm/j2sdk1.7-oracle/jre
-/usr/lib/jvm/java-7-oracle
-/usr/lib/jdk7-latest
-
-# OpenJDK 7 Candidates
-/usr/lib/jvm/java-1.7.0-openjdk
-/usr/lib/jvm/java-7-openjdk-amd64
-/usr/lib/jvm/java-7-openjdk
-
 # Misc. Candidates
+/Library/Java/Home
+/usr/bin/java
 /usr/java/default
 /usr/lib/jvm/java
 /usr/lib/jvm/jre
 /usr/lib/jvm/default-java
 /usr/lib/jvm/java-openjdk
-/usr/lib/jvm/jre-openjdk
\ No newline at end of file
+/usr/lib/jvm/jre-openjdk
+/usr/lib/jvm/java-oracle
\ No newline at end of file


[kudu] 02/05: [docs] fix wrong usage of command in administration.adoc

Posted by gr...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

granthenke pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit bcb2b2e390eb8fb17f347087bb847af4158e64d6
Author: ningw <wa...@sensorsdata.cn>
AuthorDate: Wed Aug 26 13:58:01 2020 +0800

    [docs] fix wrong usage of command in administration.adoc
    
    The command should be `kudu cluster ksck` instead of `kudu ksck`.
    
    Change-Id: Id2322ea53f89a91176419d7decb5125389d5c4cd
    Reviewed-on: http://gerrit.cloudera.org:8080/16368
    Tested-by: Kudu Jenkins
    Reviewed-by: Grant Henke <gr...@apache.org>
---
 docs/administration.adoc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/administration.adoc b/docs/administration.adoc
index 0eb1cfd..72fca5b 100644
--- a/docs/administration.adoc
+++ b/docs/administration.adoc
@@ -1774,7 +1774,7 @@ clusters_info:
   Example::
 +
 ----
-$ sudo -u kudu kudu ksck @cluster_name1
+$ sudo -u kudu kudu cluster ksck @cluster_name1
 ----
 +
 


[kudu] 05/05: [docker] Support building and pushing multi-arch images

Posted by gr...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

granthenke pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 5e328d38519d656b881219db8087b8eb8af1d4d8
Author: Grant Henke <gr...@apache.org>
AuthorDate: Wed Aug 5 12:18:54 2020 -0500

    [docker] Support building and pushing multi-arch images
    
    This change introduces support for building and pushing ARM
    and multi-arch Docker images.
    
    Note that building an image for an architecture other than your
    build machine's is painfully slow due to emulation. Improvements
    in emulation performance will likely come in the future.
    
    Change-Id: I7f64e4b9591927f160cdf886507cb740578e20b5
    Reviewed-on: http://gerrit.cloudera.org:8080/16361
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
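
Taken together with the previous patch, the command the script assembles now
looks roughly like the sketch below (a simplified helper with placeholder
Dockerfile path, target, and tags; the real assembly lives in main() in the
diff that follows):

  def buildx_command(action, platforms, target, tags, dockerfile, context_dir):
      # action is 'load' (export the image into the local docker daemon) or
      # 'push' (publish to the registry). Multi-arch images currently have to
      # be pushed; they cannot be loaded locally.
      cmd = 'docker buildx build --%s' % action
      if platforms:
          # De-duplicate while preserving order, then pass a comma-separated list.
          cmd += ' --platform %s' % ','.join(dict.fromkeys(platforms))
      cmd += ' --file %s' % dockerfile
      cmd += ' --target %s' % target
      cmd += ''.join(' --tag %s' % tag for tag in tags)
      cmd += ' %s' % context_dir
      return cmd

  print(buildx_command('push', ['linux/amd64', 'linux/arm64'], 'kudu',
                       ['apache/kudu:kudu-latest'], 'docker/Dockerfile', '.'))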
---
 docker/docker-build.py | 53 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/docker/docker-build.py b/docker/docker-build.py
index 5da3847..4b9fb00 100755
--- a/docker/docker-build.py
+++ b/docker/docker-build.py
@@ -67,6 +67,7 @@ ROOT = os.path.abspath(os.path.join(os.path.dirname(ME), ".."))
 DEFAULT_OS = 'ubuntu:xenial'
 DEFAULT_TARGETS = ['kudu','kudu-python']
 DEFAULT_REPOSITORY = 'apache/kudu'
+DEFAULT_ACTION = 'load'
 
 REQUIRED_CPUS = 4
 REQUIRED_MEMORY_GIB = 4
@@ -75,6 +76,10 @@ def parse_args():
   """ Parses the command-line arguments """
   parser = argparse.ArgumentParser(description='Build the Apache Kudu Docker images',
                                    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+  parser.add_argument('--platforms', nargs='+', choices=[
+                      'linux/amd64', 'linux/arm64', ],
+                      help='The platforms to build with. If unspecified, the platform of your '
+                           'build machine will be used.')
   parser.add_argument('--bases', nargs='+', default=DEFAULT_OS, choices=[
                       'centos:6', 'centos:7', 'centos:8',
                       'debian:jessie', 'debian:stretch',
@@ -90,8 +95,11 @@ def parse_args():
   parser.add_argument('--repository', default=DEFAULT_REPOSITORY,
                       help='The repository string to use when tagging the image')
 
-  parser.add_argument('--publish', action='store_true',
-                      help='If passed, the tagged images will be pushed to the Docker repository')
+  parser.add_argument('--action', default=DEFAULT_ACTION, choices=['load', 'push'],
+                      help='The action to take with the built images. "load" will export the '
+                           'images to docker so they can be used locally. "push" will push the '
+                           'images to the registry.')
+
   parser.add_argument('--skip-latest', action='store_true',
                       help='If passed, skips adding a tag using `-latest` along with the '
                       'versioned tag')
@@ -119,6 +127,11 @@ def run_command(cmd, opts):
   if not opts.dry_run:
     subprocess.check_output(cmd, shell=True)
 
+def use_buildx_builder():
+  ret = subprocess.call(['docker', 'buildx', 'use', 'kudu-builder'])
+  if ret != 0:
+    subprocess.check_output(['docker', 'buildx', 'create', '--name', 'kudu-builder', '--use'])
+
 def read_version():
   with open(os.path.join(ROOT, 'version.txt'), 'r') as vfile:
     return vfile.read().strip()
@@ -174,6 +187,12 @@ def get_full_tag(repository, target, version_tag, os_tag):
   full_tag = full_tag[:-1]
   return "%s:%s" % (repository, full_tag)
 
+def platforms_csv(opts):
+  if type(opts.platforms) is list:
+    return ",".join(list(dict.fromkeys(opts.platforms)))
+  else:
+    return opts.platforms
+
 def unique_bases(opts):
   if type(opts.bases) is list:
     return list(dict.fromkeys(opts.bases))
@@ -271,6 +290,10 @@ def main():
   # performance, storage management, feature functionality, and security.
   # https://docs.docker.com/develop/develop-images/build_enhancements/
   os.environ['DOCKER_BUILDKIT'] = '1'
+  # Enable the experimental CLI so we can use `docker buildx` commands.
+  os.environ['DOCKER_CLI_EXPERIMENTAL'] = 'enabled'
+  # Create/Use a buildx builder to support pushing multi-platform builds.
+  use_buildx_builder()
 
   version = read_version()
   vcs_ref = read_vcs_ref()
@@ -281,7 +304,11 @@ def main():
   print('Bases: %s' % bases)
   print('Targets: %s' % targets)
 
-  tags = [] # Keep track of the tags for publishing at the end.
+  if opts.action == 'push' and not is_release_version(version):
+    print('ERROR: Only release versions can be pushed. Found version %s (%s)'
+          % (version, vcs_ref))
+    sys.exit(1)
+
   for base in bases:
     print('Building targets for %s...' % base)
 
@@ -289,24 +316,20 @@ def main():
       target_tags = build_tags(opts, base, version, vcs_ref, target)
       # Build the target.
       print('Building %s target...' % target)
-      docker_build_cmd = 'docker build'
+      # Note: It isn't currently possible to load multi-arch images into docker,
+      # they can only be pushed to docker hub. This isn't expected to be a long
+      # lived limitation.
+      # https://github.com/docker/buildx/issues/59
+      # https://github.com/moby/moby/pull/38738
+      docker_build_cmd = 'docker buildx build --%s' % opts.action
+      if opts.platforms:
+        docker_build_cmd += ' --platform %s' % platforms_csv(opts)
       docker_build_cmd += build_args(base, version, vcs_ref, opts.cache_from)
       docker_build_cmd += ' --file %s' % os.path.join(ROOT, 'docker', 'Dockerfile')
       docker_build_cmd += ' --target %s' % target
       docker_build_cmd += ''.join(' --tag %s' % tag for tag in target_tags)
       docker_build_cmd += ' %s' % ROOT
       run_command(docker_build_cmd, opts)
-      tags.extend(target_tags)
-
-  if opts.publish:
-    print('Publishing Docker images...')
-    if not is_release_version(version):
-      print('ERROR: Only release versions can be published. Found version %s (%s)'
-            % (version, vcs_ref))
-      sys.exit(1)
-    for tag in tags:
-      push_cmd = "docker push %s" % tag
-      run_command(push_cmd, opts)
 
   end_time = datetime.datetime.now()
   runtime = end_time - start_time


[kudu] 01/05: KUDU-2844 (3/3): avoid copying plain/dict strings to RowBlock Arena

Posted by gr...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

granthenke pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 4b3e884c09f0b4e8871273a29590e8c56f3c5df7
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Thu Apr 23 23:52:57 2020 -0700

    KUDU-2844 (3/3): avoid copying plain/dict strings to RowBlock Arena
    
    This changes Dictionary and Plain binary blocks to no longer copy string
    values into the destination RowBlock's Arena. Instead, the dictionary
    block (or plain block) is attached to the RowBlockMemory's list of
    reference-counted block handles, and the cell directly refers to the
    underlying block handle.
    
    I modified full_stack-insert-scan-test to have it do some scans with
    fault tolerance and no caching. If I comment out the code path that
    reference-counts the blocks during a scan, the newly added test fails.
    
    I performance-tested this by loading a lineitem table using
    tpch_real_world:
    
    $ tpch_real_world -tpch-path-to-dbgen-dir /data/2/todd/dbgen/ \
      -tpch_num_inserters=8 -tpch_scaling_factor=100 \
      --tpch_mini_cluster_base_dir /data/1/todd/tpch-kudu-data \
    
    For the numbers reported below, I accidentally cancelled the load after
    222M rows were present, but I used the same dataset for both "before"
    and "after" so the relative comparison is still valid.
    
    I started a tserver and master with the same data directories, with
    the tserver running inside perf stat (or perf record to look at
    profile):
    
    $ kudu-master \
      -fs-wal-dir /data/1/todd/tpch-kudu-data/master-0/wal/ \
      -fs-data-dirs /data/1/todd/tpch-kudu-data/master-0/data/
    
    $ perf stat kudu-tserver \
      -fs-wal-dir /data/1/todd/tpch-kudu-data/ts-0/wal/ \
      -fs-data-dirs /data/1/todd/tpch-kudu-data/ts-0/data/
    
    I waited until the data had been fully flushed from MRS and compacted
    before running the read workloads. To test the reads I ran the following
    10 times:
    
    $ kudu perf table_scan localhost tpch_real_world  \
      --columns l_shipdate,l_shipmode,l_comment --num_threads=16
    
    The results of the first test were a bit noisy due to NUMA placement
    issues -- some runs were 30-40% faster than other runs, even on the same
    build, which made it hard to compare results, even though it was clear
    that the optimized version used fewer cycles on average. So, I ran both
    the tserver and the client using 'numactl -m 0 -N 0' to force everything
    to a single NUMA node. This made results much more consistent.
    
    Before:
    
             255870.36 msec task-clock                #    3.058 CPUs utilized
                244847      context-switches          #    0.957 K/sec
                  3322      cpu-migrations            #    0.013 K/sec
                245814      page-faults               #    0.961 K/sec
         1066864136000      cycles                    #    4.170 GHz                      (83.46%)
           84410991344      stalled-cycles-frontend   #    7.91% frontend cycles idle     (83.37%)
          340913242391      stalled-cycles-backend    #   31.95% backend cycles idle      (83.25%)
         1131564485394      instructions              #    1.06  insn per cycle
                                                      #    0.30  stalled cycles per insn  (83.34%)
          187879069908      branches                  #  734.274 M/sec                    (83.32%)
            8550168935      branch-misses             #    4.55% of all branches          (83.26%)
    
         191.262870000 seconds user
          64.765755000 seconds sys
    
    After:
    
             214131.49 msec task-clock                #    2.750 CPUs utilized
                245357      context-switches          #    0.001 M/sec
                  2734      cpu-migrations            #    0.013 K/sec
                248108      page-faults               #    0.001 M/sec
          893270854012      cycles                    #    4.172 GHz                      (83.45%)
           83805641687      stalled-cycles-frontend   #    9.38% frontend cycles idle     (83.25%)
          345166097238      stalled-cycles-backend    #   38.64% backend cycles idle      (83.29%)
          913435059189      instructions              #    1.02  insn per cycle
                                                      #    0.38  stalled cycles per insn  (83.36%)
          142198832288      branches                  #  664.072 M/sec                    (83.36%)
            4819907752      branch-misses             #    3.39% of all branches          (83.29%)
    
          77.876854360 seconds time elapsed
    
         146.195821000 seconds user
          68.113598000 seconds sys
    
    To summarize, the change gives about a 1.30x reduction in CPU cycles on
    the tserver, and the perf scan tool showed a 15-20% reduction in
    wall-clock time. Testing a scan of just a dictionary-encoded column
    shows even better results.
    
    Looking at 'perf diff -c ratio -o 0 --sort=sym' between two profiles collected
    by perf-record, we can see that the BinaryDictBlockDecoder code path is much
    cheaper, most of the memcpy calls are removed, and slightly more CPU is spent
    serializing the result (probably due to reduced locality of reference when
    copying the string data to the wire).
    
        Orig % CPU     Ratio      Symbol
    -------------------------------------------------------------------------------------
        26.16%        0.437272  [.] kudu::cfile::BinaryDictBlockDecoder::CopyNextDecodeStrings(unsigned long*, kudu::ColumnDataView*)
        19.63%        1.135304  [.] kudu::SerializeRowBlock(kudu::RowBlock const&, kudu::Schema const*, kudu::faststring*, kudu::faststring*, bool)
        11.49%        1.002845  [k] copy_user_enhanced_fast_string
        10.29%        0.068955  [.] __memcpy_ssse3_back
    
    Change-Id: I93fa1f9fd401814a42dc5a1f3fd2ffb1286ac441
    Reviewed-on: http://gerrit.cloudera.org:8080/15802
    Tested-by: Kudu Jenkins
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
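
The lifetime idea described at the top of this commit message -- leave the
string bytes in the decoder's block and keep that block alive until the scan
batch's memory is reset -- can be sketched in miniature. Below is a rough
Python analogue of the RowBlockMemory retain/release bookkeeping, with made-up
class names; the actual C++ implementation is in the diff that follows.

  class Block:
      """Stand-in for a reference-counted decoder block (e.g. a dictionary block)."""
      def __init__(self, data):
          self.data = data
          self.refs = 1

      def add_ref(self):
          self.refs += 1

      def release(self):
          self.refs -= 1
          if self.refs == 0:
              self.data = None  # the block's memory could now be reclaimed

  class RowBatchMemory:
      """Rough analogue of RowBlockMemory: retained blocks are released on reset."""
      def __init__(self):
          self._to_release = []

      def retain_reference(self, block):
          block.add_ref()  # keep the source block alive for this batch
          self._to_release.append(block.release)

      def reset(self):
          # Run the deferred releases; in the C++ code, cells pointing into
          # released blocks must not be used after this.
          for release in self._to_release:
              release()
          self._to_release = []

  # Cells refer into the block's data instead of being copied into an arena.
  mem = RowBatchMemory()
  dict_block = Block(b'apple\x00banana\x00cherry')
  cell = memoryview(dict_block.data)[0:5]  # no copy, points at the block
  mem.retain_reference(dict_block)
  dict_block.release()   # the decoder drops its own reference
  assert bytes(cell) == b'apple'
  mem.reset()            # the retained reference is dropped; refcount hits zero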
---
 src/kudu/cfile/binary_dict_block.cc                | 23 +++++++++-----
 src/kudu/cfile/binary_plain_block.cc               | 28 +++++++++++------
 src/kudu/cfile/binary_plain_block.h                |  6 +++-
 src/kudu/cfile/block_handle.h                      |  4 +--
 src/kudu/common/rowblock_memory.h                  | 36 +++++++++++++++++++++-
 .../full_stack-insert-scan-test.cc                 | 28 +++++++++++++++--
 6 files changed, 101 insertions(+), 24 deletions(-)

diff --git a/src/kudu/cfile/binary_dict_block.cc b/src/kudu/cfile/binary_dict_block.cc
index 4000e61..194915a 100644
--- a/src/kudu/cfile/binary_dict_block.cc
+++ b/src/kudu/cfile/binary_dict_block.cc
@@ -17,6 +17,7 @@
 
 #include "kudu/cfile/binary_dict_block.h"
 
+#include <functional>
 #include <limits>
 #include <ostream>
 #include <utility>
@@ -35,6 +36,7 @@
 #include "kudu/common/columnblock.h"
 #include "kudu/common/common.pb.h"
 #include "kudu/common/rowblock.h"
+#include "kudu/common/rowblock_memory.h"
 #include "kudu/common/types.h"
 #include "kudu/gutil/casts.h"
 #include "kudu/gutil/map-util.h"
@@ -293,12 +295,13 @@ Status BinaryDictBlockDecoder::CopyNextAndEval(size_t* n,
     return CopyNextDecodeStrings(n, dst);
   }
 
+  bool retain_dict = false;
+
   // Load the rows' codeword values into a buffer for scanning.
   BShufBlockDecoder<UINT32>* d_bptr = down_cast<BShufBlockDecoder<UINT32>*>(data_decoder_.get());
   codeword_buf_.resize(*n * sizeof(uint32_t));
   d_bptr->CopyNextValuesToArray(n, codeword_buf_.data());
   Slice* out = reinterpret_cast<Slice*>(dst->data());
-  Arena* out_arena = dst->arena();
   for (size_t i = 0; i < *n; i++, out++) {
     // Check with the SelectionVectorView to see whether the data has already
     // been cleared, in which case we can skip evaluation.
@@ -307,13 +310,18 @@ Status BinaryDictBlockDecoder::CopyNextAndEval(size_t* n,
     }
     uint32_t codeword = *reinterpret_cast<uint32_t*>(&codeword_buf_[i*sizeof(uint32_t)]);
     if (BitmapTest(codewords_matching_pred->bitmap(), codeword)) {
-      // Row is included in predicate, copy data to block.
-      CHECK(out_arena->RelocateSlice(dict_decoder_->string_at_index(codeword), out));
+      // Row is included in predicate: point the cell in the block
+      // to the entry in the dictionary.
+      *out = dict_decoder_->string_at_index(codeword);
+      retain_dict = true;
     } else {
       // Mark that the row will not be returned.
       sel->ClearBit(i);
     }
   }
+  if (retain_dict) {
+    dst->memory()->RetainReference(dict_decoder_->block_handle());
+  }
   return Status::OK();
 }
 
@@ -323,22 +331,21 @@ Status BinaryDictBlockDecoder::CopyNextDecodeStrings(size_t* n, ColumnDataView*
   DCHECK_LE(*n, dst->nrows());
   DCHECK_EQ(dst->stride(), sizeof(Slice));
 
-  Arena* out_arena = dst->arena();
   Slice* out = reinterpret_cast<Slice*>(dst->data());
 
   codeword_buf_.resize((*n)*sizeof(uint32_t));
 
   // Copy the codewords into a temporary buffer first.
-  // And then Copy the strings corresponding to the codewords to the destination buffer.
   BShufBlockDecoder<UINT32>* d_bptr = down_cast<BShufBlockDecoder<UINT32>*>(data_decoder_.get());
   RETURN_NOT_OK(d_bptr->CopyNextValuesToArray(n, codeword_buf_.data()));
 
+  // Now point the cells in the destination block to the string data in the dictionary
+  // block.
   for (int i = 0; i < *n; i++) {
     uint32_t codeword = *reinterpret_cast<uint32_t*>(&codeword_buf_[i*sizeof(uint32_t)]);
-    Slice elem = dict_decoder_->string_at_index(codeword);
-    CHECK(out_arena->RelocateSlice(elem, out));
-    out++;
+    *out++ = dict_decoder_->string_at_index(codeword);
   }
+  dst->memory()->RetainReference(dict_decoder_->block_handle());
   return Status::OK();
 }
 
diff --git a/src/kudu/cfile/binary_plain_block.cc b/src/kudu/cfile/binary_plain_block.cc
index 9ab00ed..9782d24 100644
--- a/src/kudu/cfile/binary_plain_block.cc
+++ b/src/kudu/cfile/binary_plain_block.cc
@@ -19,6 +19,7 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <functional>
 #include <ostream>
 #include <vector>
 
@@ -31,6 +32,7 @@
 #include "kudu/common/columnblock.h"
 #include "kudu/common/common.pb.h"
 #include "kudu/common/rowblock.h"
+#include "kudu/common/rowblock_memory.h"
 #include "kudu/common/schema.h"
 #include "kudu/common/types.h"
 #include "kudu/gutil/stringprintf.h"
@@ -39,7 +41,6 @@
 #include "kudu/util/coding-inl.h"
 #include "kudu/util/group_varint-inl.h"
 #include "kudu/util/hexdump.h"
-#include "kudu/util/memory/arena.h"
 
 using std::vector;
 
@@ -163,8 +164,8 @@ Status BinaryPlainBlockBuilder::GetLastKey(void *key_void) const {
 ////////////////////////////////////////////////////////////
 
 BinaryPlainBlockDecoder::BinaryPlainBlockDecoder(scoped_refptr<BlockHandle> block)
-    : block_handle_(std::move(block)),
-      data_(block_handle_->data()),
+    : block_(std::move(block)),
+      data_(block_->data()),
       parsed_(false),
       num_elems_(0),
       ordinal_pos_base_(0),
@@ -301,7 +302,6 @@ Status BinaryPlainBlockDecoder::HandleBatch(size_t* n, ColumnDataView* dst, Cell
   CHECK_EQ(dst->type_info()->physical_type(), BINARY);
   DCHECK_LE(*n, dst->nrows());
   DCHECK_EQ(dst->stride(), sizeof(Slice));
-  Arena *out_arena = dst->arena();
   if (PREDICT_FALSE(*n == 0 || cur_idx_ >= num_elems_)) {
     *n = 0;
     return Status::OK();
@@ -311,15 +311,16 @@ Status BinaryPlainBlockDecoder::HandleBatch(size_t* n, ColumnDataView* dst, Cell
   Slice *out = reinterpret_cast<Slice*>(dst->data());
   for (size_t i = 0; i < max_fetch; i++, out++, cur_idx_++) {
     Slice elem(string_at_index(cur_idx_));
-    c(i, elem, out, out_arena);
+    c(i, elem, out);
   }
   *n = max_fetch;
   return Status::OK();
 }
 
 Status BinaryPlainBlockDecoder::CopyNextValues(size_t* n, ColumnDataView* dst) {
-  return HandleBatch(n, dst, [&](size_t i, Slice elem, Slice* out, Arena* out_arena) {
-    CHECK(out_arena->RelocateSlice(elem, out));
+  dst->memory()->RetainReference(block_);
+  return HandleBatch(n, dst, [&](size_t /*i*/, Slice elem, Slice* out) {
+                               *out = elem;
   });
 }
 
@@ -327,16 +328,23 @@ Status BinaryPlainBlockDecoder::CopyNextAndEval(size_t* n,
                                                 ColumnMaterializationContext* ctx,
                                                 SelectionVectorView* sel,
                                                 ColumnDataView* dst) {
+  bool retain_block = false;
   ctx->SetDecoderEvalSupported();
-  return HandleBatch(n, dst, [&](size_t i, Slice elem, Slice* out, Arena* out_arena) {
+  Status s = HandleBatch(n, dst, [&](size_t i, Slice elem, Slice* out) {
     if (!sel->TestBit(i)) {
       return;
-    } else if (ctx->pred()->EvaluateCell<BINARY>(static_cast<const void*>(&elem))) {
-      CHECK(out_arena->RelocateSlice(elem, out));
+    }
+    if (ctx->pred()->EvaluateCell<BINARY>(static_cast<const void*>(&elem))) {
+      *out = elem;
+      retain_block = true;
     } else {
       sel->ClearBit(i);
     }
   });
+  if (PREDICT_TRUE(s.ok() && retain_block)) {
+    dst->memory()->RetainReference(block_);
+  }
+  return s;
 }
 
 
diff --git a/src/kudu/cfile/binary_plain_block.h b/src/kudu/cfile/binary_plain_block.h
index 953c4ae..edd4784 100644
--- a/src/kudu/cfile/binary_plain_block.h
+++ b/src/kudu/cfile/binary_plain_block.h
@@ -142,6 +142,10 @@ class BinaryPlainBlockDecoder final : public BlockDecoder {
     return Slice(&data_[str_offset], len);
   }
 
+  const scoped_refptr<BlockHandle>& block_handle() {
+    return block_;
+  }
+
   // Minimum length of a header.
   static const size_t kMinHeaderSize = sizeof(uint32_t) * 3;
 
@@ -163,7 +167,7 @@ class BinaryPlainBlockDecoder final : public BlockDecoder {
     return ret;
   }
 
-  scoped_refptr<BlockHandle> block_handle_;
+  scoped_refptr<BlockHandle> block_;
   Slice data_;
   bool parsed_;
 
diff --git a/src/kudu/cfile/block_handle.h b/src/kudu/cfile/block_handle.h
index d1bb62e..b88d844 100644
--- a/src/kudu/cfile/block_handle.h
+++ b/src/kudu/cfile/block_handle.h
@@ -24,8 +24,8 @@
 #include <boost/variant/variant.hpp>
 
 #include "kudu/cfile/block_cache.h"
-#include "kudu/gutil/ref_counted.h"
 #include "kudu/common/rowblock_memory.h"
+#include "kudu/gutil/ref_counted.h"
 
 namespace kudu {
 namespace cfile {
@@ -40,7 +40,7 @@ namespace cfile {
 // Note that the BlockHandle itself may refer to a BlockCacheHandle, which itself is
 // reference-counted. When all of the references to a BlockHandle go out of scope, it
 // results in decrementing the BlockCacheHandle's reference count.
-class BlockHandle : public RefCountedThreadSafe<BlockHandle> {
+class BlockHandle final : public RefCountedThreadSafe<BlockHandle> {
  public:
   static scoped_refptr<BlockHandle> WithOwnedData(const Slice& data) {
     return { new BlockHandle(data) };
diff --git a/src/kudu/common/rowblock_memory.h b/src/kudu/common/rowblock_memory.h
index 9117ebb..5a11b4b 100644
--- a/src/kudu/common/rowblock_memory.h
+++ b/src/kudu/common/rowblock_memory.h
@@ -16,10 +16,16 @@
 // under the License.
 #pragma once
 
+#include <functional>
+#include <vector>
+
+#include "kudu/gutil/ref_counted.h"
 #include "kudu/util/memory/arena.h"
 
 namespace kudu {
 
+class RowBlockRefCounted;
+
 // Handles the memory allocated alongside a RowBlock for variable-length
 // cells.
 //
@@ -27,11 +33,39 @@ namespace kudu {
 // data (eg BINARY columns). In this case, the data cannot be inlined directly
 // into the columnar data arrays that are part of the RowBlock and instead need
 // to be allocated out of a separate Arena. This class wraps that Arena.
+//
+// In some cases (eg "plain" or "dictionary" encodings), the underlying blocks may contain
+// string data in a non-encoded form. In that case, instead of copying strings, we can
+// refer to the strings within those data blocks themselves, and hold a reference to
+// the underlying block. This class holds those reference counts as well.
 struct RowBlockMemory {
   Arena arena;
 
   explicit RowBlockMemory(int arena_size = 32 * 1024) : arena(arena_size) {}
-  void Reset() { arena.Reset(); }
+  ~RowBlockMemory() { Reset(); }
+
+  void Reset() {
+    arena.Reset();
+    for (auto& f : to_release_) {
+      f();
+    }
+    to_release_.clear();
+  }
+
+  // Retain a reference, typically to a BlockHandle. This is templatized to avoid
+  // a circular dependency between kudu/common/ and kudu/cfile/
+  template<class T>
+  void RetainReference(const scoped_refptr<T>& item) {
+    // TODO(todd) if this ever ends up being a hot code path, we could
+    // probably optimize by having a small hashset of pointers. If an
+    // element is already in the set, we don't need to add a second copy.
+    T* raw = item.get();
+    raw->AddRef();
+    to_release_.emplace_back([=]() { raw->Release(); });
+  }
+
+ private:
+  std::vector<std::function<void()>> to_release_;
 };
 
 }  // namespace kudu
diff --git a/src/kudu/integration-tests/full_stack-insert-scan-test.cc b/src/kudu/integration-tests/full_stack-insert-scan-test.cc
index b91949e..7d3134e 100644
--- a/src/kudu/integration-tests/full_stack-insert-scan-test.cc
+++ b/src/kudu/integration-tests/full_stack-insert-scan-test.cc
@@ -21,6 +21,7 @@
 #include <cstdint>
 #include <memory>
 #include <ostream>
+#include <set>
 #include <string>
 #include <thread>
 #include <utility>
@@ -40,6 +41,7 @@
 #include "kudu/client/write_op.h"
 #include "kudu/codegen/compilation_manager.h"
 #include "kudu/common/partial_row.h"
+#include "kudu/gutil/map-util.h"
 #include "kudu/gutil/port.h"
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/gutil/strings/split.h"
@@ -98,6 +100,7 @@ using kudu::client::KuduTable;
 using kudu::client::KuduTableCreator;
 using kudu::cluster::InternalMiniCluster;
 using kudu::cluster::InternalMiniClusterOptions;
+using std::set;
 using std::string;
 using std::thread;
 using std::unique_ptr;
@@ -196,9 +199,17 @@ class FullStackInsertScanTest : public KuduTest {
   // Insert the rows that are associated with that ID.
   void InsertRows(CountDownLatch* start_latch, int id, uint32_t seed);
 
+  enum class ScanFlag {
+    // Disable the block cache for the scan.
+    kDontCacheBlocks,
+    // Enable fault tolerance. This triggers different iterator code paths.
+    kFaultTolerant,
+  };
+
   // Run a scan from the reader_client_ with the projection schema schema
   // and LOG_TIMING message msg.
-  void ScanProjection(const vector<string>& cols, const string& msg);
+  void ScanProjection(const vector<string>& cols, const string& msg,
+                      const set<ScanFlag>& flags = {});
 
   vector<string> AllColumnNames() const;
   vector<string> StringColumnNames() const;
@@ -325,6 +336,11 @@ void FullStackInsertScanTest::DoTestScans() {
 
   NO_FATALS(ScanProjection({}, "empty projection, 0 col"));
   NO_FATALS(ScanProjection({ "key" }, "key scan, 1 col"));
+  NO_FATALS(ScanProjection(AllColumnNames(), "full schema scan, no cache, 10 col",
+                           { ScanFlag::kDontCacheBlocks }));
+  NO_FATALS(ScanProjection(AllColumnNames(),
+                           "fault-tolerant full schema scan, no cache, 10 col",
+                           { ScanFlag::kDontCacheBlocks, ScanFlag::kFaultTolerant }));
   NO_FATALS(ScanProjection(AllColumnNames(), "full schema scan, 10 col"));
   NO_FATALS(ScanProjection(StringColumnNames(), "String projection, 1 col"));
   NO_FATALS(ScanProjection(Int32ColumnNames(), "Int32 projection, 4 col"));
@@ -394,8 +410,10 @@ void FullStackInsertScanTest::InsertRows(CountDownLatch* start_latch, int id,
   FlushSessionOrDie(session);
 }
 
+
 void FullStackInsertScanTest::ScanProjection(const vector<string>& cols,
-                                             const string& msg) {
+                                             const string& msg,
+                                             const set<ScanFlag>& flags) {
   {
     // Warmup codegen cache
     KuduScanner scanner(reader_table_.get());
@@ -404,6 +422,12 @@ void FullStackInsertScanTest::ScanProjection(const vector<string>& cols,
     codegen::CompilationManager::GetSingleton()->Wait();
   }
   KuduScanner scanner(reader_table_.get());
+  if (ContainsKey(flags, ScanFlag::kDontCacheBlocks)) {
+    CHECK_OK(scanner.SetCacheBlocks(false));
+  }
+  if (ContainsKey(flags, ScanFlag::kFaultTolerant)) {
+    CHECK_OK(scanner.SetFaultTolerant());
+  }
   ASSERT_OK(scanner.SetProjectedColumnNames(cols));
   uint64_t nrows = 0;
   LOG_TIMING(INFO, msg) {