You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@impala.apache.org by ta...@apache.org on 2020/03/25 16:29:12 UTC

[impala] branch master updated (b8d6b0d -> 5ff7c6a)

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from b8d6b0d  IMPALA-9546: Update ranger-admin-site.xml.template after RANGER-2688
     new da5b498  IMPALA-9373: more tactical IWYU fixes
     new 5ff7c6a  IMPALA-9538 Bump up linux-syscall-support.h

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/benchmarks/atod-benchmark.cc                |    1 +
 be/src/benchmarks/bloom-filter-benchmark.cc        |    1 +
 be/src/benchmarks/overflow-benchmark.cc            |   18 +-
 be/src/codegen/codegen-anyval.cc                   |    3 +-
 be/src/codegen/codegen-anyval.h                    |    3 +-
 be/src/codegen/llvm-codegen.cc                     |    2 +-
 be/src/common/init.cc                              |    1 -
 be/src/common/logging.cc                           |   42 +
 be/src/common/logging.h                            |    9 +-
 be/src/common/status.cc                            |   11 +-
 be/src/common/thread-debug-info-test.cc            |    1 +
 be/src/common/thread-debug-info.h                  |    8 +-
 be/src/exec/aggregator.cc                          |    1 +
 be/src/exec/blocking-plan-root-sink.cc             |    2 +
 be/src/exec/buffered-plan-root-sink.cc             |    2 +
 be/src/exec/catalog-op-executor.cc                 |    1 +
 be/src/exec/data-sink.cc                           |    1 +
 be/src/exec/data-sink.h                            |    2 +-
 be/src/exec/exec-node.cc                           |    2 +-
 be/src/exec/exec-node.h                            |   15 +-
 be/src/exec/filter-context.cc                      |    1 +
 be/src/exec/filter-context.h                       |    7 +-
 be/src/exec/grouping-aggregator.cc                 |    1 +
 be/src/exec/hash-table-test.cc                     |    1 -
 be/src/exec/hdfs-avro-scanner-ir.cc                |    2 +-
 be/src/exec/hdfs-columnar-scanner-ir.cc            |    2 +
 be/src/exec/hdfs-columnar-scanner.cc               |    4 +
 be/src/exec/hdfs-columnar-scanner.h                |   12 +-
 be/src/exec/hdfs-orc-scanner.cc                    |    2 +
 be/src/exec/hdfs-scan-node.cc                      |    1 +
 be/src/exec/hdfs-scanner.cc                        |    1 +
 be/src/exec/hdfs-sequence-scanner.cc               |   34 +-
 be/src/exec/hdfs-text-scanner.cc                   |   36 +-
 be/src/exec/join-builder.cc                        |    3 +
 be/src/exec/kudu-scan-node.cc                      |    1 +
 be/src/exec/kudu-scanner.cc                        |    1 +
 be/src/exec/kudu-table-sink.cc                     |    1 +
 be/src/exec/kudu-table-sink.h                      |    4 +-
 be/src/exec/orc-column-readers.cc                  |    7 +-
 be/src/exec/orc-column-readers.h                   |    1 +
 be/src/exec/parquet/hdfs-parquet-scanner.cc        |    7 +-
 be/src/exec/parquet/hdfs-parquet-scanner.h         |    1 -
 be/src/exec/parquet/parquet-column-chunk-reader.cc |    1 +
 be/src/exec/parquet/parquet-column-chunk-reader.h  |    3 +-
 be/src/exec/parquet/parquet-column-readers.cc      |    3 +
 be/src/exec/parquet/parquet-common.h               |    2 +
 be/src/exec/parquet/parquet-version-test.cc        |    1 -
 be/src/exec/partitioned-hash-join-builder-ir.cc    |   16 +-
 be/src/exec/partitioned-hash-join-builder.cc       |    1 +
 be/src/exec/plan-root-sink.cc                      |    2 +
 be/src/exec/read-write-util-test.cc                |    1 -
 be/src/exec/row-batch-cache.h                      |    1 -
 be/src/exec/row-batch-list-test.cc                 |    1 -
 be/src/exec/scan-node.cc                           |    2 +
 be/src/exec/topn-node.cc                           |    1 +
 be/src/exec/zigzag-test.cc                         |    1 -
 be/src/exprs/agg-fn-evaluator.cc                   |   23 +-
 be/src/exprs/agg-fn.h                              |    4 +
 be/src/exprs/aggregate-functions-ir.cc             |   17 +-
 be/src/exprs/aggregate-functions-test.cc           |    5 +-
 be/src/exprs/anyval-util.cc                        |    3 -
 be/src/exprs/anyval-util.h                         |    1 -
 be/src/exprs/decimal-functions-ir.cc               |    7 +-
 be/src/exprs/decimal-operators-ir.cc               |    2 +-
 be/src/exprs/expr-test.cc                          |    4 +-
 be/src/exprs/expr-value.h                          |   14 +-
 be/src/exprs/hive-udf-call.cc                      |    3 +-
 be/src/exprs/literal.cc                            |    1 +
 be/src/exprs/math-functions-ir.cc                  |    7 +-
 be/src/exprs/scalar-expr-evaluator.h               |    8 +
 be/src/exprs/timestamp-functions-ir.cc             |    5 +-
 be/src/exprs/timezone_db.cc                        |   12 +-
 be/src/exprs/udf-builtins.cc                       |   11 +
 be/src/gutil/linux_syscall_support.h               | 2722 +++++++++++++-------
 be/src/gutil/spinlock_linux-inl.h                  |   16 +-
 be/src/kudu/util/debug-util.cc                     |    3 +-
 be/src/rpc/thrift-util-test.cc                     |    2 -
 be/src/runtime/CMakeLists.txt                      |    1 -
 be/src/runtime/bufferpool/buffer-allocator.h       |    2 +
 be/src/runtime/bufferpool/buffer-pool-internal.h   |   17 +-
 be/src/runtime/bufferpool/buffer-pool-test.cc      |   29 +-
 be/src/runtime/bufferpool/buffer-pool.cc           |   10 +-
 be/src/runtime/bufferpool/buffer-pool.h            |    8 +-
 be/src/runtime/client-cache.h                      |   25 +-
 be/src/runtime/coordinator-backend-state.cc        |    1 +
 .../runtime/datetime-iso-sql-format-tokenizer.cc   |   10 +
 be/src/runtime/datetime-parser-common.cc           |    1 +
 be/src/runtime/datetime-parser-common.h            |    2 +
 be/src/runtime/decimal-test.cc                     |   13 +-
 be/src/runtime/decimal-value.h                     |    4 +-
 be/src/runtime/decimal-value.inline.h              |   48 +-
 be/src/runtime/descriptors.h                       |   16 +-
 be/src/runtime/exec-env.cc                         |    1 +
 be/src/runtime/io/data-cache-test.cc               |    1 -
 be/src/runtime/io/disk-io-mgr-test.cc              |    7 +-
 be/src/runtime/mem-tracker.h                       |   18 +-
 be/src/runtime/multi-precision.cc                  |   66 -
 be/src/runtime/multi-precision.h                   |   36 +-
 be/src/runtime/query-state.cc                      |    3 +-
 be/src/runtime/query-state.h                       |    6 +-
 be/src/runtime/runtime-filter-bank.cc              |    1 +
 be/src/runtime/sorted-run-merger.cc                |    1 +
 be/src/runtime/sorted-run-merger.h                 |    4 +-
 be/src/runtime/sorter.cc                           |    1 +
 be/src/runtime/sorter.h                            |    2 +-
 be/src/runtime/test-env.cc                         |    2 +
 be/src/runtime/test-env.h                          |    2 -
 be/src/runtime/timestamp-parse-util.cc             |   22 +-
 be/src/runtime/timestamp-test.cc                   |    2 +-
 be/src/runtime/timestamp-value.h                   |   23 +-
 be/src/runtime/tmp-file-mgr-internal.h             |   17 +-
 be/src/runtime/tmp-file-mgr-test.cc                |  102 +-
 be/src/runtime/tmp-file-mgr.cc                     |  118 +-
 be/src/runtime/tmp-file-mgr.h                      |  624 ++---
 be/src/runtime/tuple.h                             |    3 +
 be/src/service/client-request-state-map.cc         |    1 +
 be/src/service/client-request-state.cc             |    2 +
 be/src/service/client-request-state.h              |    1 -
 be/src/service/control-service.h                   |    1 -
 be/src/service/impala-beeswax-server.cc            |    1 +
 be/src/service/impala-http-handler.cc              |    1 +
 be/src/service/impala-internal-service.cc          |    1 +
 be/src/service/impala-server.h                     |    3 +-
 be/src/statestore/statestore.h                     |    1 -
 be/src/udf/uda-test.cc                             |    2 -
 be/src/util/CMakeLists.txt                         |    1 -
 be/src/util/arithmetic-util.h                      |   27 +
 be/src/util/auth-util.cc                           |    1 +
 be/src/util/auth-util.h                            |    1 -
 be/src/util/bit-stream-utils-test.cc               |    2 +
 be/src/util/bit-util-test.cc                       |   47 +-
 be/src/util/bit-util.h                             |   62 -
 be/src/util/bitmap-test.cc                         |    3 -
 be/src/util/bloom-filter.h                         |    1 -
 be/src/util/cgroup-util.cc                         |    1 -
 be/src/util/codec.cc                               |    3 +
 be/src/util/codec.h                                |    6 +-
 be/src/util/debug-util.h                           |    6 +-
 .../util/{unique-id-hash.h => decimal-constants.h} |   31 +-
 be/src/util/decimal-util.cc                        |   26 -
 be/src/util/decimal-util.h                         |   14 +-
 be/src/util/decompress.cc                          |    1 +
 be/src/util/decompress.h                           |    1 -
 be/src/util/dict-test.cc                           |    5 -
 be/src/util/event-metrics.h                        |    1 -
 be/src/util/logging-support-test.cc                |    1 -
 be/src/util/mem-info.cc                            |    2 -
 be/src/util/memory-metrics.h                       |    2 -
 be/src/util/metrics.h                              |   14 +-
 be/src/util/os-info.cc                             |    7 +-
 be/src/util/pretty-printer.h                       |    3 +-
 be/src/util/proc-info-test.cc                      |    8 -
 be/src/util/process-state-info.cc                  |    1 -
 be/src/util/redactor-test-utils.h                  |    2 -
 be/src/util/runtime-profile.cc                     |    1 +
 be/src/util/string-parser-test.cc                  |    1 -
 be/src/util/string-parser.h                        |    3 +
 be/src/util/symbols-util-test.cc                   |    2 -
 be/src/util/system-state-info.cc                   |    4 +-
 be/src/util/tuple-row-compare.h                    |    1 -
 be/src/util/uid-util-test.cc                       |    4 -
 be/src/util/uid-util.cc                            |    2 +
 be/src/util/uid-util.h                             |   12 +-
 163 files changed, 2861 insertions(+), 1874 deletions(-)
 delete mode 100644 be/src/runtime/multi-precision.cc
 copy be/src/util/{unique-id-hash.h => decimal-constants.h} (59%)
 delete mode 100644 be/src/util/decimal-util.cc

[impala] 02/02: IMPALA-9538 Bump up linux-syscall-support.h

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 5ff7c6a7de76aac2623710b3e43ceed8ce7424c8
Author: zhaorenhai <zh...@hotmail.com>
AuthorDate: Fri Mar 20 09:59:05 2020 +0000

    IMPALA-9538 Bump up linux-syscall-support.h
    
    Bump up linux-syscall-support.h to the newest version,
    which supports aarch64.
    
    Change-Id: I6c46acb17f048890a3f93fc6b910b2df3c1a7058
    Reviewed-on: http://gerrit.cloudera.org:8080/15510
    Reviewed-by: Tim Armstrong <ta...@cloudera.com>
    Tested-by: Tim Armstrong <ta...@cloudera.com>
---
 be/src/gutil/linux_syscall_support.h | 2722 ++++++++++++++++++++++------------
 be/src/gutil/spinlock_linux-inl.h    |   16 +-
 be/src/kudu/util/debug-util.cc       |    3 +-
 3 files changed, 1798 insertions(+), 943 deletions(-)

diff --git a/be/src/gutil/linux_syscall_support.h b/be/src/gutil/linux_syscall_support.h
index 5476d0b..a177cfb 100644
--- a/be/src/gutil/linux_syscall_support.h
+++ b/be/src/gutil/linux_syscall_support.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2005-2008, Google Inc.
+/* Copyright (c) 2005-2011, Google Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -66,6 +66,15 @@
  *   results in prefixes "sys[0..9]_". It is also possible to set this
  *   macro to -1, which avoids all prefixes.
  *
+ * SYS_SYSCALL_ENTRYPOINT:
+ *   Some applications (such as sandboxes that filter system calls), need
+ *   to be able to run custom-code each time a system call is made. If this
+ *   macro is defined, it expands to the name of a "common" symbol. If
+ *   this symbol is assigned a non-NULL pointer value, it is used as the
+ *   address of the system call entrypoint.
+ *   A pointer to this symbol can be obtained by calling
+ *   get_syscall_entrypoint()
+ *
  * This file defines a few internal symbols that all start with "LSS_".
  * Do not access these symbols from outside this file. They are not part
  * of the supported API.
@@ -73,11 +82,14 @@
 #ifndef SYS_LINUX_SYSCALL_SUPPORT_H
 #define SYS_LINUX_SYSCALL_SUPPORT_H
 
-/* We currently only support x86-32, x86-64, ARM, MIPS, and PPC on Linux.
+/* We currently only support x86-32, x86-64, ARM, MIPS, PPC, s390 and s390x
+ * on Linux.
  * Porting to other related platforms should not be difficult.
  */
-#if (defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \
-     defined(__mips__) || defined(__PPC__)) && defined(__linux)
+#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__) ||   \
+     defined(__mips__) || defined(__PPC__) || defined(__ARM_EABI__) || \
+     defined(__aarch64__) || defined(__s390__)) \
+  && (defined(__linux) || defined(__ANDROID__))
 
 #ifndef SYS_CPLUSPLUS
 #ifdef __cplusplus
@@ -89,23 +101,76 @@ extern "C" {
 #endif
 
 #include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
 #include <signal.h>
 #include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
 #include <string.h>
 #include <sys/ptrace.h>
 #include <sys/resource.h>
 #include <sys/time.h>
 #include <sys/types.h>
-#include <syscall.h>
+#include <sys/syscall.h>
 #include <unistd.h>
 #include <linux/unistd.h>
 #include <endian.h>
 
 #ifdef __mips__
 /* Include definitions of the ABI currently in use.                          */
+#ifdef __ANDROID__
+/* Android doesn't have sgidefs.h, but does have asm/sgidefs.h,
+ * which has the definitions we need.
+ */
+#include <asm/sgidefs.h>
+#else
 #include <sgidefs.h>
 #endif
+#endif
+#endif
 
+/* Some libcs, for example Android NDK and musl, #define these
+ * macros as aliases to their non-64 counterparts. To avoid naming
+ * conflict, remove them.
+ *
+ * These are restored by the corresponding #pragma pop_macro near
+ * the end of this file.
+ */
+#pragma push_macro("stat64")
+#pragma push_macro("fstat64")
+#pragma push_macro("lstat64")
+#pragma push_macro("pread64")
+#pragma push_macro("pwrite64")
+#pragma push_macro("getdents64")
+#undef stat64
+#undef fstat64
+#undef lstat64
+#undef pread64
+#undef pwrite64
+#undef getdents64
+
+#if defined(__ANDROID__) && defined(__x86_64__)
+// A number of x86_64 syscalls are blocked by seccomp on recent Android;
+// undefine them so that modern alternatives will be used instead where
+// possible.
+// The alternative syscalls have been sanity checked against linux-3.4+;
+// older versions might not work.
+# undef __NR_getdents
+# undef __NR_dup2
+# undef __NR_fork
+# undef __NR_getpgrp
+# undef __NR_open
+# undef __NR_poll
+# undef __NR_readlink
+# undef __NR_stat
+# undef __NR_unlink
+# undef __NR_pipe
+#endif
+
+#if defined(__ANDROID__)
+// waitpid is blocked by seccomp on all architectures on recent Android.
+# undef __NR_waitpid
 #endif
 
 /* As glibc often provides subtly incompatible data structures (and implicit
@@ -147,12 +212,17 @@ struct kernel_dirent64 {
 };
 
 /* include/linux/dirent.h                                                    */
+#if !defined(__NR_getdents)
+// when getdents is not available, getdents64 is used for both.
+#define kernel_dirent kernel_dirent64
+#else
 struct kernel_dirent {
   long               d_ino;
   long               d_off;
   unsigned short     d_reclen;
   char               d_name[256];
 };
+#endif
 
 /* include/linux/uio.h                                                       */
 struct kernel_iovec {
@@ -216,26 +286,14 @@ struct kernel_rusage {
   long               ru_nivcsw;
 };
 
-/* include/linux/capablilty.h                                                */
-struct kernel_cap_user_header {
-  unsigned int version;
-  int pid;
-};
-
-struct kernel_cap_user_data {
-  unsigned int effective;
-  unsigned int permitted;
-  unsigned int inheritable;
-};
-
-struct siginfo;
-#if defined(__i386__) || defined(__arm__) || defined(__PPC__)
+#if defined(__i386__) || defined(__ARM_EABI__) || defined(__ARM_ARCH_3__) \
+  || defined(__PPC__) || (defined(__s390__) && !defined(__s390x__))
 
 /* include/asm-{arm,i386,mips,ppc}/signal.h                                  */
 struct kernel_old_sigaction {
   union {
     void             (*sa_handler_)(int);
-    void             (*sa_sigaction_)(int, struct siginfo *, void *);
+    void             (*sa_sigaction_)(int, siginfo_t *, void *);
   };
   unsigned long      sa_mask;
   unsigned long      sa_flags;
@@ -243,6 +301,8 @@ struct kernel_old_sigaction {
 } __attribute__((packed,aligned(4)));
 #elif (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
   #define kernel_old_sigaction kernel_sigaction
+#elif defined(__aarch64__)
+  // No kernel_old_sigaction defined for arm64.
 #endif
 
 /* Some kernel functions (e.g. sigaction() in 2.6.23) require that the
@@ -260,7 +320,7 @@ struct kernel_old_sigaction {
 #define KERNEL_NSIG  64
 #endif
 
-/* include/asm-{arm,i386,mips,x86_64}/signal.h                               */
+/* include/asm-{arm,aarch64,i386,mips,x86_64}/signal.h                       */
 struct kernel_sigset_t {
   unsigned long sig[(KERNEL_NSIG + 8*sizeof(unsigned long) - 1)/
                     (8*sizeof(unsigned long))];
@@ -272,13 +332,13 @@ struct kernel_sigaction {
   unsigned long      sa_flags;
   union {
     void             (*sa_handler_)(int);
-    void             (*sa_sigaction_)(int, struct siginfo *, void *);
+    void             (*sa_sigaction_)(int, siginfo_t *, void *);
   };
   struct kernel_sigset_t sa_mask;
 #else
   union {
     void             (*sa_handler_)(int);
-    void             (*sa_sigaction_)(int, struct siginfo *, void *);
+    void             (*sa_sigaction_)(int, siginfo_t *, void *);
   };
   unsigned long      sa_flags;
   void               (*sa_restorer)(void);
@@ -292,7 +352,7 @@ struct kernel_sockaddr {
   char               sa_data[14];
 };
 
-/* include/asm-{arm,i386,mips,ppc}/stat.h                                    */
+/* include/asm-{arm,aarch64,i386,mips,ppc,s390}/stat.h                       */
 #ifdef __mips__
 #if _MIPS_SIM == _MIPS_SIM_ABI64
 struct kernel_stat {
@@ -319,7 +379,7 @@ struct kernel_stat64 {
   unsigned           __pad2;
   unsigned long long st_blocks;
 };
-#elif defined __PPC__ && !defined __PPC64__
+#elif defined __PPC__
 struct kernel_stat64 {
   unsigned long long st_dev;
   unsigned long long st_ino;
@@ -365,8 +425,8 @@ struct kernel_stat64 {
 };
 #endif
 
-/* include/asm-{arm,i386,mips,x86_64,ppc}/stat.h                             */
-#if defined(__i386__) || defined(__arm__)
+/* include/asm-{arm,aarch64,i386,mips,x86_64,ppc,s390}/stat.h                */
+#if defined(__i386__) || defined(__ARM_ARCH_3__) || defined(__ARM_EABI__)
 struct kernel_stat {
   /* The kernel headers suggest that st_dev and st_rdev should be 32bit
    * quantities encoding 12bit major and 20bit minor numbers in an interleaved
@@ -396,39 +456,34 @@ struct kernel_stat {
 };
 #elif defined(__x86_64__)
 struct kernel_stat {
-  unsigned long      st_dev;
-  unsigned long      st_ino;
-  unsigned long      st_nlink;
+  uint64_t           st_dev;
+  uint64_t           st_ino;
+  uint64_t           st_nlink;
   unsigned           st_mode;
   unsigned           st_uid;
   unsigned           st_gid;
   unsigned           __pad0;
-  unsigned long      st_rdev;
-  long               st_size;
-  long               st_blksize;
-  long               st_blocks;
-  unsigned long      st_atime_;
-  unsigned long      st_atime_nsec_;
-  unsigned long      st_mtime_;
-  unsigned long      st_mtime_nsec_;
-  unsigned long      st_ctime_;
-  unsigned long      st_ctime_nsec_;
-  long               __unused[3];
+  uint64_t           st_rdev;
+  int64_t            st_size;
+  int64_t            st_blksize;
+  int64_t            st_blocks;
+  uint64_t           st_atime_;
+  uint64_t           st_atime_nsec_;
+  uint64_t           st_mtime_;
+  uint64_t           st_mtime_nsec_;
+  uint64_t           st_ctime_;
+  uint64_t           st_ctime_nsec_;
+  int64_t            __unused4[3];
 };
 #elif defined(__PPC__)
 struct kernel_stat {
-  unsigned long      st_dev;
+  unsigned           st_dev;
   unsigned long      st_ino;      // ino_t
-#ifdef __PPC64__
-  unsigned long      st_nlink;    // nlink_t
-  unsigned int       st_mode;     // mode_t
-#else
-  unsigned int       st_mode;     // mode_t
+  unsigned long      st_mode;     // mode_t
   unsigned short     st_nlink;    // nlink_t
-#endif
-  unsigned int       st_uid;      // uid_t
-  unsigned int       st_gid;      // gid_t
-  unsigned long      st_rdev;
+  unsigned           st_uid;      // uid_t
+  unsigned           st_gid;      // gid_t
+  unsigned           st_rdev;
   long               st_size;     // off_t
   unsigned long      st_blksize;
   unsigned long      st_blocks;
@@ -440,9 +495,6 @@ struct kernel_stat {
   unsigned long      st_ctime_nsec_;
   unsigned long      __unused4;
   unsigned long      __unused5;
-#ifdef __PPC64__
-  unsigned long      __unused6;
-#endif
 };
 #elif (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64)
 struct kernel_stat {
@@ -467,9 +519,76 @@ struct kernel_stat {
   int                st_blocks;
   int                st_pad4[14];
 };
+#elif defined(__aarch64__)
+struct kernel_stat {
+  unsigned long      st_dev;
+  unsigned long      st_ino;
+  unsigned int       st_mode;
+  unsigned int       st_nlink;
+  unsigned int       st_uid;
+  unsigned int       st_gid;
+  unsigned long      st_rdev;
+  unsigned long      __pad1;
+  long               st_size;
+  int                st_blksize;
+  int                __pad2;
+  long               st_blocks;
+  long               st_atime_;
+  unsigned long      st_atime_nsec_;
+  long               st_mtime_;
+  unsigned long      st_mtime_nsec_;
+  long               st_ctime_;
+  unsigned long      st_ctime_nsec_;
+  unsigned int       __unused4;
+  unsigned int       __unused5;
+};
+#elif defined(__s390x__)
+struct kernel_stat {
+  unsigned long      st_dev;
+  unsigned long      st_ino;
+  unsigned long      st_nlink;
+  unsigned int       st_mode;
+  unsigned int       st_uid;
+  unsigned int       st_gid;
+  unsigned int       __pad1;
+  unsigned long      st_rdev;
+  unsigned long      st_size;
+  unsigned long      st_atime_;
+  unsigned long      st_atime_nsec_;
+  unsigned long      st_mtime_;
+  unsigned long      st_mtime_nsec_;
+  unsigned long      st_ctime_;
+  unsigned long      st_ctime_nsec_;
+  unsigned long      st_blksize;
+  long               st_blocks;
+  unsigned long      __unused[3];
+};
+#elif defined(__s390__)
+struct kernel_stat {
+  unsigned short     st_dev;
+  unsigned short     __pad1;
+  unsigned long      st_ino;
+  unsigned short     st_mode;
+  unsigned short     st_nlink;
+  unsigned short     st_uid;
+  unsigned short     st_gid;
+  unsigned short     st_rdev;
+  unsigned short     __pad2;
+  unsigned long      st_size;
+  unsigned long      st_blksize;
+  unsigned long      st_blocks;
+  unsigned long      st_atime_;
+  unsigned long      st_atime_nsec_;
+  unsigned long      st_mtime_;
+  unsigned long      st_mtime_nsec_;
+  unsigned long      st_ctime_;
+  unsigned long      st_ctime_nsec_;
+  unsigned long      __unused4;
+  unsigned long      __unused5;
+};
 #endif
 
-/* include/asm-{arm,i386,mips,x86_64,ppc}/statfs.h                           */
+/* include/asm-{arm,aarch64,i386,mips,x86_64,ppc,s390}/statfs.h              */
 #ifdef __mips__
 #if _MIPS_SIM != _MIPS_SIM_ABI64
 struct kernel_statfs64 {
@@ -487,6 +606,22 @@ struct kernel_statfs64 {
   unsigned long      f_spare[6];
 };
 #endif
+#elif defined(__s390__)
+/* See also arch/s390/include/asm/compat.h                                   */
+struct kernel_statfs64 {
+  unsigned int       f_type;
+  unsigned int       f_bsize;
+  unsigned long long f_blocks;
+  unsigned long long f_bfree;
+  unsigned long long f_bavail;
+  unsigned long long f_files;
+  unsigned long long f_ffree;
+  struct { int val[2]; } f_fsid;
+  unsigned int       f_namelen;
+  unsigned int       f_frsize;
+  unsigned int       f_flags;
+  unsigned int       f_spare[4];
+};
 #elif !defined(__x86_64__)
 struct kernel_statfs64 {
   unsigned long      f_type;
@@ -503,7 +638,7 @@ struct kernel_statfs64 {
 };
 #endif
 
-/* include/asm-{arm,i386,mips,x86_64,ppc,generic}/statfs.h                   */
+/* include/asm-{arm,i386,mips,x86_64,ppc,generic,s390}/statfs.h              */
 #ifdef __mips__
 struct kernel_statfs {
   long               f_type;
@@ -518,11 +653,40 @@ struct kernel_statfs {
   long               f_namelen;
   long               f_spare[6];
 };
-#else
+#elif defined(__x86_64__)
 struct kernel_statfs {
   /* x86_64 actually defines all these fields as signed, whereas all other  */
   /* platforms define them as unsigned. Leaving them at unsigned should not */
-  /* cause any problems.                                                    */
+  /* cause any problems. Make sure these are 64-bit even on x32.            */
+  uint64_t           f_type;
+  uint64_t           f_bsize;
+  uint64_t           f_blocks;
+  uint64_t           f_bfree;
+  uint64_t           f_bavail;
+  uint64_t           f_files;
+  uint64_t           f_ffree;
+  struct { int val[2]; } f_fsid;
+  uint64_t           f_namelen;
+  uint64_t           f_frsize;
+  uint64_t           f_spare[5];
+};
+#elif defined(__s390__)
+struct kernel_statfs {
+  unsigned int       f_type;
+  unsigned int       f_bsize;
+  unsigned long      f_blocks;
+  unsigned long      f_bfree;
+  unsigned long      f_bavail;
+  unsigned long      f_files;
+  unsigned long      f_ffree;
+  struct { int val[2]; } f_fsid;
+  unsigned int       f_namelen;
+  unsigned int       f_frsize;
+  unsigned int       f_flags;
+  unsigned int       f_spare[4];
+};
+#else
+struct kernel_statfs {
   unsigned long      f_type;
   unsigned long      f_bsize;
   unsigned long      f_blocks;
@@ -537,52 +701,10 @@ struct kernel_statfs {
 };
 #endif
 
-#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \
-    defined(__PPC__)
-/* include/linux/aio_abi.h                                                   */
-/* Layout depends on big/little endian.                                      */
-struct kernel_iocb {
-  unsigned long long aio_data;
-  unsigned int       aio_key;
-  unsigned int       aio_reserved;
-  unsigned short     aio_lio_opcode;
-           short     aio_reqprio;
-  unsigned int       aio_filedes;
-  unsigned long long aio_buf;
-  unsigned long long aio_nbytes;
-  unsigned long long aio_offset;
-  unsigned long long aio_reserved2;
-  unsigned int       aio_flags;
-  unsigned int       aio_resfd;
-};
-#elif defined(__PPC__)
-struct kernel_iocb {
-  unsigned long long aio_data;
-  unsigned int       aio_reserved;
-  unsigned int       aio_key;
-  unsigned short     aio_lio_opcode;
-           short     aio_reqprio;
-  unsigned int       aio_fildes;
-  unsigned long long aio_buf;
-  unsigned long long aio_nbytes;
-  unsigned long long aio_offset;
-  unsigned long long aio_reserved2;
-  unsigned int       aio_flags;
-  unsigned int       aio_resfd;
-};
-#endif
-
-/* include/linux/aio_abi.h                                                   */
-struct kernel_io_event {
-  unsigned long long data;
-  unsigned long long obj;
-           long long res;
-           long long res2;
-};
 
 /* Definitions missing from the standard header files                        */
 #ifndef O_DIRECTORY
-#if defined(__arm__) || defined(__PPC_)
+#if defined(__ARM_ARCH_3__) || defined(__ARM_EABI__) || defined(__aarch64__)
 #define O_DIRECTORY             0040000
 #else
 #define O_DIRECTORY             0200000
@@ -635,11 +757,12 @@ struct kernel_io_event {
 #endif
 #ifndef MAKE_PROCESS_CPUCLOCK
 #define MAKE_PROCESS_CPUCLOCK(pid, clock)                                     \
-        ((~(int)(pid) << 3) | (int)(clock))
+        ((int)(~(unsigned)(pid) << 3) | (int)(clock))
 #endif
 #ifndef MAKE_THREAD_CPUCLOCK
 #define MAKE_THREAD_CPUCLOCK(tid, clock)                                      \
-        ((~(int)(tid) << 3) | (int)((clock) | CPUCLOCK_PERTHREAD_MASK))
+        ((int)(~(unsigned)(tid) << 3) |                                       \
+         (int)((clock) | CPUCLOCK_PERTHREAD_MASK))
 #endif
 
 #ifndef FUTEX_WAIT
@@ -711,12 +834,6 @@ struct kernel_io_event {
 #endif
 
 #if defined(__i386__)
-#ifndef __NR_mount
-#define __NR_mount               21
-#endif
-#ifndef __NR_setgroups32
-#define __NR_setgroups32         81
-#endif
 #ifndef __NR_quotactl
 #define __NR_quotactl           131
 #endif
@@ -795,13 +912,6 @@ struct kernel_io_event {
 #define __NR_sched_setaffinity  241
 #define __NR_sched_getaffinity  242
 #endif
-#ifndef __NR_io_setup
-#define __NR_io_setup           245
-#define __NR_io_destroy         246
-#define __NR_io_getevents       247
-#define __NR_io_submit          248
-#define __NR_io_cancel          249
-#endif
 #ifndef __NR_set_tid_address
 #define __NR_set_tid_address    258
 #endif
@@ -835,9 +945,6 @@ struct kernel_io_event {
 #ifndef __NR_unlinkat
 #define __NR_unlinkat           301
 #endif
-#ifndef __NR_unshare
-#define __NR_unshare            310
-#endif
 #ifndef __NR_move_pages
 #define __NR_move_pages         317
 #endif
@@ -847,37 +954,8 @@ struct kernel_io_event {
 #ifndef __NR_fallocate
 #define __NR_fallocate          324
 #endif
-#ifndef __NR_preadv
-#define __NR_preadv             333
-#endif
-#ifndef __NR_pwritev
-#define __NR_pwritev            334
-#endif
-#ifndef __NR_setns
-#define __NR_setns              346
-#endif
 /* End of i386 definitions                                                   */
-#elif defined(__arm__)
-#ifndef __syscall
-#if defined(__thumb__) || defined(__ARM_EABI__)
-#define __SYS_REG(name) register long __sysreg __asm__("r6") = __NR_##name;
-#define __SYS_REG_LIST(regs...) [sysreg] "r" (__sysreg) , ##regs
-#define __syscall(name) "swi\t0"
-#define __syscall_safe(name)                     \
-  "push  {r7}\n"                                 \
-  "mov   r7,%[sysreg]\n"                         \
-  __syscall(name)"\n"                            \
-  "pop   {r7}"
-#else
-#define __SYS_REG(name)
-#define __SYS_REG_LIST(regs...) regs
-#define __syscall(name) "swi\t" __sys1(__NR_##name) ""
-#define __syscall_safe(name) __syscall(name)
-#endif
-#endif
-#ifndef __NR_mount
-#define __NR_mount              (__NR_SYSCALL_BASE + 21)
-#endif
+#elif defined(__ARM_ARCH_3__) || defined(__ARM_EABI__)
 #ifndef __NR_setresuid
 #define __NR_setresuid          (__NR_SYSCALL_BASE + 164)
 #define __NR_getresuid          (__NR_SYSCALL_BASE + 165)
@@ -897,9 +975,6 @@ struct kernel_io_event {
 #ifndef __NR_pwrite64
 #define __NR_pwrite64           (__NR_SYSCALL_BASE + 181)
 #endif
-#ifndef __NR_capset
-#define __NR_capset             (__NR_SYSCALL_BASE + 185)
-#endif
 #ifndef __NR_ugetrlimit
 #define __NR_ugetrlimit         (__NR_SYSCALL_BASE + 191)
 #endif
@@ -909,9 +984,6 @@ struct kernel_io_event {
 #ifndef __NR_fstat64
 #define __NR_fstat64            (__NR_SYSCALL_BASE + 197)
 #endif
-#ifndef __NR_setgroups32
-#define __NR_setgroups32        (__NR_SYSCALL_BASE + 206)
-#endif
 #ifndef __NR_setresuid32
 #define __NR_setresuid32        (__NR_SYSCALL_BASE + 208)
 #define __NR_getresuid32        (__NR_SYSCALL_BASE + 209)
@@ -980,16 +1052,110 @@ struct kernel_io_event {
 #ifndef __NR_ioprio_get
 #define __NR_ioprio_get         (__NR_SYSCALL_BASE + 315)
 #endif
-#ifndef __NR_unshare
-#define __NR_unshare            (__NR_SYSCALL_BASE + 337)
-#endif
 #ifndef __NR_move_pages
 #define __NR_move_pages         (__NR_SYSCALL_BASE + 344)
 #endif
-#ifndef __NR_setns
-#define __NR_setns              (__NR_SYSCALL_BASE + 375)
+#ifndef __NR_getcpu
+#define __NR_getcpu             (__NR_SYSCALL_BASE + 345)
+#endif
+/* End of ARM 3/EABI definitions                                             */
+#elif defined(__aarch64__)
+#ifndef __NR_setxattr
+#define __NR_setxattr             5
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr            6
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr             8
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr            9
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr           11
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr          12
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set          30
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get          31
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat            35
+#endif
+#ifndef __NR_fallocate
+#define __NR_fallocate           47
+#endif
+#ifndef __NR_openat
+#define __NR_openat              56
+#endif
+#ifndef __NR_quotactl
+#define __NR_quotactl            60
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64          61
+#endif
+#ifndef __NR_getdents
+// when getdents is not available, getdents64 is used for both.
+#define __NR_getdents            __NR_getdents64
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64             67
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64            68
+#endif
+#ifndef __NR_ppoll
+#define __NR_ppoll               73
+#endif
+#ifndef __NR_readlinkat
+#define __NR_readlinkat          78
+#endif
+#ifndef __NR_newfstatat
+#define __NR_newfstatat          79
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address     96
 #endif
-/* End of ARM definitions                                                  */
+#ifndef __NR_futex
+#define __NR_futex               98
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime      113
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres       114
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity  122
+#define __NR_sched_getaffinity  123
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill              130
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid          147
+#define __NR_getresuid          148
+#define __NR_setresgid          149
+#define __NR_getresgid          150
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid             178
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead          213
+#endif
+#ifndef __NR_fadvise64
+#define __NR_fadvise64          223
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages         239
+#endif
+/* End of aarch64 definitions                                                */
 #elif defined(__x86_64__)
 #ifndef __NR_pread64
 #define __NR_pread64             17
@@ -1003,9 +1169,6 @@ struct kernel_io_event {
 #define __NR_setresgid          119
 #define __NR_getresgid          120
 #endif
-#ifndef __NR_mount
-#define __NR_mount              165
-#endif
 #ifndef __NR_quotactl
 #define __NR_quotactl           179
 #endif
@@ -1043,16 +1206,13 @@ struct kernel_io_event {
 #define __NR_sched_setaffinity  203
 #define __NR_sched_getaffinity  204
 #endif
-#ifndef __NR_io_setup
-#define __NR_io_setup           206
-#define __NR_io_destroy         207
-#define __NR_io_getevents       208
-#define __NR_io_submit          209
-#define __NR_io_cancel          210
-#endif
 #ifndef __NR_getdents64
 #define __NR_getdents64         217
 #endif
+#ifndef __NR_getdents
+// when getdents is not available, getdents64 is used for both.
+#define __NR_getdents           __NR_getdents64
+#endif
 #ifndef __NR_set_tid_address
 #define __NR_set_tid_address    218
 #endif
@@ -1080,30 +1240,15 @@ struct kernel_io_event {
 #ifndef __NR_unlinkat
 #define __NR_unlinkat           263
 #endif
-#ifndef __NR_unshare
-#define __NR_unshare            272
-#endif
 #ifndef __NR_move_pages
 #define __NR_move_pages         279
 #endif
 #ifndef __NR_fallocate
 #define __NR_fallocate          285
 #endif
-#ifndef __NR_preadv
-#define __NR_preadv             295
-#endif
-#ifndef __NR_pwritev
-#define __NR_pwritev            296
-#endif
-#ifndef __NR_setns
-#define __NR_setns              308
-#endif
 /* End of x86-64 definitions                                                 */
 #elif defined(__mips__)
 #if _MIPS_SIM == _MIPS_SIM_ABI32
-#ifndef __NR_mount
-#define __NR_mount              (__NR_Linux + 21)
-#endif
 #ifndef __NR_setresuid
 #define __NR_setresuid          (__NR_Linux + 185)
 #define __NR_getresuid          (__NR_Linux + 186)
@@ -1123,9 +1268,6 @@ struct kernel_io_event {
 #ifndef __NR_pwrite64
 #define __NR_pwrite64           (__NR_Linux + 201)
 #endif
-#ifndef __NR_capset
-#define __NR_capset             (__NR_Linux + 205)
-#endif
 #ifndef __NR_stat64
 #define __NR_stat64             (__NR_Linux + 213)
 #endif
@@ -1193,9 +1335,6 @@ struct kernel_io_event {
 #ifndef __NR_unlinkat
 #define __NR_unlinkat           (__NR_Linux + 294)
 #endif
-#ifndef __NR_unshare
-#define __NR_unshare            (__NR_Linux + 303)
-#endif
 #ifndef __NR_move_pages
 #define __NR_move_pages         (__NR_Linux + 308)
 #endif
@@ -1208,9 +1347,6 @@ struct kernel_io_event {
 #ifndef __NR_ioprio_get
 #define __NR_ioprio_get         (__NR_Linux + 315)
 #endif
-#ifndef __NR_setns
-#define __NR_setns              (__NR_Linux + 344)
-#endif
 /* End of MIPS (old 32bit API) definitions */
 #elif  _MIPS_SIM == _MIPS_SIM_ABI64
 #ifndef __NR_pread64
@@ -1225,12 +1361,6 @@ struct kernel_io_event {
 #define __NR_setresgid          (__NR_Linux + 117)
 #define __NR_getresgid          (__NR_Linux + 118)
 #endif
-#ifndef __NR_capset
-#define __NR_capset             (__NR_Linux + 124)
-#endif
-#ifndef __NR_mount
-#define __NR_mount              (__NR_Linux + 160)
-#endif
 #ifndef __NR_gettid
 #define __NR_gettid             (__NR_Linux + 178)
 #endif
@@ -1283,9 +1413,6 @@ struct kernel_io_event {
 #ifndef __NR_unlinkat
 #define __NR_unlinkat           (__NR_Linux + 253)
 #endif
-#ifndef __NR_unshare
-#define __NR_unshare            (__NR_Linux + 262)
-#endif
 #ifndef __NR_move_pages
 #define __NR_move_pages         (__NR_Linux + 267)
 #endif
@@ -1298,23 +1425,14 @@ struct kernel_io_event {
 #ifndef __NR_ioprio_get
 #define __NR_ioprio_get         (__NR_Linux + 274)
 #endif
-#ifndef __NR_setns
-#define __NR_setns              (__NR_Linux + 303)
-#endif
 /* End of MIPS (64bit API) definitions */
 #else
-#ifndef __NR_mount
-#define __NR_mount              (__NR_Linux + 160)
-#endif
 #ifndef __NR_setresuid
 #define __NR_setresuid          (__NR_Linux + 115)
 #define __NR_getresuid          (__NR_Linux + 116)
 #define __NR_setresgid          (__NR_Linux + 117)
 #define __NR_getresgid          (__NR_Linux + 118)
 #endif
-#ifndef __NR_capset
-#define __NR_capset             (__NR_Linux + 124)
-#endif
 #ifndef __NR_gettid
 #define __NR_gettid             (__NR_Linux + 178)
 #endif
@@ -1373,9 +1491,6 @@ struct kernel_io_event {
 #ifndef __NR_unlinkat
 #define __NR_unlinkat           (__NR_Linux + 257)
 #endif
-#ifndef __NR_unshare
-#define __NR_unshare            (__NR_Linux + 266)
-#endif
 #ifndef __NR_move_pages
 #define __NR_move_pages         (__NR_Linux + 271)
 #endif
@@ -1388,16 +1503,10 @@ struct kernel_io_event {
 #ifndef __NR_ioprio_get
 #define __NR_ioprio_get         (__NR_Linux + 278)
 #endif
-#ifndef __NR_setns
-#define __NR_setns              (__NR_Linux + 308)
-#endif
 /* End of MIPS (new 32bit API) definitions                                   */
 #endif
 /* End of MIPS definitions                                                   */
 #elif defined(__PPC__)
-#ifndef __NR_mount
-#define __NR_mount              21
-#endif
 #ifndef __NR_setfsuid
 #define __NR_setfsuid           138
 #define __NR_setfsgid           139
@@ -1421,23 +1530,18 @@ struct kernel_io_event {
 #ifndef __NR_pwrite64
 #define __NR_pwrite64           180
 #endif
-#ifndef __NR_capset
-#define __NR_capset             184
-#endif
 #ifndef __NR_ugetrlimit
 #define __NR_ugetrlimit         190
 #endif
 #ifndef __NR_readahead
 #define __NR_readahead          191
 #endif
-#ifndef __PPC64__
 #ifndef __NR_stat64
 #define __NR_stat64             195
 #endif
 #ifndef __NR_fstat64
 #define __NR_fstat64            197
 #endif
-#endif /* !defined(__PPC64__) */
 #ifndef __NR_getdents64
 #define __NR_getdents64         202
 #endif
@@ -1487,28 +1591,21 @@ struct kernel_io_event {
 #ifndef __NR_fstatfs64
 #define __NR_fstatfs64          253
 #endif
-#ifndef __PPC64__
 #ifndef __NR_fadvise64_64
 #define __NR_fadvise64_64       254
 #endif
-#endif /* !defined(__PPC64__) */
 #ifndef __NR_ioprio_set
 #define __NR_ioprio_set         273
 #endif
 #ifndef __NR_ioprio_get
 #define __NR_ioprio_get         274
 #endif
-#ifndef __NR_unshare
-#define __NR_unshare            282
-#endif
 #ifndef __NR_openat
 #define __NR_openat             286
 #endif
-#ifndef __PPC64__
 #ifndef __NR_fstatat64
 #define __NR_fstatat64          291
 #endif
-#endif /* !defined(__PPC64__) */
 #ifndef __NR_unlinkat
 #define __NR_unlinkat           292
 #endif
@@ -1518,75 +1615,256 @@ struct kernel_io_event {
 #ifndef __NR_getcpu
 #define __NR_getcpu             302
 #endif
-#ifndef __NR_setns
-#define __NR_setns              350
-#endif
 /* End of powerpc defininitions                                              */
+#elif defined(__s390__)
+#ifndef __NR_quotactl
+#define __NR_quotactl           131
 #endif
-
-
-/* After forking, we must make sure to only call system calls.               */
-#if __BOUNDED_POINTERS__
-  #error "Need to port invocations of syscalls for bounded ptrs"
-#else
-  /* The core dumper and the thread lister get executed after threads
-   * have been suspended. As a consequence, we cannot call any functions
-   * that acquire locks. Unfortunately, libc wraps most system calls
-   * (e.g. in order to implement pthread_atfork, and to make calls
-   * cancellable), which means we cannot call these functions. Instead,
-   * we have to call syscall() directly.
-   */
-  #undef LSS_ERRNO
-  #ifdef SYS_ERRNO
-    /* Allow the including file to override the location of errno. This can
-     * be useful when using clone() with the CLONE_VM option.
-     */
-    #define LSS_ERRNO SYS_ERRNO
-  #else
-    #define LSS_ERRNO errno
-  #endif
-
-  #undef LSS_INLINE
-  #ifdef SYS_INLINE
-    #define LSS_INLINE SYS_INLINE
-  #else
-    #define LSS_INLINE static inline
-  #endif
-
-  /* Allow the including file to override the prefix used for all new
-   * system calls. By default, it will be set to "sys_".
-   */
-  #undef LSS_NAME
-  #ifndef SYS_PREFIX
-    #define LSS_NAME(name) sys_##name
-  #elif SYS_PREFIX < 0
-    #define LSS_NAME(name) name
-  #elif SYS_PREFIX == 0
-    #define LSS_NAME(name) sys0_##name
-  #elif SYS_PREFIX == 1
-    #define LSS_NAME(name) sys1_##name
-  #elif SYS_PREFIX == 2
-    #define LSS_NAME(name) sys2_##name
-  #elif SYS_PREFIX == 3
-    #define LSS_NAME(name) sys3_##name
-  #elif SYS_PREFIX == 4
-    #define LSS_NAME(name) sys4_##name
-  #elif SYS_PREFIX == 5
-    #define LSS_NAME(name) sys5_##name
-  #elif SYS_PREFIX == 6
-    #define LSS_NAME(name) sys6_##name
-  #elif SYS_PREFIX == 7
-    #define LSS_NAME(name) sys7_##name
-  #elif SYS_PREFIX == 8
-    #define LSS_NAME(name) sys8_##name
-  #elif SYS_PREFIX == 9
-    #define LSS_NAME(name) sys9_##name
-  #endif
-
-  #undef  LSS_RETURN
-  #if (defined(__i386__) || defined(__x86_64__) || defined(__arm__))
-  /* Failing system calls return a negative result in the range of
-   * -1..-4095. These are "errno" values with the sign inverted.
+#ifndef __NR_rt_sigreturn
+#define __NR_rt_sigreturn       173
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigaction       174
+#endif
+#ifndef __NR_rt_sigprocmask
+#define __NR_rt_sigprocmask     175
+#endif
+#ifndef __NR_rt_sigpending
+#define __NR_rt_sigpending      176
+#endif
+#ifndef __NR_rt_sigsuspend
+#define __NR_rt_sigsuspend      179
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64            180
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64           181
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64         220
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead          222
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr           224
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr          225
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr           227
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr          228
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr          230
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr         231
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid             236
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill              237
+#endif
+#ifndef __NR_futex
+#define __NR_futex              238
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity  239
+#endif
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity  240
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address    252
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime      260
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres       261
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64           265
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64          266
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set         282
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get         283
+#endif
+#ifndef __NR_openat
+#define __NR_openat             288
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat           294
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages         310
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu             311
+#endif
+#ifndef __NR_fallocate
+#define __NR_fallocate          314
+#endif
+/* Some syscalls are named/numbered differently between s390 and s390x. */
+#ifdef __s390x__
+# ifndef __NR_getrlimit
+# define __NR_getrlimit          191
+# endif
+# ifndef __NR_setresuid
+# define __NR_setresuid          208
+# endif
+# ifndef __NR_getresuid
+# define __NR_getresuid          209
+# endif
+# ifndef __NR_setresgid
+# define __NR_setresgid          210
+# endif
+# ifndef __NR_getresgid
+# define __NR_getresgid          211
+# endif
+# ifndef __NR_setfsuid
+# define __NR_setfsuid           215
+# endif
+# ifndef __NR_setfsgid
+# define __NR_setfsgid           216
+# endif
+# ifndef __NR_fadvise64
+# define __NR_fadvise64          253
+# endif
+# ifndef __NR_newfstatat
+# define __NR_newfstatat         293
+# endif
+#else /* __s390x__ */
+# ifndef __NR_getrlimit
+# define __NR_getrlimit          76
+# endif
+# ifndef __NR_setfsuid
+# define __NR_setfsuid           138
+# endif
+# ifndef __NR_setfsgid
+# define __NR_setfsgid           139
+# endif
+# ifndef __NR_setresuid
+# define __NR_setresuid          164
+# endif
+# ifndef __NR_getresuid
+# define __NR_getresuid          165
+# endif
+# ifndef __NR_setresgid
+# define __NR_setresgid          170
+# endif
+# ifndef __NR_getresgid
+# define __NR_getresgid          171
+# endif
+# ifndef __NR_ugetrlimit
+# define __NR_ugetrlimit         191
+# endif
+# ifndef __NR_mmap2
+# define __NR_mmap2              192
+# endif
+# ifndef __NR_setresuid32
+# define __NR_setresuid32        208
+# endif
+# ifndef __NR_getresuid32
+# define __NR_getresuid32        209
+# endif
+# ifndef __NR_setresgid32
+# define __NR_setresgid32        210
+# endif
+# ifndef __NR_getresgid32
+# define __NR_getresgid32        211
+# endif
+# ifndef __NR_setfsuid32
+# define __NR_setfsuid32         215
+# endif
+# ifndef __NR_setfsgid32
+# define __NR_setfsgid32         216
+# endif
+# ifndef __NR_fadvise64_64
+# define __NR_fadvise64_64       264
+# endif
+# ifndef __NR_fstatat64
+# define __NR_fstatat64          293
+# endif
+#endif /* __s390x__ */
+/* End of s390/s390x definitions                                             */
+#endif
+
+
+/* After forking, we must make sure to only call system calls.               */
+#if defined(__BOUNDED_POINTERS__)
+  #error "Need to port invocations of syscalls for bounded ptrs"
+#else
+  /* The core dumper and the thread lister get executed after threads
+   * have been suspended. As a consequence, we cannot call any functions
+   * that acquire locks. Unfortunately, libc wraps most system calls
+   * (e.g. in order to implement pthread_atfork, and to make calls
+   * cancellable), which means we cannot call these functions. Instead,
+   * we have to call syscall() directly.
+   */
+  #undef LSS_ERRNO
+  #ifdef SYS_ERRNO
+    /* Allow the including file to override the location of errno. This can
+     * be useful when using clone() with the CLONE_VM option.
+     */
+    #define LSS_ERRNO SYS_ERRNO
+  #else
+    #define LSS_ERRNO errno
+  #endif
+
+  #undef LSS_INLINE
+  #ifdef SYS_INLINE
+    #define LSS_INLINE SYS_INLINE
+  #else
+    #define LSS_INLINE static inline
+  #endif
+
+  /* Allow the including file to override the prefix used for all new
+   * system calls. By default, it will be set to "sys_".
+   */
+  #undef LSS_NAME
+  #ifndef SYS_PREFIX
+    #define LSS_NAME(name) sys_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX < 0
+    #define LSS_NAME(name) name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 0
+    #define LSS_NAME(name) sys0_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 1
+    #define LSS_NAME(name) sys1_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 2
+    #define LSS_NAME(name) sys2_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 3
+    #define LSS_NAME(name) sys3_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 4
+    #define LSS_NAME(name) sys4_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 5
+    #define LSS_NAME(name) sys5_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 6
+    #define LSS_NAME(name) sys6_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 7
+    #define LSS_NAME(name) sys7_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 8
+    #define LSS_NAME(name) sys8_##name
+  #elif defined(SYS_PREFIX) && SYS_PREFIX == 9
+    #define LSS_NAME(name) sys9_##name
+  #endif
+
+  #undef  LSS_RETURN
+  #if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__) \
+       || defined(__ARM_EABI__) || defined(__aarch64__) || defined(__s390__))
+  /* Failing system calls return a negative result in the range of
+   * -1..-4095. These are "errno" values with the sign inverted.
    */
   #define LSS_RETURN(type, res)                                               \
     do {                                                                      \
@@ -1603,7 +1881,8 @@ struct kernel_io_event {
   #define LSS_RETURN(type, res, err)                                          \
     do {                                                                      \
       if (err) {                                                              \
-        LSS_ERRNO = (res);                                                    \
+        unsigned long __errnovalue = (res);                                   \
+        LSS_ERRNO = __errnovalue;                                             \
         res = -1;                                                             \
       }                                                                       \
       return (type) (res);                                                    \
@@ -1622,15 +1901,6 @@ struct kernel_io_event {
    } while (0)
   #endif
   #if defined(__i386__)
-    #if defined(NO_FRAME_POINTER) && (100 * __GNUC__ + __GNUC_MINOR__ >= 404)
-      /* This only works for GCC-4.4 and above -- the first version to use
-         .cfi directives for dwarf unwind info.  */
-      #define CFI_ADJUST_CFA_OFFSET(adjust)                                   \
-                  ".cfi_adjust_cfa_offset " #adjust "\n"
-    #else
-      #define CFI_ADJUST_CFA_OFFSET(adjust) /**/
-    #endif
-
     /* In PIC mode (e.g. when building shared libraries), gcc for i386
      * reserves ebx. Unfortunately, most distribution ship with implementations
      * of _syscallX() which clobber ebx.
@@ -1639,15 +1909,58 @@ struct kernel_io_event {
      * at optimizing across __asm__ calls.
      * So, we just have to redefine all of the _syscallX() macros.
      */
+    #undef LSS_ENTRYPOINT
+    #ifdef SYS_SYSCALL_ENTRYPOINT
+    static inline void (**LSS_NAME(get_syscall_entrypoint)(void))(void) {
+      void (**entrypoint)(void);
+      asm volatile(".bss\n"
+                   ".align 8\n"
+                   ".globl " SYS_SYSCALL_ENTRYPOINT "\n"
+                   ".common " SYS_SYSCALL_ENTRYPOINT ",8,8\n"
+                   ".previous\n"
+                   /* This logically does 'lea "SYS_SYSCALL_ENTRYPOINT", %0' */
+                   "call 0f\n"
+                 "0:pop  %0\n"
+                   "add  $_GLOBAL_OFFSET_TABLE_+[.-0b], %0\n"
+                   "mov  " SYS_SYSCALL_ENTRYPOINT "@GOT(%0), %0\n"
+                   : "=r"(entrypoint));
+      return entrypoint;
+    }
+
+    #define LSS_ENTRYPOINT ".bss\n"                                           \
+                           ".align 8\n"                                       \
+                           ".globl " SYS_SYSCALL_ENTRYPOINT "\n"              \
+                           ".common " SYS_SYSCALL_ENTRYPOINT ",8,8\n"         \
+                           ".previous\n"                                      \
+                           /* Check the SYS_SYSCALL_ENTRYPOINT vector      */ \
+                           "push %%eax\n"                                     \
+                           "call 10000f\n"                                    \
+                     "10000:pop  %%eax\n"                                     \
+                           "add  $_GLOBAL_OFFSET_TABLE_+[.-10000b], %%eax\n"  \
+                           "mov  " SYS_SYSCALL_ENTRYPOINT                     \
+                                 "@GOT(%%eax), %%eax\n"                       \
+                           "mov  0(%%eax), %%eax\n"                           \
+                           "test %%eax, %%eax\n"                              \
+                           "jz   10002f\n"                                    \
+                           "push %%eax\n"                                     \
+                           "call 10001f\n"                                    \
+                     "10001:pop  %%eax\n"                                     \
+                           "add  $(10003f-10001b), %%eax\n"                   \
+                           "xchg 4(%%esp), %%eax\n"                           \
+                           "ret\n"                                            \
+                     "10002:pop  %%eax\n"                                     \
+                           "int $0x80\n"                                      \
+                     "10003:\n"
+    #else
+    #define LSS_ENTRYPOINT "int $0x80\n"
+    #endif
     #undef  LSS_BODY
     #define LSS_BODY(type,args...)                                            \
       long __res;                                                             \
       __asm__ __volatile__("push %%ebx\n"                                     \
-                           CFI_ADJUST_CFA_OFFSET(4)                           \
                            "movl %2,%%ebx\n"                                  \
-                           "int $0x80\n"                                      \
-                           "pop %%ebx\n"                                      \
-                           CFI_ADJUST_CFA_OFFSET(-4)                          \
+                           LSS_ENTRYPOINT                                     \
+                           "pop %%ebx"                                        \
                            args                                               \
                            : "esp", "memory");                                \
       LSS_RETURN(type,__res)
@@ -1655,7 +1968,7 @@ struct kernel_io_event {
     #define _syscall0(type,name)                                              \
       type LSS_NAME(name)(void) {                                             \
         long __res;                                                           \
-        __asm__ volatile("int $0x80"                                          \
+        __asm__ volatile(LSS_ENTRYPOINT                                       \
                          : "=a" (__res)                                       \
                          : "0" (__NR_##name)                                  \
                          : "memory");                                         \
@@ -1700,7 +2013,7 @@ struct kernel_io_event {
         __asm__ __volatile__("push %%ebx\n"                                   \
                              "movl %2,%%ebx\n"                                \
                              "movl %1,%%eax\n"                                \
-                             "int  $0x80\n"                                   \
+                             LSS_ENTRYPOINT                                   \
                              "pop  %%ebx"                                     \
                              : "=a" (__res)                                   \
                              : "i" (__NR_##name), "ri" ((long)(arg1)),        \
@@ -1721,7 +2034,7 @@ struct kernel_io_event {
                              "movl 4(%2),%%ebp\n"                             \
                              "movl 0(%2), %%ebx\n"                            \
                              "movl %1,%%eax\n"                                \
-                             "int  $0x80\n"                                   \
+                             LSS_ENTRYPOINT                                   \
                              "pop  %%ebx\n"                                   \
                              "pop  %%ebp"                                     \
                              : "=a" (__res)                                   \
@@ -1777,7 +2090,7 @@ struct kernel_io_event {
                            "pushl  %%ebx\n"
                            "movl   %%eax,%%ebx\n"
                            "movl   %2,%%eax\n"
-                           "int    $0x80\n"
+                           LSS_ENTRYPOINT
 
                            /* In the parent: restore %ebx
                             * In the child:  move "fn" into %ebx
@@ -1805,7 +2118,7 @@ struct kernel_io_event {
                             */
                            "movl   %%eax,%%ebx\n"
                            "movl   $1,%%eax\n"
-                           "int    $0x80\n"
+                           LSS_ENTRYPOINT
 
                            /* Return to parent.
                             */
@@ -1818,32 +2131,6 @@ struct kernel_io_event {
       LSS_RETURN(int, __res);
     }
 
-    #define __NR__fadvise64_64 __NR_fadvise64_64
-    LSS_INLINE _syscall6(int, _fadvise64_64, int, fd,
-                         unsigned, offset_lo, unsigned, offset_hi,
-                         unsigned, len_lo, unsigned, len_hi,
-                         int, advice)
-
-    LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset,
-                                       loff_t len, int advice) {
-      return LSS_NAME(_fadvise64_64)(fd,
-                                     (unsigned)offset, (unsigned)(offset >>32),
-                                     (unsigned)len, (unsigned)(len >> 32),
-                                     advice);
-    }
-
-    #define __NR__fallocate __NR_fallocate
-    LSS_INLINE _syscall6(int, _fallocate, int, fd,
-                         int, mode,
-                         unsigned, offset_lo, unsigned, offset_hi,
-                         unsigned, len_lo, unsigned, len_hi)
-
-    LSS_INLINE int LSS_NAME(fallocate)(int fd, int mode,
-                                       loff_t offset, loff_t len) {
-      union { loff_t off; unsigned w[2]; } o = { offset }, l = { len };
-      return LSS_NAME(_fallocate)(fd, mode, o.w[0], o.w[1], l.w[0], l.w[1]);
-    }
-
     LSS_INLINE _syscall1(int, set_thread_area, void *, u)
     LSS_INLINE _syscall1(int, get_thread_area, void *, u)
 
@@ -1858,7 +2145,7 @@ struct kernel_io_event {
       __asm__ __volatile__("call   2f\n"
                          "0:.align 16\n"
                          "1:movl   %1,%%eax\n"
-                           "int    $0x80\n"
+                           LSS_ENTRYPOINT
                          "2:popl   %0\n"
                            "addl   $(1b-0b),%0\n"
                            : "=a" (res)
@@ -1877,7 +2164,7 @@ struct kernel_io_event {
                          "0:.align 16\n"
                          "1:pop    %%eax\n"
                            "movl   %1,%%eax\n"
-                           "int    $0x80\n"
+                           LSS_ENTRYPOINT
                          "2:popl   %0\n"
                            "addl   $(1b-0b),%0\n"
                            : "=a" (res)
@@ -1891,74 +2178,171 @@ struct kernel_io_event {
      * location (e.g. when using the clone() system call with the CLONE_VM
      * option).
      */
+    #undef LSS_ENTRYPOINT
+    #ifdef SYS_SYSCALL_ENTRYPOINT
+    static inline void (**LSS_NAME(get_syscall_entrypoint)(void))(void) {
+      void (**entrypoint)(void);
+      asm volatile(".bss\n"
+                   ".align 8\n"
+                   ".globl " SYS_SYSCALL_ENTRYPOINT "\n"
+                   ".common " SYS_SYSCALL_ENTRYPOINT ",8,8\n"
+                   ".previous\n"
+                   "mov " SYS_SYSCALL_ENTRYPOINT "@GOTPCREL(%%rip), %0\n"
+                   : "=r"(entrypoint));
+      return entrypoint;
+    }
+
+    #define LSS_ENTRYPOINT                                                    \
+              ".bss\n"                                                        \
+              ".align 8\n"                                                    \
+              ".globl " SYS_SYSCALL_ENTRYPOINT "\n"                           \
+              ".common " SYS_SYSCALL_ENTRYPOINT ",8,8\n"                      \
+              ".previous\n"                                                   \
+              "mov " SYS_SYSCALL_ENTRYPOINT "@GOTPCREL(%%rip), %%rcx\n"       \
+              "mov  0(%%rcx), %%rcx\n"                                        \
+              "test %%rcx, %%rcx\n"                                           \
+              "jz   10001f\n"                                                 \
+              "call *%%rcx\n"                                                 \
+              "jmp  10002f\n"                                                 \
+        "10001:syscall\n"                                                     \
+        "10002:\n"
+
+    #else
+    #define LSS_ENTRYPOINT "syscall\n"
+    #endif
+
+    /* The x32 ABI has 32 bit longs, but the syscall interface is 64 bit.
+     * We need to explicitly cast to an unsigned 64 bit type to avoid implicit
+     * sign extension.  We can't cast pointers directly because those are
+     * 32 bits, and gcc will dump ugly warnings about casting from a pointer
+     * to an integer of a different size.
+     */
+    #undef  LSS_SYSCALL_ARG
+    #define LSS_SYSCALL_ARG(a) ((uint64_t)(uintptr_t)(a))
+    #undef  _LSS_RETURN
+    #define _LSS_RETURN(type, res, cast)                                      \
+      do {                                                                    \
+        if ((uint64_t)(res) >= (uint64_t)(-4095)) {                           \
+          LSS_ERRNO = -(res);                                                 \
+          res = -1;                                                           \
+        }                                                                     \
+        return (type)(cast)(res);                                             \
+      } while (0)
+    #undef  LSS_RETURN
+    #define LSS_RETURN(type, res) _LSS_RETURN(type, res, uintptr_t)
+
+    #undef  _LSS_BODY
+    #define _LSS_BODY(nr, type, name, cast, ...)                              \
+          long long __res;                                                    \
+          __asm__ __volatile__(LSS_BODY_ASM##nr LSS_ENTRYPOINT                \
+            : "=a" (__res)                                                    \
+            : "0" (__NR_##name) LSS_BODY_ARG##nr(__VA_ARGS__)                 \
+            : LSS_BODY_CLOBBER##nr "r11", "rcx", "memory");                   \
+          _LSS_RETURN(type, __res, cast)
     #undef  LSS_BODY
-    #define LSS_BODY(type,name, ...)                                          \
-          long __res;                                                         \
-          __asm__ __volatile__("syscall" : "=a" (__res) : "0" (__NR_##name),  \
-            ##__VA_ARGS__ : "r11", "rcx", "memory");                          \
-          LSS_RETURN(type, __res)
+    #define LSS_BODY(nr, type, name, args...) \
+      _LSS_BODY(nr, type, name, uintptr_t, ## args)
+
+    #undef  LSS_BODY_ASM0
+    #undef  LSS_BODY_ASM1
+    #undef  LSS_BODY_ASM2
+    #undef  LSS_BODY_ASM3
+    #undef  LSS_BODY_ASM4
+    #undef  LSS_BODY_ASM5
+    #undef  LSS_BODY_ASM6
+    #define LSS_BODY_ASM0
+    #define LSS_BODY_ASM1 LSS_BODY_ASM0
+    #define LSS_BODY_ASM2 LSS_BODY_ASM1
+    #define LSS_BODY_ASM3 LSS_BODY_ASM2
+    #define LSS_BODY_ASM4 LSS_BODY_ASM3 "movq %5,%%r10;"
+    #define LSS_BODY_ASM5 LSS_BODY_ASM4 "movq %6,%%r8;"
+    #define LSS_BODY_ASM6 LSS_BODY_ASM5 "movq %7,%%r9;"
+
+    #undef  LSS_BODY_CLOBBER0
+    #undef  LSS_BODY_CLOBBER1
+    #undef  LSS_BODY_CLOBBER2
+    #undef  LSS_BODY_CLOBBER3
+    #undef  LSS_BODY_CLOBBER4
+    #undef  LSS_BODY_CLOBBER5
+    #undef  LSS_BODY_CLOBBER6
+    #define LSS_BODY_CLOBBER0
+    #define LSS_BODY_CLOBBER1 LSS_BODY_CLOBBER0
+    #define LSS_BODY_CLOBBER2 LSS_BODY_CLOBBER1
+    #define LSS_BODY_CLOBBER3 LSS_BODY_CLOBBER2
+    #define LSS_BODY_CLOBBER4 LSS_BODY_CLOBBER3 "r10",
+    #define LSS_BODY_CLOBBER5 LSS_BODY_CLOBBER4 "r8",
+    #define LSS_BODY_CLOBBER6 LSS_BODY_CLOBBER5 "r9",
+
+    #undef  LSS_BODY_ARG0
+    #undef  LSS_BODY_ARG1
+    #undef  LSS_BODY_ARG2
+    #undef  LSS_BODY_ARG3
+    #undef  LSS_BODY_ARG4
+    #undef  LSS_BODY_ARG5
+    #undef  LSS_BODY_ARG6
+    #define LSS_BODY_ARG0()
+    #define LSS_BODY_ARG1(arg1) \
+      LSS_BODY_ARG0(), "D" (arg1)
+    #define LSS_BODY_ARG2(arg1, arg2) \
+      LSS_BODY_ARG1(arg1), "S" (arg2)
+    #define LSS_BODY_ARG3(arg1, arg2, arg3) \
+      LSS_BODY_ARG2(arg1, arg2), "d" (arg3)
+    #define LSS_BODY_ARG4(arg1, arg2, arg3, arg4) \
+      LSS_BODY_ARG3(arg1, arg2, arg3), "r" (arg4)
+    #define LSS_BODY_ARG5(arg1, arg2, arg3, arg4, arg5) \
+      LSS_BODY_ARG4(arg1, arg2, arg3, arg4), "r" (arg5)
+    #define LSS_BODY_ARG6(arg1, arg2, arg3, arg4, arg5, arg6) \
+      LSS_BODY_ARG5(arg1, arg2, arg3, arg4, arg5), "r" (arg6)
+
     #undef _syscall0
     #define _syscall0(type,name)                                              \
-      type LSS_NAME(name)() {                                                 \
-        LSS_BODY(type, name);                                                 \
+      type LSS_NAME(name)(void) {                                             \
+        LSS_BODY(0, type, name);                                              \
       }
     #undef _syscall1
     #define _syscall1(type,name,type1,arg1)                                   \
       type LSS_NAME(name)(type1 arg1) {                                       \
-        LSS_BODY(type, name, "D" ((long)(arg1)));                             \
+        LSS_BODY(1, type, name, LSS_SYSCALL_ARG(arg1));                       \
       }
     #undef _syscall2
     #define _syscall2(type,name,type1,arg1,type2,arg2)                        \
       type LSS_NAME(name)(type1 arg1, type2 arg2) {                           \
-        LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2)));         \
+        LSS_BODY(2, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2));\
       }
     #undef _syscall3
     #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)             \
       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {               \
-        LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2)),          \
-                             "d" ((long)(arg3)));                             \
+        LSS_BODY(3, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \
+                                LSS_SYSCALL_ARG(arg3));                       \
       }
     #undef _syscall4
     #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
-          long __res;                                                         \
-          __asm__ __volatile__("movq %5,%%r10; syscall" :                     \
-            "=a" (__res) : "0" (__NR_##name),                                 \
-            "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)),       \
-            "r" ((long)(arg4)) : "r10", "r11", "rcx", "memory");              \
-          LSS_RETURN(type, __res);                                            \
+        LSS_BODY(4, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \
+                                LSS_SYSCALL_ARG(arg3), LSS_SYSCALL_ARG(arg4));\
       }
     #undef _syscall5
     #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
                       type5,arg5)                                             \
       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
                           type5 arg5) {                                       \
-          long __res;                                                         \
-          __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; syscall" :       \
-            "=a" (__res) : "0" (__NR_##name),                                 \
-            "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)),       \
-            "r" ((long)(arg4)), "r" ((long)(arg5)) :                          \
-            "r8", "r10", "r11", "rcx", "memory");                             \
-          LSS_RETURN(type, __res);                                            \
+        LSS_BODY(5, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \
+                                LSS_SYSCALL_ARG(arg3), LSS_SYSCALL_ARG(arg4), \
+                                LSS_SYSCALL_ARG(arg5));                       \
       }
     #undef _syscall6
     #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
                       type5,arg5,type6,arg6)                                  \
       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
                           type5 arg5, type6 arg6) {                           \
-          long __res;                                                         \
-          __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; movq %7,%%r9;"   \
-                               "syscall" :                                    \
-            "=a" (__res) : "0" (__NR_##name),                                 \
-            "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)),       \
-            "r" ((long)(arg4)), "r" ((long)(arg5)), "r" ((long)(arg6)) :      \
-            "r8", "r9", "r10", "r11", "rcx", "memory");                       \
-          LSS_RETURN(type, __res);                                            \
+        LSS_BODY(6, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \
+                                LSS_SYSCALL_ARG(arg3), LSS_SYSCALL_ARG(arg4), \
+                                LSS_SYSCALL_ARG(arg5), LSS_SYSCALL_ARG(arg6));\
       }
     LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
                                    int flags, void *arg, int *parent_tidptr,
                                    void *newtls, int *child_tidptr) {
-      long __res;
+      long long __res;
       {
         __asm__ __volatile__(/* if (fn == NULL)
                               *   return -EINVAL;
@@ -1972,10 +2356,8 @@ struct kernel_io_event {
                              "testq  %5,%5\n"
                              "jz     1f\n"
 
-                             /* Set up alignment of the child stack:
-                              * child_stack = (child_stack & ~0xF) - 16;
+                             /* child_stack -= 2*sizeof(void *);
                               */
-                             "andq   $-16,%5\n"
                              "subq   $16,%5\n"
 
                              /* Push "arg" and "fn" onto the stack that will be
@@ -1994,7 +2376,7 @@ struct kernel_io_event {
                              "movq   %2,%%rax\n"
                              "movq   %9,%%r8\n"
                              "movq   %10,%%r10\n"
-                             "syscall\n"
+                             LSS_ENTRYPOINT
 
                              /* if (%rax != 0)
                               *   return;
@@ -2016,22 +2398,25 @@ struct kernel_io_event {
                               */
                              "movq   %%rax,%%rdi\n"
                              "movq   %3,%%rax\n"
-                             "syscall\n"
+                             LSS_ENTRYPOINT
 
                              /* Return to parent.
                               */
                            "1:\n"
                              : "=a" (__res)
                              : "0"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit),
-                               "r"(fn), "S"(child_stack), "D"(flags), "r"(arg),
-                               "d"(parent_tidptr), "g"(newtls), "g"(child_tidptr)
-                             : "rsp", "memory", "r8", "r10", "r11", "rcx");
+                               "r"(LSS_SYSCALL_ARG(fn)),
+                               "S"(LSS_SYSCALL_ARG(child_stack)),
+                               "D"(LSS_SYSCALL_ARG(flags)),
+                               "r"(LSS_SYSCALL_ARG(arg)),
+                               "d"(LSS_SYSCALL_ARG(parent_tidptr)),
+                               "r"(LSS_SYSCALL_ARG(newtls)),
+                               "r"(LSS_SYSCALL_ARG(child_tidptr))
+                             : "memory", "r8", "r10", "r11", "rcx");
       }
       LSS_RETURN(int, __res);
     }
     LSS_INLINE _syscall2(int, arch_prctl, int, c, void *, a)
-    LSS_INLINE _syscall4(int, fadvise64, int, fd, loff_t, offset, loff_t, len,
-                         int,  advice)
 
     LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) {
       /* On x86-64, the kernel does not know how to return from
@@ -2040,175 +2425,111 @@ struct kernel_io_event {
        * Unfortunately, we cannot just reference the glibc version of this
        * function, as glibc goes out of its way to make it inaccessible.
        */
-      void (*res)(void);
-      __asm__ __volatile__("call   2f\n"
-                         "0:.align 16\n"
+      long long res;
+      __asm__ __volatile__("jmp    2f\n"
+                           ".align 16\n"
                          "1:movq   %1,%%rax\n"
-                           "syscall\n"
-                         "2:popq   %0\n"
-                           "addq   $(1b-0b),%0\n"
-                           : "=a" (res)
+                           LSS_ENTRYPOINT
+                         "2:leaq   1b(%%rip),%0\n"
+                           : "=r" (res)
                            : "i"  (__NR_rt_sigreturn));
-      return res;
+      return (void (*)(void))(uintptr_t)res;
     }
-  #elif defined(__arm__)
+  #elif defined(__ARM_ARCH_3__)
     /* Most definitions of _syscallX() neglect to mark "memory" as being
      * clobbered. This causes problems with compilers, that do a better job
      * at optimizing across __asm__ calls.
-     * So, we just have to redefine all fo the _syscallX() macros.
+     * So, we just have to redefine all of the _syscallX() macros.
      */
     #undef LSS_REG
     #define LSS_REG(r,a) register long __r##r __asm__("r"#r) = (long)a
-
-    /* r0..r3 are scratch registers and not preserved across function
-     * calls.  We need to first evaluate the first 4 syscall arguments
-     * and store them on stack.  They must be loaded into r0..r3 after
-     * all function calls to avoid r0..r3 being clobbered.
-     */
-    #undef LSS_SAVE_ARG
-    #define LSS_SAVE_ARG(r,a) long __tmp##r = (long)a
-    #undef LSS_LOAD_ARG
-    #define LSS_LOAD_ARG(r) register long __r##r __asm__("r"#r) = __tmp##r
-
     #undef  LSS_BODY
-    #define LSS_BODY(type, name, args...)                                     \
+    #define LSS_BODY(type,name,args...)                                       \
           register long __res_r0 __asm__("r0");                               \
           long __res;                                                         \
-          __SYS_REG(name)                                                     \
-          __asm__ __volatile__ (__syscall_safe(name)                          \
-                                : "=r"(__res_r0)                              \
-                                : __SYS_REG_LIST(args)                        \
-                                : "lr", "memory");                            \
+          __asm__ __volatile__ (__syscall(name)                               \
+                                : "=r"(__res_r0) : args : "lr", "memory");    \
           __res = __res_r0;                                                   \
           LSS_RETURN(type, __res)
     #undef _syscall0
     #define _syscall0(type, name)                                             \
-      type LSS_NAME(name)() {                                                 \
+      type LSS_NAME(name)(void) {                                             \
         LSS_BODY(type, name);                                                 \
       }
     #undef _syscall1
     #define _syscall1(type, name, type1, arg1)                                \
       type LSS_NAME(name)(type1 arg1) {                                       \
-        /* There is no need for using a volatile temp.  */                    \
-        LSS_REG(0, arg1);                                                     \
-        LSS_BODY(type, name, "r"(__r0));                                      \
+        LSS_REG(0, arg1); LSS_BODY(type, name, "r"(__r0));                    \
       }
     #undef _syscall2
     #define _syscall2(type, name, type1, arg1, type2, arg2)                   \
       type LSS_NAME(name)(type1 arg1, type2 arg2) {                           \
-        LSS_SAVE_ARG(0, arg1);                                                \
-        LSS_SAVE_ARG(1, arg2);                                                \
-        LSS_LOAD_ARG(0);                                                      \
-        LSS_LOAD_ARG(1);                                                      \
+        LSS_REG(0, arg1); LSS_REG(1, arg2);                                   \
         LSS_BODY(type, name, "r"(__r0), "r"(__r1));                           \
       }
     #undef _syscall3
     #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3)      \
       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {               \
-        LSS_SAVE_ARG(0, arg1);                                                \
-        LSS_SAVE_ARG(1, arg2);                                                \
-        LSS_SAVE_ARG(2, arg3);                                                \
-        LSS_LOAD_ARG(0);                                                      \
-        LSS_LOAD_ARG(1);                                                      \
-        LSS_LOAD_ARG(2);                                                      \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
         LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2));                \
       }
     #undef _syscall4
-    #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3,      \
-                      type4, arg4)                                            \
+    #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
-        LSS_SAVE_ARG(0, arg1);                                                \
-        LSS_SAVE_ARG(1, arg2);                                                \
-        LSS_SAVE_ARG(2, arg3);                                                \
-        LSS_SAVE_ARG(3, arg4);                                                \
-        LSS_LOAD_ARG(0);                                                      \
-        LSS_LOAD_ARG(1);                                                      \
-        LSS_LOAD_ARG(2);                                                      \
-        LSS_LOAD_ARG(3);                                                      \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4);                                                     \
         LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3));     \
       }
     #undef _syscall5
-    #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3,      \
-                      type4, arg4, type5, arg5)                               \
+    #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5)                                             \
       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
                           type5 arg5) {                                       \
-        LSS_SAVE_ARG(0, arg1);                                                \
-        LSS_SAVE_ARG(1, arg2);                                                \
-        LSS_SAVE_ARG(2, arg3);                                                \
-        LSS_SAVE_ARG(3, arg4);                                                \
-        LSS_REG(4, arg5);                                                     \
-        LSS_LOAD_ARG(0);                                                      \
-        LSS_LOAD_ARG(1);                                                      \
-        LSS_LOAD_ARG(2);                                                      \
-        LSS_LOAD_ARG(3);                                                      \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4); LSS_REG(4, arg5);                                   \
         LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3),      \
                              "r"(__r4));                                      \
       }
     #undef _syscall6
-    #define _syscall6(type, name, type1, arg1, type2, arg2, type3, arg3,      \
-                      type4, arg4, type5, arg5, type6, arg6)                  \
+    #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5,type6,arg6)                                  \
       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
                           type5 arg5, type6 arg6) {                           \
-        LSS_SAVE_ARG(0, arg1);                                                \
-        LSS_SAVE_ARG(1, arg2);                                                \
-        LSS_SAVE_ARG(2, arg3);                                                \
-        LSS_SAVE_ARG(3, arg4);                                                \
-        LSS_REG(4, arg5);                                                     \
-        LSS_REG(5, arg6);                                                     \
-        LSS_LOAD_ARG(0);                                                      \
-        LSS_LOAD_ARG(1);                                                      \
-        LSS_LOAD_ARG(2);                                                      \
-        LSS_LOAD_ARG(3);                                                      \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4); LSS_REG(4, arg5); LSS_REG(5, arg6);                 \
         LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3),      \
                              "r"(__r4), "r"(__r5));                           \
       }
     LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
                                    int flags, void *arg, int *parent_tidptr,
                                    void *newtls, int *child_tidptr) {
-      register long __res __asm__("r5");
+      long __res;
       {
-        if (fn == NULL || child_stack == NULL) {
-            __res = -EINVAL;
-            goto clone_exit;
-        }
-
-        /* stash first 4 arguments on stack first because we can only load
-         * them after all function calls.
-         */
-        int    tmp_flags = flags;
-        int  * tmp_stack = (int*) child_stack;
-        void * tmp_ptid  = parent_tidptr;
-        void * tmp_tls   = newtls;
-
+        register int   __flags __asm__("r0") = flags;
+        register void *__stack __asm__("r1") = child_stack;
+        register void *__ptid  __asm__("r2") = parent_tidptr;
+        register void *__tls   __asm__("r3") = newtls;
         register int  *__ctid  __asm__("r4") = child_tidptr;
+        __asm__ __volatile__(/* if (fn == NULL || child_stack == NULL)
+                              *   return -EINVAL;
+                              */
+                             "cmp   %2,#0\n"
+                             "cmpne %3,#0\n"
+                             "moveq %0,%1\n"
+                             "beq   1f\n"
 
-        /* Push "arg" and "fn" onto the stack that will be
-         * used by the child.
-         */
-        *(--tmp_stack) = (int) arg;
-        *(--tmp_stack) = (int) fn;
-
-        /* We must load r0..r3 last after all possible function calls.  */
-        register int   __flags __asm__("r0") = tmp_flags;
-        register void *__stack __asm__("r1") = tmp_stack;
-        register void *__ptid  __asm__("r2") = tmp_ptid;
-        register void *__tls   __asm__("r3") = tmp_tls;
-
-        /* %r0 = syscall(%r0 = flags,
-         *               %r1 = child_stack,
-         *               %r2 = parent_tidptr,
-         *               %r3 = newtls,
-         *               %r4 = child_tidptr)
-         */
-        __SYS_REG(clone)
-        __asm__ __volatile__(/* %r0 = syscall(%r0 = flags,
+                             /* Push "arg" and "fn" onto the stack that will be
+                              * used by the child.
+                              */
+                             "str   %5,[%3,#-4]!\n"
+                             "str   %2,[%3,#-4]!\n"
+
+                             /* %r0 = syscall(%r0 = flags,
                               *               %r1 = child_stack,
                               *               %r2 = parent_tidptr,
                               *               %r3 = newtls,
                               *               %r4 = child_tidptr)
                               */
-                             "push  {r7}\n"
-                             "mov   r7,%1\n"
                              __syscall(clone)"\n"
 
                              /* if (%r0 != 0)
@@ -2223,24 +2544,275 @@ struct kernel_io_event {
                              "mov   lr,pc\n"
                              "ldr   pc,[sp]\n"
 
-                             /* Call _exit(%r0), which never returns.  We only
-                              * need to set r7 for EABI syscall ABI but we do
-                              * this always to simplify code sharing between
-                              * old and new syscall ABIs.
+                             /* Call _exit(%r0).
+                              */
+                             __syscall(exit)"\n"
+                           "1:\n"
+                             : "=r" (__res)
+                             : "i"(-EINVAL),
+                               "r"(fn), "r"(__stack), "r"(__flags), "r"(arg),
+                               "r"(__ptid), "r"(__tls), "r"(__ctid)
+                             : "cc", "lr", "memory");
+      }
+      LSS_RETURN(int, __res);
+    }
+  #elif defined(__ARM_EABI__)
+    /* Most definitions of _syscallX() neglect to mark "memory" as being
+     * clobbered. This causes problems with compilers, that do a better job
+     * at optimizing across __asm__ calls.
+     * So, we just have to redefine all of the _syscallX() macros.
+     */
+    #undef LSS_REG
+    #define LSS_REG(r,a) register long __r##r __asm__("r"#r) = (long)a
+    #undef  LSS_BODY
+    #define LSS_BODY(type,name,args...)                                       \
+          register long __res_r0 __asm__("r0");                               \
+          long __res;                                                         \
+          __asm__ __volatile__ ("push {r7}\n"                                 \
+                                "mov r7, %1\n"                                \
+                                "swi 0x0\n"                                   \
+                                "pop {r7}\n"                                  \
+                                : "=r"(__res_r0)                              \
+                                : "i"(__NR_##name) , ## args                  \
+                                : "lr", "memory");                            \
+          __res = __res_r0;                                                   \
+          LSS_RETURN(type, __res)
+    #undef _syscall0
+    #define _syscall0(type, name)                                             \
+      type LSS_NAME(name)(void) {                                             \
+        LSS_BODY(type, name);                                                 \
+      }
+    #undef _syscall1
+    #define _syscall1(type, name, type1, arg1)                                \
+      type LSS_NAME(name)(type1 arg1) {                                       \
+        LSS_REG(0, arg1); LSS_BODY(type, name, "r"(__r0));                    \
+      }
+    #undef _syscall2
+    #define _syscall2(type, name, type1, arg1, type2, arg2)                   \
+      type LSS_NAME(name)(type1 arg1, type2 arg2) {                           \
+        LSS_REG(0, arg1); LSS_REG(1, arg2);                                   \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1));                           \
+      }
+    #undef _syscall3
+    #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3)      \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {               \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2));                \
+      }
+    #undef _syscall4
+    #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4);                                                     \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3));     \
+      }
+    #undef _syscall5
+    #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5)                                             \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5) {                                       \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4); LSS_REG(4, arg5);                                   \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3),      \
+                             "r"(__r4));                                      \
+      }
+    #undef _syscall6
+    #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5,type6,arg6)                                  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5, type6 arg6) {                           \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4); LSS_REG(4, arg5); LSS_REG(5, arg6);                 \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3),      \
+                             "r"(__r4), "r"(__r5));                           \
+      }
+    LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+                                   int flags, void *arg, int *parent_tidptr,
+                                   void *newtls, int *child_tidptr) {
+      long __res;
+      if (fn == NULL || child_stack == NULL) {
+        __res = -EINVAL;
+        LSS_RETURN(int, __res);
+      }
+
+      /* Push "arg" and "fn" onto the stack that will be
+       * used by the child.
+       */
+      {
+        uintptr_t* cstack = (uintptr_t*)child_stack - 2;
+        cstack[0] = (uintptr_t)fn;
+        cstack[1] = (uintptr_t)arg;
+        child_stack = cstack;
+      }
+      {
+        register int   __flags __asm__("r0") = flags;
+        register void *__stack __asm__("r1") = child_stack;
+        register void *__ptid  __asm__("r2") = parent_tidptr;
+        register void *__tls   __asm__("r3") = newtls;
+        register int  *__ctid  __asm__("r4") = child_tidptr;
+        __asm__ __volatile__(
+#ifdef __thumb2__
+            "push {r7}\n"
+#endif
+            /* %r0 = syscall(%r0 = flags,
+             *               %r1 = child_stack,
+             *               %r2 = parent_tidptr,
+             *               %r3 = newtls,
+             *               %r4 = child_tidptr)
+             */
+            "mov r7, %6\n"
+            "swi 0x0\n"
+
+            /* if (%r0 != 0)
+             *   return %r0;
+             */
+            "cmp   r0, #0\n"
+            "bne   1f\n"
+
+            /* In the child, now. Call "fn(arg)".
+             */
+            "ldr   r0,[sp, #4]\n"
+
+            "ldr   lr,[sp]\n"
+            "blx   lr\n"
+
+            /* Call _exit(%r0).
+             */
+            "mov r7, %7\n"
+            "swi 0x0\n"
+            /* Unreachable */
+            "bkpt #0\n"
+         "1:\n"
+#ifdef __thumb2__
+            "pop {r7}\n"
+#endif
+            "movs  %0,r0\n"
+            : "=r"(__res)
+            : "r"(__stack), "r"(__flags), "r"(__ptid), "r"(__tls), "r"(__ctid),
+              "i"(__NR_clone), "i"(__NR_exit)
+            : "cc", "lr", "memory"
+#ifndef __thumb2__
+            , "r7"
+#endif
+            );
+      }
+      LSS_RETURN(int, __res);
+    }
+  #elif defined(__aarch64__)
+    /* Most definitions of _syscallX() neglect to mark "memory" as being
+     * clobbered. This causes problems with compilers, that do a better job
+     * at optimizing across __asm__ calls.
+     * So, we just have to redefine all of the _syscallX() macros.
+     */
+    #undef LSS_REG
+    #define LSS_REG(r,a) register int64_t __r##r __asm__("x"#r) = (int64_t)a
+    #undef  LSS_BODY
+    #define LSS_BODY(type,name,args...)                                       \
+          register int64_t __res_x0 __asm__("x0");                            \
+          int64_t __res;                                                      \
+          __asm__ __volatile__ ("mov x8, %1\n"                                \
+                                "svc 0x0\n"                                   \
+                                : "=r"(__res_x0)                              \
+                                : "i"(__NR_##name) , ## args                  \
+                                : "x8", "memory");                            \
+          __res = __res_x0;                                                   \
+          LSS_RETURN(type, __res)
+    #undef _syscall0
+    #define _syscall0(type, name)                                             \
+      type LSS_NAME(name)(void) {                                             \
+        LSS_BODY(type, name);                                                 \
+      }
+    #undef _syscall1
+    #define _syscall1(type, name, type1, arg1)                                \
+      type LSS_NAME(name)(type1 arg1) {                                       \
+        LSS_REG(0, arg1); LSS_BODY(type, name, "r"(__r0));                    \
+      }
+    #undef _syscall2
+    #define _syscall2(type, name, type1, arg1, type2, arg2)                   \
+      type LSS_NAME(name)(type1 arg1, type2 arg2) {                           \
+        LSS_REG(0, arg1); LSS_REG(1, arg2);                                   \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1));                           \
+      }
+    #undef _syscall3
+    #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3)      \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {               \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2));                \
+      }
+    #undef _syscall4
+    #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) {   \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4);                                                     \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3));     \
+      }
+    #undef _syscall5
+    #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5)                                             \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5) {                                       \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4); LSS_REG(4, arg5);                                   \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3),      \
+                             "r"(__r4));                                      \
+      }
+    #undef _syscall6
+    #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,  \
+                      type5,arg5,type6,arg6)                                  \
+      type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4,     \
+                          type5 arg5, type6 arg6) {                           \
+        LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3);                 \
+        LSS_REG(3, arg4); LSS_REG(4, arg5); LSS_REG(5, arg6);                 \
+        LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3),      \
+                             "r"(__r4), "r"(__r5));                           \
+      }
+
+    LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+                                   int flags, void *arg, int *parent_tidptr,
+                                   void *newtls, int *child_tidptr) {
+      int64_t __res;
+      {
+        register uint64_t __flags __asm__("x0") = flags;
+        register void *__stack __asm__("x1") = child_stack;
+        register void *__ptid  __asm__("x2") = parent_tidptr;
+        register void *__tls   __asm__("x3") = newtls;
+        register int  *__ctid  __asm__("x4") = child_tidptr;
+        __asm__ __volatile__(/* Push "arg" and "fn" onto the stack that will be
+                              * used by the child.
+                              */
+                             "stp     %1, %4, [%2, #-16]!\n"
+
+                             /* %x0 = syscall(%x0 = flags,
+                              *               %x1 = child_stack,
+                              *               %x2 = parent_tidptr,
+                              *               %x3 = newtls,
+                              *               %x4 = child_tidptr)
+                              */
+                             "mov     x8, %8\n"
+                             "svc     0x0\n"
+
+                             /* if (%x0 != 0)
+                              *   return %x0;
+                              */
+                             "mov     %0, x0\n"
+                             "cbnz    x0, 1f\n"
+
+                             /* In the child, now. Call "fn(arg)".
                               */
-                             "mov   r7,%2\n"
-                             __syscall(exit)"\n"
+                             "ldp     x1, x0, [sp], #16\n"
+                             "blr     x1\n"
 
-                             /* Pop r7 from the stack only in the parent.
+                             /* Call _exit(%x0).
                               */
-                           "1: pop {r7}\n"
+                             "mov     x8, %9\n"
+                             "svc     0x0\n"
+                           "1:\n"
                              : "=r" (__res)
-                             : "r"(__sysreg),
-                               "i"(__NR_exit), "r"(__stack), "r"(__flags),
-                               "r"(__ptid), "r"(__tls), "r"(__ctid)
-                             : "cc", "lr", "memory");
+                             : "r"(fn), "r"(__stack), "r"(__flags), "r"(arg),
+                               "r"(__ptid), "r"(__tls), "r"(__ctid),
+                               "i"(__NR_clone), "i"(__NR_exit)
+                             : "cc", "x8", "memory");
       }
-      clone_exit:
       LSS_RETURN(int, __res);
     }
   #elif defined(__mips__)
@@ -2248,17 +2820,26 @@ struct kernel_io_event {
     #define LSS_REG(r,a) register unsigned long __r##r __asm__("$"#r) =       \
                                  (unsigned long)(a)
     #undef  LSS_BODY
+    #undef LSS_SYSCALL_CLOBBERS
+    #if _MIPS_SIM == _MIPS_SIM_ABI32
+    #define LSS_SYSCALL_CLOBBERS "$1", "$3", "$8", "$9", "$10",               \
+                                 "$11", "$12", "$13", "$14", "$15",           \
+                                 "$24", "$25", "hi", "lo", "memory"
+    #else
+    #define LSS_SYSCALL_CLOBBERS "$1", "$3", "$10", "$11", "$12",             \
+                                 "$13", "$14", "$15", "$24", "$25",           \
+                                 "hi", "lo", "memory"
+    #endif
     #define LSS_BODY(type,name,r7,...)                                        \
           register unsigned long __v0 __asm__("$2") = __NR_##name;            \
           __asm__ __volatile__ ("syscall\n"                                   \
-                                : "=&r"(__v0), r7 (__r7)                      \
+                                : "=r"(__v0), r7 (__r7)                       \
                                 : "0"(__v0), ##__VA_ARGS__                    \
-                                : "$8", "$9", "$10", "$11", "$12",            \
-                                  "$13", "$14", "$15", "$24", "memory");      \
+                                : LSS_SYSCALL_CLOBBERS);                      \
           LSS_RETURN(type, __v0, __r7)
     #undef _syscall0
     #define _syscall0(type, name)                                             \
-      type LSS_NAME(name)() {                                                 \
+      type LSS_NAME(name)(void) {                                             \
         register unsigned long __r7 __asm__("$7");                            \
         LSS_BODY(type, name, "=r");                                           \
       }
@@ -2300,20 +2881,19 @@ struct kernel_io_event {
                           type5 arg5) {                                       \
         LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3);                 \
         LSS_REG(7, arg4);                                                     \
-        register unsigned long __v0 __asm__("$2");                            \
+        register unsigned long __v0 __asm__("$2") = __NR_##name;              \
         __asm__ __volatile__ (".set noreorder\n"                              \
-                              "lw    $2, %6\n"                                \
                               "subu  $29, 32\n"                               \
-                              "sw    $2, 16($29)\n"                           \
-                              "li    $2, %2\n"                                \
+                              "sw    %5, 16($29)\n"                           \
                               "syscall\n"                                     \
                               "addiu $29, 32\n"                               \
                               ".set reorder\n"                                \
-                              : "=&r"(__v0), "+r" (__r7)                      \
-                              : "i" (__NR_##name), "r"(__r4), "r"(__r5),      \
-                                "r"(__r6), "m" ((unsigned long)arg5)          \
+                              : "+r"(__v0), "+r" (__r7)                       \
+                              : "r"(__r4), "r"(__r5),                         \
+                                "r"(__r6), "r" ((unsigned long)arg5)          \
                               : "$8", "$9", "$10", "$11", "$12",              \
-                                "$13", "$14", "$15", "$24", "memory");        \
+                                "$13", "$14", "$15", "$24", "$25",            \
+                                "memory");                                    \
         LSS_RETURN(type, __v0, __r7);                                         \
       }
     #else
@@ -2338,23 +2918,21 @@ struct kernel_io_event {
                           type5 arg5, type6 arg6) {                           \
         LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3);                 \
         LSS_REG(7, arg4);                                                     \
-        register unsigned long __v0 __asm__("$2");                            \
+        register unsigned long __v0 __asm__("$2") = __NR_##name;              \
         __asm__ __volatile__ (".set noreorder\n"                              \
-                              "lw    $2, %6\n"                                \
-                              "lw    $8, %7\n"                                \
                               "subu  $29, 32\n"                               \
-                              "sw    $2, 16($29)\n"                           \
-                              "sw    $8, 20($29)\n"                           \
-                              "li    $2, %2\n"                                \
+                              "sw    %5, 16($29)\n"                           \
+                              "sw    %6, 20($29)\n"                           \
                               "syscall\n"                                     \
                               "addiu $29, 32\n"                               \
                               ".set reorder\n"                                \
-                              : "=&r"(__v0), "+r" (__r7)                      \
-                              : "i" (__NR_##name), "r"(__r4), "r"(__r5),      \
+                              : "+r"(__v0), "+r" (__r7)                       \
+                              : "r"(__r4), "r"(__r5),                         \
                                 "r"(__r6), "r" ((unsigned long)arg5),         \
                                 "r" ((unsigned long)arg6)                     \
                               : "$8", "$9", "$10", "$11", "$12",              \
-                                "$13", "$14", "$15", "$24", "memory");        \
+                                "$13", "$14", "$15", "$24", "$25",            \
+                                "memory");                                    \
         LSS_RETURN(type, __v0, __r7);                                         \
       }
     #else
@@ -2371,7 +2949,7 @@ struct kernel_io_event {
     LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
                                    int flags, void *arg, int *parent_tidptr,
                                    void *newtls, int *child_tidptr) {
-      register unsigned long __v0 __asm__("$2");
+      register unsigned long __v0 __asm__("$2") = -EINVAL;
       register unsigned long __r7 __asm__("$7") = (unsigned long)newtls;
       {
         register int   __flags __asm__("$4") = flags;
@@ -2390,25 +2968,24 @@ struct kernel_io_event {
                              /* if (fn == NULL || child_stack == NULL)
                               *   return -EINVAL;
                               */
-                             "li    %0,%2\n"
+                             "beqz  %4,1f\n"
                              "beqz  %5,1f\n"
-                             "beqz  %6,1f\n"
 
                              /* Push "arg" and "fn" onto the stack that will be
                               * used by the child.
                               */
           #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
-                             "subu  %6,32\n"
-                             "sw    %5,0(%6)\n"
-                             "sw    %8,4(%6)\n"
+                             "subu  %5,32\n"
+                             "sw    %4,0(%5)\n"
+                             "sw    %7,4(%5)\n"
           #elif _MIPS_SIM == _MIPS_SIM_NABI32
-                             "sub   %6,32\n"
-                             "sw    %5,0(%6)\n"
-                             "sw    %8,8(%6)\n"
+                             "sub   %5,32\n"
+                             "sw    %4,0(%5)\n"
+                             "sw    %7,8(%5)\n"
           #else
-                             "dsubu %6,32\n"
-                             "sd    %5,0(%6)\n"
-                             "sd    %8,8(%6)\n"
+                             "dsubu %5,32\n"
+                             "sd    %4,0(%5)\n"
+                             "sd    %7,8(%5)\n"
           #endif
 
                              /* $7 = syscall($4 = flags,
@@ -2417,7 +2994,7 @@ struct kernel_io_event {
                               *              $7 = newtls,
                               *              $8 = child_tidptr)
                               */
-                             "li    $2,%3\n"
+                             "li    $2,%2\n"
                              "syscall\n"
 
                              /* if ($7 != 0)
@@ -2443,7 +3020,7 @@ struct kernel_io_event {
                              /* Call _exit($2)
                               */
                             "move  $4,$2\n"
-                            "li    $2,%4\n"
+                            "li    $2,%3\n"
                             "syscall\n"
 
                            "1:\n"
@@ -2454,12 +3031,12 @@ struct kernel_io_event {
           #else
                              "daddu $29,16\n"
           #endif
-                             : "=&r" (__v0), "=r" (__r7)
-                             : "i"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit),
-                               "r"(fn), "r"(__stack), "r"(__flags), "r"(arg),
-                               "r"(__ptid), "r"(__r7), "r"(__ctid)
+                             : "+r" (__v0), "+r" (__r7)
+                             : "i"(__NR_clone), "i"(__NR_exit), "r"(fn),
+                               "r"(__stack), "r"(__flags), "r"(arg),
+                               "r"(__ptid), "r"(__ctid)
                              : "$9", "$10", "$11", "$12", "$13", "$14", "$15",
-                               "$24", "memory");
+                               "$24", "$25", "memory");
       }
       LSS_RETURN(int, __v0, __r7);
     }
@@ -2572,19 +3149,8 @@ struct kernel_io_event {
                                                type5 arg5, type6 arg6) {      \
           LSS_BODY(6, type, name, arg1, arg2, arg3, arg4, arg5, arg6);        \
        }
-
-    #undef LSS_PPC_MINIMUM_FRAME_SIZE
-    #undef LSS_SIZE_S
-    #ifdef __PPC64__
-      #define LSS_PPC_MINIMUM_FRAME_SIZE 112
-      #define LSS_SIZE_S "d"
-    #else
-      #define LSS_PPC_MINIMUM_FRAME_SIZE 16
-      #define LSS_SIZE_S "w"
-    #endif
-
     /* clone function adapted from glibc 2.3.6 clone.S                       */
-    /* TODO(user): consider wrapping some args up in a struct, like we
+    /* TODO(csilvers): consider wrapping some args up in a struct, like we
      * do for i386's _syscall6, so we can compile successfully on gcc 2.95
      */
     LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
@@ -2603,17 +3169,17 @@ struct kernel_io_event {
             /* check for fn == NULL
              * and child_stack == NULL
              */
-            "cmp" LSS_SIZE_S "i cr0, %6, 0\n\t"
-            "cmp" LSS_SIZE_S "i cr1, %7, 0\n\t"
+            "cmpwi cr0, %6, 0\n\t"
+            "cmpwi cr1, %7, 0\n\t"
             "cror cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t"
             "beq- cr0, 1f\n\t"
 
             /* set up stack frame for child                                  */
-            "clrr" LSS_SIZE_S "i %7, %7, 4\n\t"
+            "clrrwi %7, %7, 4\n\t"
             "li 0, 0\n\t"
-            "st" LSS_SIZE_S "u 0, %13(%7)\n\t"
+            "stwu 0, -16(%7)\n\t"
 
-            /* fn, arg, child_stack are saved across the syscall: r27-29     */
+            /* fn, arg, child_stack are saved across the syscall: r27-29     */
             "mr 28, %6\n\t"
             "mr 29, %7\n\t"
             "mr 27, %9\n\t"
@@ -2629,30 +3195,14 @@ struct kernel_io_event {
             "sc\n\t"
 
             /* Test if syscall was successful                                */
-            "cmp" LSS_SIZE_S "i cr1, 3, 0\n\t"
+            "cmpwi cr1, 3, 0\n\t"
             "crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t"
             "bne- cr1, 1f\n\t"
 
-            /* Do the function call.  On PowerPC64, a function pointer points
-             * a function descriptor instead of the first instruction.  We need
-             * to load the callee's entry point and TOC from the descriptor.
-             * Since the callee may have a differet TOC, we also need to
-             * save and restore caller's TOC around the call.
-             */
-
-    #ifdef __PPC64__
-            "std 2, 40(1)\n\t"  /* Save caller's TOC. */
-            "ld 4, 0(28)\n\t"   /* Get callee's entry address. */
-            "ld 2, 8(28)\n\t"   /* Load calee's TOC. */
-            "mtctr 4\n\t"
-            "mr 3, 27\n\t"
-            "bctrl\n\t"
-            "ld 2, 40(1)\n\t"  /* Restore caller's TOC after call. */
-    #else
+            /* Do the function call                                          */
             "mtctr 28\n\t"
             "mr 3, 27\n\t"
             "bctrl\n\t"
-    #endif
 
             /* Call _exit(r3)                                                */
             "li 0, %5\n\t"
@@ -2667,59 +3217,200 @@ struct kernel_io_event {
                 "i" (__NR_clone), "i" (__NR_exit),
                 "r" (__fn), "r" (__cstack), "r" (__flags),
                 "r" (__arg), "r" (__ptidptr), "r" (__newtls),
-                "r" (__ctidptr), "i"(-LSS_PPC_MINIMUM_FRAME_SIZE)
+                "r" (__ctidptr)
               : "cr0", "cr1", "memory", "ctr",
                 "r0", "r29", "r27", "r28");
       }
       LSS_RETURN(int, __ret, __err);
     }
-  #ifdef __PPC64__
-    LSS_INLINE _syscall4(int, fadvise64, int, fd, loff_t, offset, loff_t, len,
-                         int,  advice)
-  #else
-    /* fadvise64 wrapper not yet implemented for 32-bit PowerPC.  */
+  #elif defined(__s390__)
+    #undef  LSS_REG
+    #define LSS_REG(r, a) register unsigned long __r##r __asm__("r"#r) = (unsigned long) a
+    #undef  LSS_BODY
+    #define LSS_BODY(type, name, args...)                                     \
+        register unsigned long __nr __asm__("r1")                             \
+            = (unsigned long)(__NR_##name);                                   \
+        register long __res_r2 __asm__("r2");                                 \
+        long __res;                                                           \
+        __asm__ __volatile__                                                  \
+            ("svc 0\n\t"                                                      \
+             : "=d"(__res_r2)                                                 \
+             : "d"(__nr), ## args                                             \
+             : "memory");                                                     \
+        __res = __res_r2;                                                     \
+        LSS_RETURN(type, __res)
+    #undef _syscall0
+    #define _syscall0(type, name)                                             \
+       type LSS_NAME(name)(void) {                                            \
+          LSS_BODY(type, name);                                               \
+       }
+    #undef _syscall1
+    #define _syscall1(type, name, type1, arg1)                                \
+       type LSS_NAME(name)(type1 arg1) {                                      \
+          LSS_REG(2, arg1);                                                   \
+          LSS_BODY(type, name, "0"(__r2));                                    \
+       }
+    #undef _syscall2
+    #define _syscall2(type, name, type1, arg1, type2, arg2)                   \
+       type LSS_NAME(name)(type1 arg1, type2 arg2) {                          \
+          LSS_REG(2, arg1); LSS_REG(3, arg2);                                 \
+          LSS_BODY(type, name, "0"(__r2), "d"(__r3));                         \
+       }
+    #undef _syscall3
+    #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3)      \
+       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) {              \
+          LSS_REG(2, arg1); LSS_REG(3, arg2); LSS_REG(4, arg3);               \
+          LSS_BODY(type, name, "0"(__r2), "d"(__r3), "d"(__r4));              \
+       }
+    #undef _syscall4
+    #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3,      \
+                                  type4, arg4)                                \
+       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3,                \
+                           type4 arg4) {                                      \
+          LSS_REG(2, arg1); LSS_REG(3, arg2); LSS_REG(4, arg3);               \
+          LSS_REG(5, arg4);                                                   \
+          LSS_BODY(type, name, "0"(__r2), "d"(__r3), "d"(__r4),               \
+                               "d"(__r5));                                    \
+       }
+    #undef _syscall5
+    #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3,      \
+                                  type4, arg4, type5, arg5)                   \
+       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3,                \
+                           type4 arg4, type5 arg5) {                          \
+          LSS_REG(2, arg1); LSS_REG(3, arg2); LSS_REG(4, arg3);               \
+          LSS_REG(5, arg4); LSS_REG(6, arg5);                                 \
+          LSS_BODY(type, name, "0"(__r2), "d"(__r3), "d"(__r4),               \
+                               "d"(__r5), "d"(__r6));                         \
+       }
+    #undef _syscall6
+    #define _syscall6(type, name, type1, arg1, type2, arg2, type3, arg3,      \
+                                  type4, arg4, type5, arg5, type6, arg6)      \
+       type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3,                \
+                           type4 arg4, type5 arg5, type6 arg6) {              \
+          LSS_REG(2, arg1); LSS_REG(3, arg2); LSS_REG(4, arg3);               \
+          LSS_REG(5, arg4); LSS_REG(6, arg5); LSS_REG(7, arg6);               \
+          LSS_BODY(type, name, "0"(__r2), "d"(__r3), "d"(__r4),               \
+                               "d"(__r5), "d"(__r6), "d"(__r7));              \
+       }
+    LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+                                   int flags, void *arg, int *parent_tidptr,
+                                   void *newtls, int *child_tidptr) {  /* child: calls fn(arg), then issues the exit syscall */
+      long __ret;  /* preloaded with -EINVAL; left untouched if fn or child_stack is NULL */
+      {
+        register int  (*__fn)(void *)    __asm__ ("r1")  = fn;
+        register void  *__cstack         __asm__ ("r2")  = child_stack;
+        register int    __flags          __asm__ ("r3")  = flags;
+        register void  *__arg            __asm__ ("r0")  = arg;  /* copied to r2 in the child, after the svc */
+        register int   *__ptidptr        __asm__ ("r4")  = parent_tidptr;
+        register void  *__newtls         __asm__ ("r6")  = newtls;
+        register int   *__ctidptr        __asm__ ("r5")  = child_tidptr;
+        __asm__ __volatile__ (
+    #ifndef __s390x__  /* __s390__ without __s390x__ */
+                                  /* arg already in r0 */
+          "ltr %4, %4\n\t"        /* check fn, which is already in r1 */
+          "jz 1f\n\t"             /* NULL function pointer, return -EINVAL */
+          "ltr %5, %5\n\t"        /* check child_stack, which is already in r2 */
+          "jz 1f\n\t"             /* NULL stack pointer, return -EINVAL */
+                                  /* flags already in r3 */
+                                  /* parent_tidptr already in r4 */
+                                  /* child_tidptr already in r5 */
+                                  /* newtls already in r6 */
+          "svc %2\n\t"            /* invoke clone syscall */
+          "ltr %0,%%r2\n\t"       /* load return code into __ret and test */
+          "jnz 1f\n\t"            /* return to parent if non-zero */
+                                  /* start child thread */
+          "lr %%r2, %7\n\t"       /* set first parameter to void *arg */
+          "ahi %%r15, -96\n\t"    /* make room on the stack for the save area */
+          "xc 0(4,%%r15), 0(%%r15)\n\t"  /* XOR the 4 bytes at 0(r15) with themselves, i.e. zero them (presumably the back chain -- confirm) */
+          "basr %%r14, %4\n\t"    /* jump to fn */
+          "svc %3\n"              /* invoke exit syscall */
+          "1:\n"
+    #else  /* __s390x__ */
+                                  /* arg already in r0 */
+          "ltgr %4, %4\n\t"       /* check fn, which is already in r1 */
+          "jz 1f\n\t"             /* NULL function pointer, return -EINVAL */
+          "ltgr %5, %5\n\t"       /* check child_stack, which is already in r2 */
+          "jz 1f\n\t"             /* NULL stack pointer, return -EINVAL */
+                                  /* flags already in r3 */
+                                  /* parent_tidptr already in r4 */
+                                  /* child_tidptr already in r5 */
+                                  /* newtls already in r6 */
+          "svc %2\n\t"            /* invoke clone syscall */
+          "ltgr %0, %%r2\n\t"     /* load return code into __ret and test */
+          "jnz 1f\n\t"            /* return to parent if non-zero */
+                                  /* start child thread */
+          "lgr %%r2, %7\n\t"      /* set first parameter to void *arg */
+          "aghi %%r15, -160\n\t"  /* make room on the stack for the save area */
+          "xc 0(8,%%r15), 0(%%r15)\n\t"  /* XOR the 8 bytes at 0(r15) with themselves, i.e. zero them (presumably the back chain -- confirm) */
+          "basr %%r14, %4\n\t"    /* jump to fn */
+          "svc %3\n"              /* invoke exit syscall */
+          "1:\n"
+    #endif
+          : "=r" (__ret)  /* %0 */
+          : "0" (-EINVAL), "i" (__NR_clone), "i" (__NR_exit),  /* %1 (shares %0's register), %2, %3 */
+            "d" (__fn), "d" (__cstack), "d" (__flags), "d" (__arg),  /* %4, %5, %6, %7 */
+            "d" (__ptidptr), "d" (__newtls), "d" (__ctidptr)  /* %8, %9, %10 */
+          : "cc", "r14", "memory"
+        );
+      }
+      LSS_RETURN(int, __ret);
+    }
   #endif
-  #endif  /* defined (__PPC__) */
   #define __NR__exit   __NR_exit
   #define __NR__gettid __NR_gettid
   #define __NR__mremap __NR_mremap
-  LSS_INLINE _syscall1(int,     brk,             void *,      e)
-  LSS_INLINE _syscall2(int,     capset,
-                       struct kernel_cap_user_header*, h,
-                       struct kernel_cap_user_data*, d)
+  LSS_INLINE _syscall1(void *,  brk,             void *,      e)
   LSS_INLINE _syscall1(int,     chdir,           const char *,p)
-  LSS_INLINE _syscall1(int,     chroot,          const char *,p)
   LSS_INLINE _syscall1(int,     close,           int,         f)
   LSS_INLINE _syscall2(int,     clock_getres,    int,         c,
                        struct kernel_timespec*, t)
   LSS_INLINE _syscall2(int,     clock_gettime,   int,         c,
                        struct kernel_timespec*, t)
   LSS_INLINE _syscall1(int,     dup,             int,         f)
-  LSS_INLINE _syscall2(int,     dup2,            int,         s,
-                       int,            d)
+  #if defined(__NR_dup2)
+    // dup2 is polyfilled below when not available.
+    LSS_INLINE _syscall2(int,     dup2,            int,         s,
+                         int,            d)
+  #endif
+  #if defined(__NR_dup3)
+    LSS_INLINE _syscall3(int, dup3,  int, s, int, d, int, f)
+  #endif
   LSS_INLINE _syscall3(int,     execve,          const char*, f,
                        const char*const*,a,const char*const*, e)
   LSS_INLINE _syscall1(int,     _exit,           int,         e)
   LSS_INLINE _syscall1(int,     exit_group,      int,         e)
   LSS_INLINE _syscall3(int,     fcntl,           int,         f,
                        int,            c, long,   a)
-  LSS_INLINE _syscall0(pid_t,   fork)
+  #if defined(__NR_fork)
+    // fork is polyfilled below when not available.
+    LSS_INLINE _syscall0(pid_t,   fork)
+  #endif
   LSS_INLINE _syscall2(int,     fstat,           int,         f,
                       struct kernel_stat*,   b)
   LSS_INLINE _syscall2(int,     fstatfs,         int,         f,
                       struct kernel_statfs*, b)
-  LSS_INLINE _syscall2(int, ftruncate,           int,         f,
-                       off_t,          l)
-  LSS_INLINE _syscall4(int,     futex,           int*,        a,
-                       int,            o, int,    v,
-                      struct kernel_timespec*, t)
+  #if defined(__x86_64__)
+    /* Need to make sure off_t isn't truncated to 32-bits under x32.  */
+    LSS_INLINE int LSS_NAME(ftruncate)(int f, off_t l) {
+      LSS_BODY(2, int, ftruncate, LSS_SYSCALL_ARG(f), (uint64_t)(l));
+    }
+  #else
+    LSS_INLINE _syscall2(int, ftruncate,           int,         f,
+                         off_t,          l)
+  #endif
+  LSS_INLINE _syscall6(int,     futex,          int*,        u,
+                       int,     o,              int,         v,
+                       struct kernel_timespec*, t,
+                       int*,    u2,             int,         v2)
   LSS_INLINE _syscall3(int,     getdents,        int,         f,
-                      struct kernel_dirent*, d, int,    c)
+                       struct kernel_dirent*, d, int,    c)
   LSS_INLINE _syscall3(int,     getdents64,      int,         f,
                       struct kernel_dirent64*, d, int,    c)
   LSS_INLINE _syscall0(gid_t,   getegid)
   LSS_INLINE _syscall0(uid_t,   geteuid)
-  LSS_INLINE _syscall0(pid_t,   getpgrp)
+  #if defined(__NR_getpgrp)
+    LSS_INLINE _syscall0(pid_t,   getpgrp)
+  #endif
   LSS_INLINE _syscall0(pid_t,   getpid)
   LSS_INLINE _syscall0(pid_t,   getppid)
   LSS_INLINE _syscall2(int,     getpriority,     int,         a,
@@ -2728,15 +3419,14 @@ struct kernel_io_event {
                        gid_t *,         e,       gid_t *,     s)
   LSS_INLINE _syscall3(int,     getresuid,       uid_t *,     r,
                        uid_t *,         e,       uid_t *,     s)
-  #ifndef __ARM_EABI__
-  /* No available on ARM EABI Linux.  */
+#if !defined(__ARM_EABI__)
   LSS_INLINE _syscall2(int,     getrlimit,       int,         r,
                       struct kernel_rlimit*, l)
-  #endif
+#endif
   LSS_INLINE _syscall1(pid_t,   getsid,          pid_t,       p)
   LSS_INLINE _syscall0(pid_t,   _gettid)
-  LSS_INLINE _syscall2(int,     gettimeofday,    struct timeval *, v,
-                       struct timezone *, z)
+  LSS_INLINE _syscall2(int,     gettimeofday,    struct kernel_timeval*, t,
+                       void*, tz)  /* result is a status code, not a pid */
   LSS_INLINE _syscall5(int,     setxattr,        const char *,p,
                        const char *,   n,        const void *,v,
                        size_t,         s,        int,         f)
@@ -2759,8 +3449,16 @@ struct kernel_io_event {
                        int,     who,             int,         ioprio)
   LSS_INLINE _syscall2(int,     kill,            pid_t,       p,
                        int,            s)
-  LSS_INLINE _syscall3(off_t,   lseek,           int,         f,
-                       off_t,          o, int,    w)
+  #if defined(__x86_64__)
+    /* Need to make sure off_t isn't truncated to 32-bits under x32.  */
+    LSS_INLINE off_t LSS_NAME(lseek)(int f, off_t o, int w) {
+      _LSS_BODY(3, off_t, lseek, off_t, LSS_SYSCALL_ARG(f), (uint64_t)(o),
+                                        LSS_SYSCALL_ARG(w));
+    }
+  #else
+    LSS_INLINE _syscall3(off_t,   lseek,           int,         f,
+                         off_t,          o, int,    w)
+  #endif
   LSS_INLINE _syscall2(int,     munmap,          void*,       s,
                        size_t,         l)
   LSS_INLINE _syscall6(long,    move_pages,      pid_t,       p,
@@ -2771,33 +3469,28 @@ struct kernel_io_event {
   LSS_INLINE _syscall5(void*,   _mremap,         void*,       o,
                        size_t,         os,       size_t,      ns,
                        unsigned long,  f, void *, a)
-  LSS_INLINE _syscall3(int,     open,            const char*, p,
-                       int,            f, int,    m)
-  LSS_INLINE _syscall3(int,     poll,           struct kernel_pollfd*, u,
-                       unsigned int,   n, int,    t)
-  LSS_INLINE _syscall2(int,     prctl,           int,         o,
-                       long,           a)
-  LSS_INLINE _syscall5(int, mount, const char *, source, const char *, target,
-                       const char *, filesystemtype, unsigned long, mountflags,
-                       const void *, data)
-  LSS_INLINE _syscall1(int, unshare, int, flags)
-  LSS_INLINE _syscall2(int, setns, int, fd, int, nstype)
-  #if defined(__NR_preadv)
-    // Defined on x86_64 / i386 only
-  LSS_INLINE _syscall5(ssize_t, preadv, unsigned long, fd,
-                       const struct kernel_iovec*, iovec,
-                       unsigned long, vlen, unsigned long, pos_l,
-                       unsigned long, pos_h)
+  #if defined(__NR_open)
+    // open is polyfilled below when not available.
+    LSS_INLINE _syscall3(int,     open,            const char*, p,
+                         int,            f, int,    m)
+  #endif
+  #if defined(__NR_poll)
+    // poll is polyfilled below when not available.
+    LSS_INLINE _syscall3(int,     poll,           struct kernel_pollfd*, u,
+                         unsigned int,   n, int,    t)
   #endif
+  #if defined(__NR_ppoll)
+    LSS_INLINE _syscall5(int, ppoll, struct kernel_pollfd *, u,
+                         unsigned int, n, const struct kernel_timespec *, t,
+                         const struct kernel_sigset_t *, sigmask, size_t, s)
+  #endif
+  LSS_INLINE _syscall5(int,     prctl,           int,         option,
+                       unsigned long,  arg2,
+                       unsigned long,  arg3,
+                       unsigned long,  arg4,
+                       unsigned long,  arg5)
   LSS_INLINE _syscall4(long,    ptrace,          int,         r,
                        pid_t,          p, void *, a, void *, d)
-  #if defined(__NR_pwritev)
-    // Defined on x86_64 / i386 only
-  LSS_INLINE _syscall5(ssize_t, pwritev, unsigned long, fd,
-                       const struct kernel_iovec*, iovec,
-                       unsigned long, vlen, unsigned long, pos_l,
-                       unsigned long, pos_h)
-  #endif
   #if defined(__NR_quotactl)
     // Defined on x86_64 / i386 only
     LSS_INLINE _syscall4(int,  quotactl,  int,  cmd,  const char *, special,
@@ -2805,8 +3498,15 @@ struct kernel_io_event {
   #endif
   LSS_INLINE _syscall3(ssize_t, read,            int,         f,
                        void *,         b, size_t, c)
-  LSS_INLINE _syscall3(int,     readlink,        const char*, p,
-                       char*,          b, size_t, s)
+  #if defined(__NR_readlink)
+    // readlink is polyfilled below when not available.
+    LSS_INLINE _syscall3(int,     readlink,        const char*, p,
+                         char*,          b, size_t, s)
+  #endif
+  #if defined(__NR_readlinkat)
+    LSS_INLINE _syscall4(int, readlinkat, int, d, const char *, p, char *, b,
+                         size_t, s)
+  #endif
   LSS_INLINE _syscall4(int,     rt_sigaction,    int,         s,
                        const struct kernel_sigaction*, a,
                        struct kernel_sigaction*, o, size_t,   c)
@@ -2814,10 +3514,11 @@ struct kernel_io_event {
                        size_t,         c)
   LSS_INLINE _syscall4(int, rt_sigprocmask,      int,         h,
                        const struct kernel_sigset_t*,  s,
-                       struct kernel_sigset_t*,        o, size_t, c);
-  LSS_INLINE _syscall1(int, rt_sigreturn,        unsigned long, u);
+                       struct kernel_sigset_t*,        o, size_t, c)
   LSS_INLINE _syscall2(int, rt_sigsuspend,
-                       const struct kernel_sigset_t*, s,  size_t, c);
+                       const struct kernel_sigset_t*, s,  size_t, c)
+  LSS_INLINE _syscall4(int, rt_sigtimedwait, const struct kernel_sigset_t*, s,
+                       siginfo_t*, i, const struct timespec*, t, size_t, c)
   LSS_INLINE _syscall3(int,     sched_getaffinity,pid_t,      p,
                        unsigned int,   l, unsigned long *, m)
   LSS_INLINE _syscall3(int,     sched_setaffinity,pid_t,      p,
@@ -2842,25 +3543,30 @@ struct kernel_io_event {
   LSS_INLINE _syscall2(int,     sigaltstack,     const stack_t*, s,
                        const stack_t*, o)
   #if defined(__NR_sigreturn)
-  LSS_INLINE _syscall1(int,     sigreturn,       unsigned long, u);
+    LSS_INLINE _syscall1(int,     sigreturn,       unsigned long, u)
+  #endif
+  #if defined(__NR_stat)
+    // stat is polyfilled below when not available.
+    LSS_INLINE _syscall2(int,     stat,            const char*, f,
+                        struct kernel_stat*,   b)
   #endif
-  LSS_INLINE _syscall2(int,     stat,            const char*, f,
-                      struct kernel_stat*,   b)
   LSS_INLINE _syscall2(int,     statfs,          const char*, f,
                       struct kernel_statfs*, b)
   LSS_INLINE _syscall3(int,     tgkill,          pid_t,       p,
                        pid_t,          t, int,            s)
   LSS_INLINE _syscall2(int,     tkill,           pid_t,       p,
                        int,            s)
+  #if defined(__NR_unlink)
+    // unlink is polyfilled below when not available.
+    LSS_INLINE _syscall1(int,     unlink,           const char*, f)
+  #endif
   LSS_INLINE _syscall3(ssize_t, write,            int,        f,
                        const void *,   b, size_t, c)
   LSS_INLINE _syscall3(ssize_t, writev,           int,        f,
                        const struct kernel_iovec*, v, size_t, c)
-  LSS_INLINE _syscall1(int,     umask,           unsigned,    m)
-  LSS_INLINE _syscall1(int,     unlink,          const char*, f)
   #if defined(__NR_getcpu)
     LSS_INLINE _syscall3(long, getcpu, unsigned *, cpu,
-                         unsigned *, node, void *, unused);
+                         unsigned *, node, void *, unused)
   #endif
   #if defined(__x86_64__) ||                                                  \
      (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32)
@@ -2879,8 +3585,85 @@ struct kernel_io_event {
     LSS_INLINE _syscall4(int, socketpair,         int,   d,
                          int,                     t, int,       p, int*, s)
   #endif
+  #if defined(__NR_fadvise64)
+    #if defined(__x86_64__)
+    /* Need to make sure loff_t isn't truncated to 32-bits under x32.  */
+    LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset, loff_t len,
+                                       int advice) {
+      LSS_BODY(4, int, fadvise64, LSS_SYSCALL_ARG(fd), (uint64_t)(offset),
+                                  (uint64_t)(len), LSS_SYSCALL_ARG(advice));
+    }
+    #else
+    LSS_INLINE _syscall4(int, fadvise64,
+                         int, fd, loff_t, offset, loff_t, len, int, advice)
+    #endif
+  #elif defined(__i386__)
+    #define __NR__fadvise64_64 __NR_fadvise64_64
+    LSS_INLINE _syscall6(int, _fadvise64_64, int, fd,
+                         unsigned, offset_lo, unsigned, offset_hi,
+                         unsigned, len_lo, unsigned, len_hi,
+                         int, advice)
+
+    LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset,
+                                       loff_t len, int advice) {
+      return LSS_NAME(_fadvise64_64)(fd,
+                                     (unsigned)offset, (unsigned)(offset >>32),
+                                     (unsigned)len, (unsigned)(len >> 32),
+                                     advice);
+    }
+
+  #elif defined(__s390__) && !defined(__s390x__)
+    #define __NR__fadvise64_64 __NR_fadvise64_64
+    struct kernel_fadvise64_64_args {
+      int fd;
+      long long offset;
+      long long len;
+      int advice;
+    };
+
+    LSS_INLINE _syscall1(int, _fadvise64_64,
+                         struct kernel_fadvise64_64_args *args)
+
+    LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset,
+                                       loff_t len, int advice) {
+      struct kernel_fadvise64_64_args args = { fd, offset, len, advice };
+      return LSS_NAME(_fadvise64_64)(&args);
+    }
+  #endif
+  #if defined(__NR_fallocate)
+    #if defined(__x86_64__)
+    /* Need to make sure loff_t isn't truncated to 32-bits under x32.  */
+    LSS_INLINE int LSS_NAME(fallocate)(int f, int mode, loff_t offset,
+                                       loff_t len) {
+      LSS_BODY(4, int, fallocate, LSS_SYSCALL_ARG(f), LSS_SYSCALL_ARG(mode),
+                                  (uint64_t)(offset), (uint64_t)(len));
+    }
+    #elif (defined(__i386__) || (defined(__s390__) && !defined(__s390x__)) \
+           || defined(__ARM_ARCH_3__) || defined(__ARM_EABI__) \
+           || (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) \
+           || defined(__PPC__))
+    #define __NR__fallocate __NR_fallocate
+    LSS_INLINE _syscall6(int, _fallocate, int, fd,
+                         int, mode,
+                         unsigned, offset_lo, unsigned, offset_hi,
+                         unsigned, len_lo, unsigned, len_hi)
 
-  #if defined(__x86_64__) || defined(__PPC__)
+    LSS_INLINE int LSS_NAME(fallocate)(int fd, int mode,
+                                       loff_t offset, loff_t len) {
+      union { loff_t off; unsigned w[2]; } o = { offset }, l = { len };
+      return LSS_NAME(_fallocate)(fd, mode, o.w[0], o.w[1], l.w[0], l.w[1]);
+    }
+    #else
+    LSS_INLINE _syscall4(int, fallocate,
+                         int, f, int, mode, loff_t, offset, loff_t, len)
+    #endif
+  #endif
+  #if defined(__NR_newfstatat)
+    LSS_INLINE _syscall4(int, newfstatat,         int,   d,
+                         const char *,            p,
+                         struct kernel_stat*,     b, int, f)
+  #endif
+  #if defined(__x86_64__) || defined(__s390x__)
     LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid,
                                          gid_t *egid,
                                          gid_t *sgid) {
@@ -2893,10 +3676,6 @@ struct kernel_io_event {
       return LSS_NAME(getresuid)(ruid, euid, suid);
     }
 
-    LSS_INLINE _syscall4(int, newfstatat,         int,   d,
-                         const char *,            p,
-                        struct kernel_stat*,       b, int, f)
-
     LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) {
       return LSS_NAME(setfsgid)(gid);
     }
@@ -2912,21 +3691,11 @@ struct kernel_io_event {
     LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) {
       return LSS_NAME(setresuid)(ruid, euid, suid);
     }
-  #endif // defined(__x86_64__) || defined(__PPC__)
-
-  #if defined(__x86_64__) || defined(__PPC64__)
-    LSS_INLINE _syscall4(int, fallocate, int, fd, int, mode,
-                         loff_t, offset, loff_t, len)
-
-    LSS_INLINE _syscall6(void*, mmap,              void*, s,
-                         size_t,                   l, int,               p,
-                         int,                      f, int,               d,
-                         __off64_t,                o)
 
     LSS_INLINE int LSS_NAME(sigaction)(int signum,
                                        const struct kernel_sigaction *act,
                                        struct kernel_sigaction *oldact) {
-    #if defined(__x86_64__)
+      #if defined(__x86_64__)
       /* On x86_64, the kernel requires us to always set our own
        * SA_RESTORER in order to be able to return from a signal handler.
        * This function must have a "magic" signature that the "gdb"
@@ -2938,77 +3707,67 @@ struct kernel_io_event {
         a.sa_restorer = LSS_NAME(restore_rt)();
         return LSS_NAME(rt_sigaction)(signum, &a, oldact,
                                       (KERNEL_NSIG+7)/8);
-      } else {
+      } else
+      #endif
         return LSS_NAME(rt_sigaction)(signum, act, oldact,
                                       (KERNEL_NSIG+7)/8);
-      }
-    #else
-      return LSS_NAME(rt_sigaction)(signum, act, oldact, (KERNEL_NSIG+7)/8);
-    #endif
     }
 
     LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) {
       return LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8);
     }
 
+    LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) {
+      return LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8);
+    }
+  #endif
+  #if defined(__NR_rt_sigprocmask)
     LSS_INLINE int LSS_NAME(sigprocmask)(int how,
                                          const struct kernel_sigset_t *set,
                                          struct kernel_sigset_t *oldset) {
       return LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8);
     }
-
-    LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) {
-      return LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8);
+  #endif
+  #if defined(__NR_rt_sigtimedwait)
+    LSS_INLINE int LSS_NAME(sigtimedwait)(const struct kernel_sigset_t *set,
+                                          siginfo_t *info,
+                                          const struct timespec *timeout) {
+      return LSS_NAME(rt_sigtimedwait)(set, info, timeout, (KERNEL_NSIG+7)/8);
     }
-  #endif /* defined(__x86_64__) || defined(__PPC64__) */
-
-  #if defined(__x86_64__) || \
-      defined(__arm__) || \
-     (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32)
+  #endif
+  #if defined(__NR_wait4)
     LSS_INLINE _syscall4(pid_t, wait4,            pid_t, p,
                          int*,                    s, int,       o,
                         struct kernel_rusage*,     r)
-
-    LSS_INLINE pid_t LSS_NAME(waitpid)(pid_t pid, int *status, int options){
-      return LSS_NAME(wait4)(pid, status, options, 0);
-    }
   #endif
-  #if defined(__x86_64__)|| \
-      defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_5T__) || \
-      defined(__mips__) || defined(__PPC__)
-    LSS_INLINE _syscall2(int,     setgroups,     size_t,      c,
-                         const gid_t *,  g)
-  #endif
-  #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \
-      defined(__PPC__)
+  #if defined(__NR_openat)
     LSS_INLINE _syscall4(int, openat, int, d, const char *, p, int, f, int, m)
+  #endif
+  #if defined(__NR_unlinkat)
     LSS_INLINE _syscall3(int, unlinkat, int, d, const char *, p, int, f)
   #endif
-  #if defined(__i386__) || defined(__arm__)
+  #if defined(__i386__) || defined(__ARM_ARCH_3__) || defined(__ARM_EABI__) || \
+      (defined(__s390__) && !defined(__s390x__))
     #define __NR__getresgid32 __NR_getresgid32
     #define __NR__getresuid32 __NR_getresuid32
     #define __NR__setfsgid32  __NR_setfsgid32
     #define __NR__setfsuid32  __NR_setfsuid32
-    #define __NR__setgroups32 __NR_setgroups32
-    #define __NR__setgroups   __NR_setgroups
     #define __NR__setresgid32 __NR_setresgid32
     #define __NR__setresuid32 __NR_setresuid32
+#if defined(__ARM_EABI__)
     LSS_INLINE _syscall2(int,   ugetrlimit,        int,          r,
                         struct kernel_rlimit*, l)
+#endif
     LSS_INLINE _syscall3(int,     _getresgid32,    gid_t *,      r,
                          gid_t *,            e,    gid_t *,      s)
     LSS_INLINE _syscall3(int,     _getresuid32,    uid_t *,      r,
                          uid_t *,            e,    uid_t *,      s)
-    LSS_INLINE _syscall1(int,     _setfsgid32,      gid_t,       f)
-    LSS_INLINE _syscall1(int,     _setfsuid32,      uid_t,       f)
-    LSS_INLINE _syscall2(int,     _setgroups32,     int,         s,
-                         const unsigned int *, l)
-    LSS_INLINE _syscall2(int,     _setgroups,       size_t,      c,
-                         const unsigned short *, g)
-    LSS_INLINE _syscall3(int,     _setresgid32,     gid_t,       r,
-                         gid_t,              e,     gid_t,       s)
-    LSS_INLINE _syscall3(int,     _setresuid32,     uid_t,       r,
-                         uid_t,              e,     uid_t,       s)
+    LSS_INLINE _syscall1(int,     _setfsgid32,     gid_t,        f)
+    LSS_INLINE _syscall1(int,     _setfsuid32,     uid_t,        f)
+    LSS_INLINE _syscall3(int,     _setresgid32,    gid_t,        r,
+                         gid_t,              e,    gid_t,        s)
+    LSS_INLINE _syscall3(int,     _setresuid32,    uid_t,        r,
+                         uid_t,              e,    uid_t,        s)
 
     LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid,
                                          gid_t *egid,
@@ -3047,7 +3806,7 @@ struct kernel_io_event {
       if ((rc = LSS_NAME(_setfsgid32)(gid)) < 0 &&
           LSS_ERRNO == ENOSYS) {
         if ((unsigned int)gid & ~0xFFFFu) {
-          LSS_ERRNO = EINVAL;
+          rc = EINVAL;
         } else {
           rc = LSS_NAME(setfsgid)(gid);
         }
@@ -3060,7 +3819,7 @@ struct kernel_io_event {
       if ((rc = LSS_NAME(_setfsuid32)(uid)) < 0 &&
           LSS_ERRNO == ENOSYS) {
         if ((unsigned int)uid & ~0xFFFFu) {
-          LSS_ERRNO = EINVAL;
+          rc = EINVAL;
         } else {
           rc = LSS_NAME(setfsuid)(uid);
         }
@@ -3068,37 +3827,6 @@ struct kernel_io_event {
       return rc;
     }
 
-
-    // We cannot allocate memory so there is a problem with building the
-    // list of groups with the proper datatype.  Older kernels have limits
-    // on the number of groups that can be set at one time of up to 32.
-    // So we have an array on the stack of size 32 where to put the groups.
-    #define LSS_SET_GROUPS_SIZE 32
-    LSS_INLINE int LSS_NAME(setgroups)(size_t size, const unsigned int *list) {
-      int rc = 0;
-      if ((rc = LSS_NAME(_setgroups32)(size, list)) < 0 &&
-          LSS_ERRNO == ENOSYS) {
-        if (size > LSS_SET_GROUPS_SIZE) {
-          LSS_ERRNO = EINVAL;
-        } else {
-          unsigned short gid_list[LSS_SET_GROUPS_SIZE];
-          int i;
-          for (i = 0; i < size; ++i) {
-            if (list[i] & ~0xFFFFu) {
-              LSS_ERRNO = EINVAL;
-              break;
-            }
-            gid_list[i] = list[i];
-          }
-          if (LSS_ERRNO != EINVAL) {
-            rc = LSS_NAME(_setgroups)(size, gid_list);
-          }
-        }
-      }
-      return rc;
-    }
-    #undef LSS_SET_GROUPS_SIZE
-
     LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) {
       int rc;
       if ((rc = LSS_NAME(_setresgid32)(rgid, egid, sgid)) < 0 &&
@@ -3106,7 +3834,7 @@ struct kernel_io_event {
         if ((unsigned int)rgid & ~0xFFFFu ||
             (unsigned int)egid & ~0xFFFFu ||
             (unsigned int)sgid & ~0xFFFFu) {
-          LSS_ERRNO = EINVAL;
+          rc = EINVAL;
         } else {
           rc = LSS_NAME(setresgid)(rgid, egid, sgid);
         }
@@ -3121,7 +3849,7 @@ struct kernel_io_event {
         if ((unsigned int)ruid & ~0xFFFFu ||
             (unsigned int)euid & ~0xFFFFu ||
             (unsigned int)suid & ~0xFFFFu) {
-          LSS_ERRNO = EINVAL;
+          rc = EINVAL;
         } else {
           rc = LSS_NAME(setresuid)(ruid, euid, suid);
         }
@@ -3173,34 +3901,41 @@ struct kernel_io_event {
                 (1UL << ((signum - 1) % (8*sizeof(set->sig[0])))));
     }
   }
-  #if defined(__i386__) || \
-      defined(__arm__) || \
-     (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || \
-     (defined(__PPC__) && !defined(__PPC64__))
+  #if defined(__i386__) ||                                                    \
+      defined(__ARM_ARCH_3__) || defined(__ARM_EABI__) ||                     \
+     (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) ||                   \
+      defined(__PPC__) ||                                                     \
+     (defined(__s390__) && !defined(__s390x__))
     #define __NR__sigaction   __NR_sigaction
     #define __NR__sigpending  __NR_sigpending
-    #define __NR__sigprocmask __NR_sigprocmask
     #define __NR__sigsuspend  __NR_sigsuspend
+    #define __NR__socketcall  __NR_socketcall
     LSS_INLINE _syscall2(int, fstat64,             int, f,
                          struct kernel_stat64 *, b)
-    LSS_INLINE _syscall5(int, _llseek,     uint, fd, ulong, hi, ulong, lo,
+    LSS_INLINE _syscall5(int, _llseek,     uint, fd,
+                         unsigned long, hi, unsigned long, lo,
                          loff_t *, res, uint, wh)
-
-    #ifndef __ARM_EABI__
-    /* Not available on ARM EABI Linux.  */
-    LSS_INLINE _syscall1(void*, mmap,              void*, a)
-    #endif
-    LSS_INLINE _syscall6(void*, mmap2,             void*, s,
+#if defined(__s390__) && !defined(__s390x__)
+    /* On s390, mmap2() arguments are passed in memory. */
+    LSS_INLINE void* LSS_NAME(_mmap2)(void *s, size_t l, int p, int f, int d,
+                                      off_t o) {
+      unsigned long buf[6] = { (unsigned long) s, (unsigned long) l,
+                               (unsigned long) p, (unsigned long) f,
+                               (unsigned long) d, (unsigned long) o };
+      LSS_REG(2, buf);
+      LSS_BODY(void*, mmap2, "0"(__r2));
+    }
+#else
+    #define __NR__mmap2 __NR_mmap2
+    LSS_INLINE _syscall6(void*, _mmap2,            void*, s,
                          size_t,                   l, int,               p,
                          int,                      f, int,               d,
                          off_t,                    o)
+#endif
     LSS_INLINE _syscall3(int,   _sigaction,        int,   s,
                          const struct kernel_old_sigaction*,  a,
                          struct kernel_old_sigaction*,        o)
     LSS_INLINE _syscall1(int,   _sigpending, unsigned long*, s)
-    LSS_INLINE _syscall3(int,   _sigprocmask,      int,   h,
-                         const unsigned long*,     s,
-                         unsigned long*,           o)
     #ifdef __PPC__
     LSS_INLINE _syscall1(int, _sigsuspend,         unsigned long, s)
     #else
@@ -3284,23 +4019,6 @@ struct kernel_io_event {
       return rc;
     }
 
-    LSS_INLINE int LSS_NAME(sigprocmask)(int how,
-                                         const struct kernel_sigset_t *set,
-                                         struct kernel_sigset_t *oldset) {
-      int olderrno = LSS_ERRNO;
-      int rc = LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8);
-      if (rc < 0 && LSS_ERRNO == ENOSYS) {
-        LSS_ERRNO = olderrno;
-        if (oldset) {
-          LSS_NAME(sigemptyset)(oldset);
-        }
-        rc = LSS_NAME(_sigprocmask)(how,
-                                    set ? &set->sig[0] : NULL,
-                                    oldset ? &oldset->sig[0] : NULL);
-      }
-      return rc;
-    }
-
     LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) {
       int olderrno = LSS_ERRNO;
       int rc = LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8);
@@ -3315,57 +4033,93 @@ struct kernel_io_event {
       return rc;
     }
   #endif
+  #if defined(__i386__) ||                                                    \
+      defined(__ARM_ARCH_3__) || defined(__ARM_EABI__) ||                     \
+     (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) ||                   \
+      defined(__PPC__) ||                                                     \
+     (defined(__s390__) && !defined(__s390x__))
+    /* On these architectures, implement mmap() with mmap2(). */
+    LSS_INLINE void* LSS_NAME(mmap)(void *s, size_t l, int p, int f, int d,
+                                    int64_t o) {
+      if (o % 4096) {
+        LSS_ERRNO = EINVAL;
+        return (void *) -1;
+      }
+      return LSS_NAME(_mmap2)(s, l, p, f, d, (o / 4096));
+    }
+  #elif defined(__s390x__)
+    /* On s390x, mmap() arguments are passed in memory. */
+    LSS_INLINE void* LSS_NAME(mmap)(void *s, size_t l, int p, int f, int d,
+                                    int64_t o) {
+      unsigned long buf[6] = { (unsigned long) s, (unsigned long) l,
+                               (unsigned long) p, (unsigned long) f,
+                               (unsigned long) d, (unsigned long) o };
+      LSS_REG(2, buf);
+      LSS_BODY(void*, mmap, "0"(__r2));
+    }
+  #elif defined(__x86_64__)
+    /* Need to make sure __off64_t isn't truncated to 32-bits under x32.  */
+    LSS_INLINE void* LSS_NAME(mmap)(void *s, size_t l, int p, int f, int d,
+                                    int64_t o) {
+      LSS_BODY(6, void*, mmap, LSS_SYSCALL_ARG(s), LSS_SYSCALL_ARG(l),
+                               LSS_SYSCALL_ARG(p), LSS_SYSCALL_ARG(f),
+                               LSS_SYSCALL_ARG(d), (uint64_t)(o));
+    }
+  #else
+    /* Remaining 64-bit architectures. */
+    LSS_INLINE _syscall6(void*, mmap, void*, addr, size_t, length, int, prot,
+                         int, flags, int, fd, int64_t, offset)
+  #endif
   #if defined(__PPC__)
     #undef LSS_SC_LOADARGS_0
     #define LSS_SC_LOADARGS_0(dummy...)
-    /* arg1 .. arg6 are passed in an unsigned long array pointed by r4.  */
     #undef LSS_SC_LOADARGS_1
     #define LSS_SC_LOADARGS_1(arg1)                                           \
-        sc_args[0] = (unsigned long) (arg1)
+        __sc_4  = (unsigned long) (arg1)
     #undef LSS_SC_LOADARGS_2
     #define LSS_SC_LOADARGS_2(arg1, arg2)                                     \
         LSS_SC_LOADARGS_1(arg1);                                              \
-        sc_args[1] = (unsigned long) (arg2)
+        __sc_5  = (unsigned long) (arg2)
     #undef LSS_SC_LOADARGS_3
     #define LSS_SC_LOADARGS_3(arg1, arg2, arg3)                               \
         LSS_SC_LOADARGS_2(arg1, arg2);                                        \
-        sc_args[2] = (unsigned long) (arg3)
+        __sc_6  = (unsigned long) (arg3)
     #undef LSS_SC_LOADARGS_4
     #define LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4)                         \
         LSS_SC_LOADARGS_3(arg1, arg2, arg3);                                  \
-        sc_args[3] = (unsigned long) (arg4)
+        __sc_7  = (unsigned long) (arg4)
     #undef LSS_SC_LOADARGS_5
     #define LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5)                   \
         LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4);                            \
-        sc_args[4] = (unsigned long) (arg5)
-    #undef LSS_SC_LOADARGS_6
-    #define LSS_SC_LOADARGS_6(arg1, arg2, arg3, arg4, arg5, arg6)             \
-        LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5);                      \
-        sc_args[5] = (unsigned long) (arg6)
+        __sc_8  = (unsigned long) (arg5)
     #undef LSS_SC_BODY
-    /*
-     * Do a socket system call using the generic socketcall() interface.
-     * We pack arguments into an array of unsigned longs and then
-     * call socketcall() with a function number and the argument array.
-     * Although some socket calls now have their own syscall numbers,
-     * we still use socketcall() to make our code work with older kernels.
-     */
     #define LSS_SC_BODY(nr, type, opt, args...)                               \
         long __sc_ret, __sc_err;                                              \
         {                                                                     \
-          unsigned long sc_args[6];                                           \
           register unsigned long __sc_0 __asm__ ("r0") = __NR_socketcall;     \
           register unsigned long __sc_3 __asm__ ("r3") = opt;                 \
           register unsigned long __sc_4 __asm__ ("r4");                       \
+          register unsigned long __sc_5 __asm__ ("r5");                       \
+          register unsigned long __sc_6 __asm__ ("r6");                       \
+          register unsigned long __sc_7 __asm__ ("r7");                       \
+          register unsigned long __sc_8 __asm__ ("r8");                       \
           LSS_SC_LOADARGS_##nr(args);                                         \
           __asm__ __volatile__                                                \
-              ("sc\n\t"                                                       \
+              ("stwu 1, -48(1)\n\t"                                           \
+               "stw 4, 20(1)\n\t"                                             \
+               "stw 5, 24(1)\n\t"                                             \
+               "stw 6, 28(1)\n\t"                                             \
+               "stw 7, 32(1)\n\t"                                             \
+               "stw 8, 36(1)\n\t"                                             \
+               "addi 4, 1, 20\n\t"                                            \
+               "sc\n\t"                                                       \
                "mfcr %0"                                                      \
-                 : "+r" (__sc_0),                                             \
-                   "+r" (__sc_3), "=r" (__sc_4)                               \
-                 : "2"(&sc_args)                                              \
-                 : "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12",         \
-                   "cr0", "ctr", "memory");                                   \
+                 : "=&r" (__sc_0),                                            \
+                   "=&r" (__sc_3), "=&r" (__sc_4),                            \
+                   "=&r" (__sc_5), "=&r" (__sc_6),                            \
+                   "=&r" (__sc_7), "=&r" (__sc_8)                             \
+                 : LSS_ASMINPUT_##nr                                          \
+                 : "cr0", "ctr", "memory");                                   \
           __sc_ret = __sc_3;                                                  \
           __sc_err = __sc_0;                                                  \
         }                                                                     \
@@ -3382,12 +4136,15 @@ struct kernel_io_event {
       LSS_SC_BODY(3, ssize_t, 16, s, msg, flags);
     }
 
+    // TODO(csilvers): why is this ifdef'ed out?
+#if 0
     LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len,
                                         int flags,
                                         const struct kernel_sockaddr *to,
                                         unsigned int tolen) {
-      LSS_SC_BODY(6, ssize_t, 11, s, buf, len, flags, to, tolen);
+      LSS_BODY(6, ssize_t, 11, s, buf, len, flags, to, tolen);
     }
+#endif
 
     LSS_INLINE int LSS_NAME(shutdown)(int s, int how) {
       LSS_SC_BODY(2, int, 13, s, how);
@@ -3402,105 +4159,72 @@ struct kernel_io_event {
       LSS_SC_BODY(4, int, 8, d, type, protocol, sv);
     }
   #endif
-  #if defined(__i386__) || \
-      (defined(__arm__) && !defined(__ARM_EABI__)) || \
-      (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
-
-    /* See sys_socketcall in net/socket.c in kernel source.
-     * It de-multiplexes on its first arg and unpacks the arglist
-     * array in its second arg.
-     */
-    LSS_INLINE _syscall2(long, socketcall, int, c, unsigned long*, a)
+  #if defined(__ARM_EABI__) || defined (__aarch64__)
+    LSS_INLINE _syscall3(ssize_t, recvmsg, int, s, struct kernel_msghdr*, msg,
+                         int, flags)
+    LSS_INLINE _syscall3(ssize_t, sendmsg, int, s, const struct kernel_msghdr*,
+                         msg, int, flags)
+    LSS_INLINE _syscall6(ssize_t, sendto, int, s, const void*, buf, size_t,len,
+                         int, flags, const struct kernel_sockaddr*, to,
+                         unsigned int, tolen)
+    LSS_INLINE _syscall2(int, shutdown, int, s, int, how)
+    LSS_INLINE _syscall3(int, socket, int, domain, int, type, int, protocol)
+    LSS_INLINE _syscall4(int, socketpair, int, d, int, type, int, protocol,
+                         int*, sv)
+  #endif
+  #if defined(__i386__) || defined(__ARM_ARCH_3__) ||                         \
+      (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) ||                  \
+      defined(__s390__)
+    #define __NR__socketcall  __NR_socketcall
+    LSS_INLINE _syscall2(int,      _socketcall,    int,   c,
+                         va_list,                  a)
+    LSS_INLINE int LSS_NAME(socketcall)(int op, ...) {
+      int rc;
+      va_list ap;
+      va_start(ap, op);
+      rc = LSS_NAME(_socketcall)(op, ap);
+      va_end(ap);
+      return rc;
+    }
 
     LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg,
                                          int flags){
-      unsigned long args[3] = {
-        (unsigned long) s,
-        (unsigned long) msg,
-        (unsigned long) flags
-      };
-      return (ssize_t) LSS_NAME(socketcall)(17, args);
+      return (ssize_t)LSS_NAME(socketcall)(17, s, msg, flags);
     }
 
     LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s,
                                          const struct kernel_msghdr *msg,
                                          int flags) {
-      unsigned long args[3] = {
-        (unsigned long) s,
-        (unsigned long) msg,
-        (unsigned long) flags
-      };
-      return (ssize_t) LSS_NAME(socketcall)(16, args);
+      return (ssize_t)LSS_NAME(socketcall)(16, s, msg, flags);
     }
 
     LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len,
                                         int flags,
                                         const struct kernel_sockaddr *to,
                                         unsigned int tolen) {
-      unsigned long args[6] = {
-        (unsigned long) s,
-        (unsigned long) buf,
-        (unsigned long) len,
-        (unsigned long) flags,
-        (unsigned long) to,
-        (unsigned long) tolen
-      };
-      return (ssize_t) LSS_NAME(socketcall)(11, args);
+      return (ssize_t)LSS_NAME(socketcall)(11, s, buf, len, flags, to, tolen);
     }
 
     LSS_INLINE int LSS_NAME(shutdown)(int s, int how) {
-      unsigned long args[2] = {
-        (unsigned long) s,
-        (unsigned long) how
-      };
-      return LSS_NAME(socketcall)(13, args);
+      return LSS_NAME(socketcall)(13, s, how);
     }
 
     LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) {
-      unsigned long args[3] = {
-        (unsigned long) domain,
-        (unsigned long) type,
-        (unsigned long) protocol
-      };
-      return LSS_NAME(socketcall)(1, args);
+      return LSS_NAME(socketcall)(1, domain, type, protocol);
     }
 
     LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol,
                                         int sv[2]) {
-      unsigned long args[4] = {
-        (unsigned long) d,
-        (unsigned long) type,
-        (unsigned long) protocol,
-        (unsigned long) sv
-      };
-      return LSS_NAME(socketcall)(8, args);
+      return LSS_NAME(socketcall)(8, d, type, protocol, sv);
     }
-  #elif defined(__ARM_EABI__)
-    /* ARM EABI Linix does not have socketcall.  */
-    LSS_INLINE _syscall3(ssize_t, recvmsg,             int,   s,
-                         struct kernel_msghdr*,     m, int, f)
-    LSS_INLINE _syscall3(ssize_t, sendmsg,            int,   s,
-                         const struct kernel_msghdr*, m, int, f)
-    LSS_INLINE _syscall6(ssize_t, sendto,         int,   s,
-                         const void*,             b, size_t, l,
-                         int,                     f,
-                         const struct kernel_sockaddr*, to,
-                         unsigned int,            tl)
-    LSS_INLINE _syscall2(int, shutdown,           int,   s,
-                         int,                     h)
-    LSS_INLINE _syscall3(int, socket,             int,   d,
-                         int,                     t, int,       p)
-    LSS_INLINE _syscall4(int, socketpair,         int,   d,
-                         int,                     t, int,       p, int*, s)
   #endif
-  #if defined(__i386__) || (defined(__PPC__) && !defined(__PPC64__)) || \
-      defined(__arm__)
+  #if defined(__NR_fstatat64)
     LSS_INLINE _syscall4(int,   fstatat64,        int,   d,
                          const char *,      p,
                          struct kernel_stat64 *,   b,    int,   f)
   #endif
-  #if defined(__i386__) || defined(__PPC__) ||                                \
-     (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
+  #if defined(__NR_waitpid)
+    // waitpid is polyfilled below when not available.
     LSS_INLINE _syscall3(pid_t, waitpid,          pid_t, p,
                          int*,              s,    int,   o)
   #endif
@@ -3513,12 +4237,13 @@ struct kernel_io_event {
       register unsigned long __v1 __asm__("$3");
       register unsigned long __r7 __asm__("$7");
       __asm__ __volatile__ ("syscall\n"
-                            : "=&r"(__v0), "=&r"(__v1), "+r" (__r7)
+                            : "=r"(__v0), "=r"(__v1), "=r" (__r7)
                             : "0"(__v0)
                             : "$8", "$9", "$10", "$11", "$12",
-                              "$13", "$14", "$15", "$24", "memory");
+                              "$13", "$14", "$15", "$24", "$25", "memory");
       if (__r7) {
-        LSS_ERRNO = __v0;
+        unsigned long __errnovalue = __v0;
+        LSS_ERRNO = __errnovalue;
         return -1;
       } else {
         p[0] = __v0;
@@ -3526,13 +4251,18 @@ struct kernel_io_event {
         return 0;
       }
     }
-  #else
+  #elif defined(__NR_pipe)
+    // pipe is polyfilled below when not available.
     LSS_INLINE _syscall1(int,     pipe,           int *, p)
   #endif
-  /* TODO(user): see if ppc can/should support this as well              */
-  #if defined(__i386__) || \
-      defined(__arm__) || \
-     (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64)
+  #if defined(__NR_pipe2)
+    LSS_INLINE _syscall2(int, pipe2, int *, pipefd, int, flags)
+  #endif
+  /* TODO(csilvers): see if ppc can/should support this as well              */
+  #if defined(__i386__) || defined(__ARM_ARCH_3__) ||                         \
+      defined(__ARM_EABI__) ||                                                \
+     (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64) ||                   \
+     (defined(__s390__) && !defined(__s390x__))
     #define __NR__statfs64  __NR_statfs64
     #define __NR__fstatfs64 __NR_fstatfs64
     LSS_INLINE _syscall3(int, _statfs64,     const char*, p,
@@ -3553,7 +4283,7 @@ struct kernel_io_event {
     return LSS_NAME(execve)(path, argv, (const char *const *)environ);
   }
 
-  LSS_INLINE pid_t LSS_NAME(gettid)() {
+  LSS_INLINE pid_t LSS_NAME(gettid)(void) {
     pid_t tid = LSS_NAME(_gettid)();
     if (tid != -1) {
       return tid;
@@ -3574,41 +4304,54 @@ struct kernel_io_event {
   }
 
   LSS_INLINE int LSS_NAME(ptrace_detach)(pid_t pid) {
-    return LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0);
+    /* PTRACE_DETACH can sometimes forget to wake up the tracee and it
+     * then sends job control signals to the real parent, rather than to
+     * the tracer. We reduce the risk of this happening by starting a
+     * whole new time slice, and then quickly sending a SIGCONT signal
+     * right after detaching from the tracee.
+     *
+     * We use tkill to ensure that we only issue a wakeup for the thread being
+     * detached.  Large multi threaded apps can take a long time in the kernel
+     * processing SIGCONT.
+     */
+    int rc, err;
+    LSS_NAME(sched_yield)();
+    rc = LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0);
+    err = LSS_ERRNO;
+    LSS_NAME(tkill)(pid, SIGCONT);
+    /* Old systems don't have tkill */
+    if (LSS_ERRNO == ENOSYS)
+      LSS_NAME(kill)(pid, SIGCONT);
+    LSS_ERRNO = err;
+    return rc;
   }
 
   LSS_INLINE int LSS_NAME(raise)(int sig) {
     return LSS_NAME(kill)(LSS_NAME(getpid)(), sig);
   }
 
-  LSS_INLINE int LSS_NAME(setpgrp)() {
+  LSS_INLINE int LSS_NAME(setpgrp)(void) {
     return LSS_NAME(setpgid)(0, 0);
   }
 
-  LSS_INLINE int LSS_NAME(sysconf)(int name) {
-    extern int __getpagesize(void);
-    switch (name) {
-      case _SC_OPEN_MAX: {
-        struct kernel_rlimit limit;
+  #if defined(__x86_64__)
+    /* Need to make sure loff_t isn't truncated to 32-bits under x32.  */
+    LSS_INLINE ssize_t LSS_NAME(pread64)(int f, void *b, size_t c, loff_t o) {
+      LSS_BODY(4, ssize_t, pread64, LSS_SYSCALL_ARG(f), LSS_SYSCALL_ARG(b),
+                                    LSS_SYSCALL_ARG(c), (uint64_t)(o));
+    }
 
-        /* On some systems getrlimit is obsolete, use ugetrlimit instead. */
-  #ifndef __NR_getrlimit
-        return LSS_NAME(ugetrlimit)(RLIMIT_NOFILE, &limit) < 0
-               ? 8192 : limit.rlim_cur;
-  #else
-        return LSS_NAME(getrlimit)(RLIMIT_NOFILE, &limit) < 0
-               ? 8192 : limit.rlim_cur;
-  #endif
-      }
-      case _SC_PAGESIZE:
-        return __getpagesize();
-      default:
-        LSS_ERRNO = ENOSYS;
-        return -1;
+    LSS_INLINE ssize_t LSS_NAME(pwrite64)(int f, const void *b, size_t c,
+                                          loff_t o) {
+      LSS_BODY(4, ssize_t, pwrite64, LSS_SYSCALL_ARG(f), LSS_SYSCALL_ARG(b),
+                                     LSS_SYSCALL_ARG(c), (uint64_t)(o));
     }
-  }
-  #if defined(__x86_64__) ||                                                  \
-     (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64)
+
+    LSS_INLINE int LSS_NAME(readahead)(int f, loff_t o, unsigned c) {
+      LSS_BODY(3, int, readahead, LSS_SYSCALL_ARG(f), (uint64_t)(o),
+                                  LSS_SYSCALL_ARG(c));
+    }
+  #elif defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64
     LSS_INLINE _syscall4(ssize_t, pread64,        int,         f,
                          void *,         b, size_t,   c,
                          loff_t,         o)
@@ -3621,14 +4364,32 @@ struct kernel_io_event {
     #define __NR__pread64   __NR_pread64
     #define __NR__pwrite64  __NR_pwrite64
     #define __NR__readahead __NR_readahead
-    LSS_INLINE _syscall5(ssize_t, _pread64,        int,         f,
-                         void *,         b, size_t, c, unsigned, o1,
-                         unsigned, o2)
-    LSS_INLINE _syscall5(ssize_t, _pwrite64,       int,         f,
-                         const void *,   b, size_t, c, unsigned, o1,
-                         long, o2)
-    LSS_INLINE _syscall4(int, _readahead,          int,         f,
-                         unsigned,       o1, unsigned, o2, size_t, c);
+    #if defined(__ARM_EABI__) || defined(__mips__)
+      /* On ARM and MIPS, a 64-bit parameter has to be in an even-odd register
+       * pair. Hence these calls ignore their fourth argument (r3) so that their
+       * fifth and sixth make such a pair (r4,r5).
+       */
+      #define LSS_LLARG_PAD 0,
+      LSS_INLINE _syscall6(ssize_t, _pread64,        int,         f,
+                           void *,         b, size_t, c,
+                           unsigned, skip, unsigned, o1, unsigned, o2)
+      LSS_INLINE _syscall6(ssize_t, _pwrite64,       int,         f,
+                           const void *,   b, size_t, c,
+                           unsigned, skip, unsigned, o1, unsigned, o2)
+      LSS_INLINE _syscall5(int, _readahead,          int,         f,
+                           unsigned,     skip,
+                           unsigned,       o1, unsigned, o2, size_t, c)
+    #else
+      #define LSS_LLARG_PAD
+      LSS_INLINE _syscall5(ssize_t, _pread64,        int,         f,
+                           void *,         b, size_t, c, unsigned, o1,
+                           unsigned, o2)
+      LSS_INLINE _syscall5(ssize_t, _pwrite64,       int,         f,
+                           const void *,   b, size_t, c, unsigned, o1,
+                           long, o2)
+      LSS_INLINE _syscall4(int, _readahead,          int,         f,
+                           unsigned,       o1, unsigned, o2, size_t, c)
+    #endif
     /* We force 64bit-wide parameters onto the stack, then access each
      * 32-bit component individually. This guarantees that we build the
      * correct parameters independent of the native byte-order of the
@@ -3637,41 +4398,136 @@ struct kernel_io_event {
     LSS_INLINE ssize_t LSS_NAME(pread64)(int fd, void *buf, size_t count,
                                          loff_t off) {
       union { loff_t off; unsigned arg[2]; } o = { off };
-      return LSS_NAME(_pread64)(fd, buf, count, o.arg[0], o.arg[1]);
+      return LSS_NAME(_pread64)(fd, buf, count,
+                                LSS_LLARG_PAD o.arg[0], o.arg[1]);
     }
     LSS_INLINE ssize_t LSS_NAME(pwrite64)(int fd, const void *buf,
                                           size_t count, loff_t off) {
       union { loff_t off; unsigned arg[2]; } o = { off };
-      return LSS_NAME(_pwrite64)(fd, buf, count, o.arg[0], o.arg[1]);
+      return LSS_NAME(_pwrite64)(fd, buf, count,
+                                 LSS_LLARG_PAD o.arg[0], o.arg[1]);
     }
     LSS_INLINE int LSS_NAME(readahead)(int fd, loff_t off, int len) {
       union { loff_t off; unsigned arg[2]; } o = { off };
-      return LSS_NAME(_readahead)(fd, o.arg[0], o.arg[1], len);
+      return LSS_NAME(_readahead)(fd, LSS_LLARG_PAD o.arg[0], o.arg[1], len);
     }
   #endif
-  #if defined(__NR_io_setup)
-    LSS_INLINE _syscall2(int, io_setup,
-                         int,                     maxevents,
-                         unsigned long *,         ctxp);
-    LSS_INLINE _syscall3(int, io_submit,
-                         unsigned long,           ctx_id,
-                         long,                    nr,
-                         struct kernel_iocb **,   ios);
-    LSS_INLINE _syscall5(int, io_getevents,
-                         unsigned long,            ctx_id,
-                         long,                     min_nr,
-                         long,                     nr,
-                         struct kernel_io_event *, events,
-                         struct kernel_timespec*,  timeout);
-    LSS_INLINE _syscall1(int, io_destroy,
-                         unsigned long,            ctx);
-    LSS_INLINE _syscall3(int, io_cancel,
-                         unsigned long,            ctx_id,
-                         struct kernel_iocb*,      iocb,
-                         struct kernel_io_event*,  result);
-  #endif
 #endif
 
+/*
+ * Polyfills for deprecated syscalls.
+ */
+
+#if !defined(__NR_dup2)
+  LSS_INLINE int LSS_NAME(dup2)(int s, int d) {
+    return LSS_NAME(dup3)(s, d, 0);
+  }
+#endif
+
+#if !defined(__NR_open)
+  LSS_INLINE int LSS_NAME(open)(const char *pathname, int flags, int mode) {
+    return LSS_NAME(openat)(AT_FDCWD, pathname, flags, mode);
+  }
+#endif
+
+#if !defined(__NR_unlink)
+  LSS_INLINE int LSS_NAME(unlink)(const char *pathname) {
+    return LSS_NAME(unlinkat)(AT_FDCWD, pathname, 0);
+  }
+#endif
+
+#if !defined(__NR_readlink)
+  LSS_INLINE int LSS_NAME(readlink)(const char *pathname, char *buffer,
+                                    size_t size) {
+    return LSS_NAME(readlinkat)(AT_FDCWD, pathname, buffer, size);
+  }
+#endif
+
+#if !defined(__NR_pipe)
+  LSS_INLINE int LSS_NAME(pipe)(int *pipefd) {
+    return LSS_NAME(pipe2)(pipefd, 0);
+  }
+#endif
+
+#if !defined(__NR_poll)
+  LSS_INLINE int LSS_NAME(poll)(struct kernel_pollfd *fds, unsigned int nfds,
+                                int timeout) {
+   struct kernel_timespec timeout_ts;
+   struct kernel_timespec *timeout_ts_p = NULL;
+
+    if (timeout >= 0) {
+      timeout_ts.tv_sec = timeout / 1000;
+      timeout_ts.tv_nsec = (timeout % 1000) * 1000000;
+      timeout_ts_p = &timeout_ts;
+    }
+    return LSS_NAME(ppoll)(fds, nfds, timeout_ts_p, NULL, 0);
+  }
+#endif
+
+#if !defined(__NR_stat)
+  LSS_INLINE int LSS_NAME(stat)(const char *pathname,
+                                struct kernel_stat *buf) {
+    return LSS_NAME(newfstatat)(AT_FDCWD, pathname, buf, 0);
+  }
+#endif
+
+#if !defined(__NR_waitpid)
+  LSS_INLINE pid_t LSS_NAME(waitpid)(pid_t pid, int *status, int options) {
+    return LSS_NAME(wait4)(pid, status, options, 0);
+  }
+#endif
+
+#if !defined(__NR_fork)
+// TODO: define this in an arch-independent way instead of inlining the clone
+//       syscall body.
+
+# if defined(__aarch64__)
+  LSS_INLINE pid_t LSS_NAME(fork)(void) {
+    // No fork syscall on aarch64 - implement by means of the clone syscall.
+    // Note that this does not reset glibc's cached view of the PID/TID, so
+    // some glibc interfaces might go wrong in the forked subprocess.
+    int flags = SIGCHLD;
+    void *child_stack = NULL;
+    void *parent_tidptr = NULL;
+    void *newtls = NULL;
+    void *child_tidptr = NULL;
+
+    LSS_REG(0, flags);
+    LSS_REG(1, child_stack);
+    LSS_REG(2, parent_tidptr);
+    LSS_REG(3, newtls);
+    LSS_REG(4, child_tidptr);
+    LSS_BODY(pid_t, clone, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3),
+             "r"(__r4));
+  }
+# elif defined(__x86_64__)
+  LSS_INLINE pid_t LSS_NAME(fork)(void) {
+    // Android disallows the fork syscall on x86_64 - implement by means of the
+    // clone syscall as above for aarch64.
+    int flags = SIGCHLD;
+    void *child_stack = NULL;
+    void *parent_tidptr = NULL;
+    void *newtls = NULL;
+    void *child_tidptr = NULL;
+
+    LSS_BODY(5, pid_t, clone, LSS_SYSCALL_ARG(flags),
+             LSS_SYSCALL_ARG(child_stack), LSS_SYSCALL_ARG(parent_tidptr),
+             LSS_SYSCALL_ARG(newtls), LSS_SYSCALL_ARG(child_tidptr));
+  }
+# else
+#  error missing fork polyfill for this architecture
+# endif
+#endif
+
+/* These restore the original values of these macros saved by the
+ * corresponding #pragma push_macro near the top of this file. */
+#pragma pop_macro("stat64")
+#pragma pop_macro("fstat64")
+#pragma pop_macro("lstat64")
+#pragma pop_macro("pread64")
+#pragma pop_macro("pwrite64")
+#pragma pop_macro("getdents64")
+
 #if defined(__cplusplus) && !defined(SYS_CPLUSPLUS)
 }
 #endif
diff --git a/be/src/gutil/spinlock_linux-inl.h b/be/src/gutil/spinlock_linux-inl.h
index c9838e4..2e66428 100644
--- a/be/src/gutil/spinlock_linux-inl.h
+++ b/be/src/gutil/spinlock_linux-inl.h
@@ -51,15 +51,10 @@ static struct InitModule {
     int x = 0;
     // futexes are ints, so we can use them only when
     // that's the same size as the lockword_ in SpinLock.
-#ifdef __arm__
-    // ARM linux doesn't support sys_futex1(void*, int, int, struct timespec*);
-    have_futex = 0;
-#else
     have_futex = (sizeof (Atomic32) == sizeof (int) &&
-                  sys_futex(&x, FUTEX_WAKE, 1, 0) >= 0);
-#endif
+                  sys_futex(&x, FUTEX_WAKE, 1, NULL, NULL, 0) >= 0);
     if (have_futex &&
-        sys_futex(&x, FUTEX_WAKE | futex_private_flag, 1, 0) < 0) {
+        sys_futex(&x, FUTEX_WAKE | futex_private_flag, 1, NULL, NULL, 0) < 0) {
       futex_private_flag = 0;
     }
   }
@@ -85,7 +80,8 @@ void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) {
       tm.tv_nsec *= 16;  // increase the delay; we expect explicit wakeups
       sys_futex(reinterpret_cast<int *>(const_cast<Atomic32 *>(w)),
                 FUTEX_WAIT | futex_private_flag,
-                value, reinterpret_cast<struct kernel_timespec *>(&tm));
+                value, reinterpret_cast<struct kernel_timespec *>(&tm),
+                NULL, 0);
     } else {
       nanosleep(&tm, NULL);
     }
@@ -96,9 +92,11 @@ void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) {
 void SpinLockWake(volatile Atomic32 *w, bool all) {
   if (have_futex) {
     sys_futex(reinterpret_cast<int *>(const_cast<Atomic32 *>(w)),
-              FUTEX_WAKE | futex_private_flag, all? INT_MAX : 1, 0);
+              FUTEX_WAKE | futex_private_flag, all? INT_MAX : 1,
+              NULL, NULL, 0);
   }
 }
 
 } // namespace internal
 } // namespace base
+
diff --git a/be/src/kudu/util/debug-util.cc b/be/src/kudu/util/debug-util.cc
index 03556d6..2a46735 100644
--- a/be/src/kudu/util/debug-util.cc
+++ b/be/src/kudu/util/debug-util.cc
@@ -163,6 +163,7 @@ class CompletionFlag {
     sys_futex(reinterpret_cast<int32_t*>(&complete_),
               FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
               INT_MAX, // wake all
+	      NULL, NULL,
               0 /* ignored */);
 #endif
   }
@@ -181,7 +182,7 @@ class CompletionFlag {
       sys_futex(reinterpret_cast<int32_t*>(&complete_),
                 FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
                 0, // wait if value is still 0
-                reinterpret_cast<struct kernel_timespec *>(&ts));
+                reinterpret_cast<struct kernel_timespec *>(&ts), NULL, 0);
 #else
       sched_yield();
 #endif

[impala] 01/02: IMPALA-9373: more tactical IWYU fixes

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit da5b498c18ba1a22b122682da73481af633a3398
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Mon Mar 9 15:56:37 2020 -0700

    IMPALA-9373: more tactical IWYU fixes
    
    This is a grab-bag of fixes that I made using a mix of manual
    inspection and tooling. The techniques used were:
    * Getting preprocessor output for a few files by modifying
      command lines from compile_commands.json to include -E.
      This is revealing because you see all the random unrelated
      cruft that gets pulled in. A useful one liner to extract
      an (approximate) list of headers from preprocessor output is:
      grep '^#.*h' be/src/util/CMakeFiles/Util.dir/os-info.cc.i | \
          grep -o '".*"' | sort -u
    * Looking at the IWYU recommendations for guidance on what
      headers can be removed (and what need to be added).
    * Grepping for includes of headers, especially in other headers
      where they become viral. An example one-liner to find these:
      git grep -l 'include.*<iostream>' | grep '\.h$'
    
    Non-exhaustive list of changes made:
    -----------------------------------
    Unnest classes from TmpFileMgr so we can forward-declare them.
    This lets us remove tmp-file-mgr.h from buffer-pool.h and
    query-state.h, which are both widely included headers in the
    codebase.
    
    Also remove webserver.h from other headers, since it
    pulls in openssl-util.h and consequently a lot of
    openssl headers.
    
    Avoid including runtime/multi-precision.h in other headers.
    It pulls in a lot of boost multiprecision headers that
    are only needed for internal implementations of math
    and decimal operations. This required replacing some
    references to int128_t with __int128_t, which I don't
    think significantly hurts code readability.
    
    Also remove references to decimal-util.h where they're
    not needed, since it transitively pulls in
    multi-precision.h
    
    Reduce includes of boost/date_time modules, which are
    transitively included in many places via timestamp-value.h.
    
    Remove transitive dependencies of timestamp-value.h
    to avoid pulling in remaining boost date_time headers
    where not needed. Dependent headers are:
    scalar-expr-evaluator.h, expr-value.h
    
    Remove references to debug-util.h in other headers,
    because it pulls in a lot of thread headers.
    
    Remove references to llvm-codegen.h where possible,
    because it pulls in many llvm headers.
    
    Other opportunities:
    --------------------
    * boost/algorithm/string.hpp includes many string algorithms
      and pulls in a lot of headers.
    * util/string-parser.h is a giant header with many dependencies.
    * There's lots of redundancy between boost and standard c++
      headers. Both pull in vast numbers of utility headers for
      C++ metaprogramming and similar things. If we reduced virality
      of boost headers this would help a lot, and also if we switch
      to equivalent standard headers where possible (e.g. unordered_map,
      unordered_set, function, bind, etc).
    
    Compile time with clang/ASAN:
    -----------------------------
    Before:
    real    9m6.311s
    user    62m25.006s
    sys     2m44.798s
    
    After:
    real    8m17.073s
    user    55m38.425s
    sys     2m25.808s
    
    Change-Id: I8de71866bdf3211e53560d9bfe930e7657c4d7f1
    Reviewed-on: http://gerrit.cloudera.org:8080/15248
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/benchmarks/atod-benchmark.cc                |   1 +
 be/src/benchmarks/bloom-filter-benchmark.cc        |   1 +
 be/src/benchmarks/overflow-benchmark.cc            |  18 +-
 be/src/codegen/codegen-anyval.cc                   |   3 +-
 be/src/codegen/codegen-anyval.h                    |   3 +-
 be/src/codegen/llvm-codegen.cc                     |   2 +-
 be/src/common/init.cc                              |   1 -
 be/src/common/logging.cc                           |  42 ++
 be/src/common/logging.h                            |   9 +-
 be/src/common/status.cc                            |  11 +-
 be/src/common/thread-debug-info-test.cc            |   1 +
 be/src/common/thread-debug-info.h                  |   8 +-
 be/src/exec/aggregator.cc                          |   1 +
 be/src/exec/blocking-plan-root-sink.cc             |   2 +
 be/src/exec/buffered-plan-root-sink.cc             |   2 +
 be/src/exec/catalog-op-executor.cc                 |   1 +
 be/src/exec/data-sink.cc                           |   1 +
 be/src/exec/data-sink.h                            |   2 +-
 be/src/exec/exec-node.cc                           |   2 +-
 be/src/exec/exec-node.h                            |  15 +-
 be/src/exec/filter-context.cc                      |   1 +
 be/src/exec/filter-context.h                       |   7 +-
 be/src/exec/grouping-aggregator.cc                 |   1 +
 be/src/exec/hash-table-test.cc                     |   1 -
 be/src/exec/hdfs-avro-scanner-ir.cc                |   2 +-
 be/src/exec/hdfs-columnar-scanner-ir.cc            |   2 +
 be/src/exec/hdfs-columnar-scanner.cc               |   4 +
 be/src/exec/hdfs-columnar-scanner.h                |  12 +-
 be/src/exec/hdfs-orc-scanner.cc                    |   2 +
 be/src/exec/hdfs-scan-node.cc                      |   1 +
 be/src/exec/hdfs-scanner.cc                        |   1 +
 be/src/exec/hdfs-sequence-scanner.cc               |  34 +-
 be/src/exec/hdfs-text-scanner.cc                   |  36 +-
 be/src/exec/join-builder.cc                        |   3 +
 be/src/exec/kudu-scan-node.cc                      |   1 +
 be/src/exec/kudu-scanner.cc                        |   1 +
 be/src/exec/kudu-table-sink.cc                     |   1 +
 be/src/exec/kudu-table-sink.h                      |   4 +-
 be/src/exec/orc-column-readers.cc                  |   7 +-
 be/src/exec/orc-column-readers.h                   |   1 +
 be/src/exec/parquet/hdfs-parquet-scanner.cc        |   7 +-
 be/src/exec/parquet/hdfs-parquet-scanner.h         |   1 -
 be/src/exec/parquet/parquet-column-chunk-reader.cc |   1 +
 be/src/exec/parquet/parquet-column-chunk-reader.h  |   3 +-
 be/src/exec/parquet/parquet-column-readers.cc      |   3 +
 be/src/exec/parquet/parquet-common.h               |   2 +
 be/src/exec/parquet/parquet-version-test.cc        |   1 -
 be/src/exec/partitioned-hash-join-builder-ir.cc    |  16 +-
 be/src/exec/partitioned-hash-join-builder.cc       |   1 +
 be/src/exec/plan-root-sink.cc                      |   2 +
 be/src/exec/read-write-util-test.cc                |   1 -
 be/src/exec/row-batch-cache.h                      |   1 -
 be/src/exec/row-batch-list-test.cc                 |   1 -
 be/src/exec/scan-node.cc                           |   2 +
 be/src/exec/topn-node.cc                           |   1 +
 be/src/exec/zigzag-test.cc                         |   1 -
 be/src/exprs/agg-fn-evaluator.cc                   |  23 +-
 be/src/exprs/agg-fn.h                              |   4 +
 be/src/exprs/aggregate-functions-ir.cc             |  17 +-
 be/src/exprs/aggregate-functions-test.cc           |   5 +-
 be/src/exprs/anyval-util.cc                        |   3 -
 be/src/exprs/anyval-util.h                         |   1 -
 be/src/exprs/decimal-functions-ir.cc               |   7 +-
 be/src/exprs/decimal-operators-ir.cc               |   2 +-
 be/src/exprs/expr-test.cc                          |   4 +-
 be/src/exprs/expr-value.h                          |  14 +-
 be/src/exprs/hive-udf-call.cc                      |   3 +-
 be/src/exprs/literal.cc                            |   1 +
 be/src/exprs/math-functions-ir.cc                  |   7 +-
 be/src/exprs/scalar-expr-evaluator.h               |   8 +
 be/src/exprs/timestamp-functions-ir.cc             |   5 +-
 be/src/exprs/timezone_db.cc                        |  12 +-
 be/src/exprs/udf-builtins.cc                       |  11 +
 be/src/rpc/thrift-util-test.cc                     |   2 -
 be/src/runtime/CMakeLists.txt                      |   1 -
 be/src/runtime/bufferpool/buffer-allocator.h       |   2 +
 be/src/runtime/bufferpool/buffer-pool-internal.h   |  17 +-
 be/src/runtime/bufferpool/buffer-pool-test.cc      |  29 +-
 be/src/runtime/bufferpool/buffer-pool.cc           |  10 +-
 be/src/runtime/bufferpool/buffer-pool.h            |   8 +-
 be/src/runtime/client-cache.h                      |  25 +-
 be/src/runtime/coordinator-backend-state.cc        |   1 +
 .../runtime/datetime-iso-sql-format-tokenizer.cc   |  10 +
 be/src/runtime/datetime-parser-common.cc           |   1 +
 be/src/runtime/datetime-parser-common.h            |   2 +
 be/src/runtime/decimal-test.cc                     |  13 +-
 be/src/runtime/decimal-value.h                     |   4 +-
 be/src/runtime/decimal-value.inline.h              |  48 +-
 be/src/runtime/descriptors.h                       |  16 +-
 be/src/runtime/exec-env.cc                         |   1 +
 be/src/runtime/io/data-cache-test.cc               |   1 -
 be/src/runtime/io/disk-io-mgr-test.cc              |   7 +-
 be/src/runtime/mem-tracker.h                       |  18 +-
 be/src/runtime/multi-precision.cc                  |  66 ---
 be/src/runtime/multi-precision.h                   |  36 +-
 be/src/runtime/query-state.cc                      |   3 +-
 be/src/runtime/query-state.h                       |   6 +-
 be/src/runtime/runtime-filter-bank.cc              |   1 +
 be/src/runtime/sorted-run-merger.cc                |   1 +
 be/src/runtime/sorted-run-merger.h                 |   4 +-
 be/src/runtime/sorter.cc                           |   1 +
 be/src/runtime/sorter.h                            |   2 +-
 be/src/runtime/test-env.cc                         |   2 +
 be/src/runtime/test-env.h                          |   2 -
 be/src/runtime/timestamp-parse-util.cc             |  22 +-
 be/src/runtime/timestamp-test.cc                   |   2 +-
 be/src/runtime/timestamp-value.h                   |  23 +-
 be/src/runtime/tmp-file-mgr-internal.h             |  17 +-
 be/src/runtime/tmp-file-mgr-test.cc                | 102 ++--
 be/src/runtime/tmp-file-mgr.cc                     | 118 ++--
 be/src/runtime/tmp-file-mgr.h                      | 624 +++++++++++----------
 be/src/runtime/tuple.h                             |   3 +
 be/src/service/client-request-state-map.cc         |   1 +
 be/src/service/client-request-state.cc             |   2 +
 be/src/service/client-request-state.h              |   1 -
 be/src/service/control-service.h                   |   1 -
 be/src/service/impala-beeswax-server.cc            |   1 +
 be/src/service/impala-http-handler.cc              |   1 +
 be/src/service/impala-internal-service.cc          |   1 +
 be/src/service/impala-server.h                     |   3 +-
 be/src/statestore/statestore.h                     |   1 -
 be/src/udf/uda-test.cc                             |   2 -
 be/src/util/CMakeLists.txt                         |   1 -
 be/src/util/arithmetic-util.h                      |  27 +
 be/src/util/auth-util.cc                           |   1 +
 be/src/util/auth-util.h                            |   1 -
 be/src/util/bit-stream-utils-test.cc               |   2 +
 be/src/util/bit-util-test.cc                       |  47 +-
 be/src/util/bit-util.h                             |  62 --
 be/src/util/bitmap-test.cc                         |   3 -
 be/src/util/bloom-filter.h                         |   1 -
 be/src/util/cgroup-util.cc                         |   1 -
 be/src/util/codec.cc                               |   3 +
 be/src/util/codec.h                                |   6 +-
 be/src/util/debug-util.h                           |   6 +-
 .../util/{decimal-util.cc => decimal-constants.h}  |  22 +-
 be/src/util/decimal-util.h                         |  14 +-
 be/src/util/decompress.cc                          |   1 +
 be/src/util/decompress.h                           |   1 -
 be/src/util/dict-test.cc                           |   5 -
 be/src/util/event-metrics.h                        |   1 -
 be/src/util/logging-support-test.cc                |   1 -
 be/src/util/mem-info.cc                            |   2 -
 be/src/util/memory-metrics.h                       |   2 -
 be/src/util/metrics.h                              |  14 +-
 be/src/util/os-info.cc                             |   7 +-
 be/src/util/pretty-printer.h                       |   3 +-
 be/src/util/proc-info-test.cc                      |   8 -
 be/src/util/process-state-info.cc                  |   1 -
 be/src/util/redactor-test-utils.h                  |   2 -
 be/src/util/runtime-profile.cc                     |   1 +
 be/src/util/string-parser-test.cc                  |   1 -
 be/src/util/string-parser.h                        |   3 +
 be/src/util/symbols-util-test.cc                   |   2 -
 be/src/util/system-state-info.cc                   |   4 +-
 be/src/util/tuple-row-compare.h                    |   1 -
 be/src/util/uid-util-test.cc                       |   4 -
 be/src/util/uid-util.cc                            |   2 +
 be/src/util/uid-util.h                             |  12 +-
 159 files changed, 1069 insertions(+), 890 deletions(-)

diff --git a/be/src/benchmarks/atod-benchmark.cc b/be/src/benchmarks/atod-benchmark.cc
index 1e86844..0afc531 100644
--- a/be/src/benchmarks/atod-benchmark.cc
+++ b/be/src/benchmarks/atod-benchmark.cc
@@ -23,6 +23,7 @@
 #include "runtime/string-value.h"
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
+#include "util/decimal-util.h"
 #include "util/string-parser.h"
 
 #include "common/names.h"
diff --git a/be/src/benchmarks/bloom-filter-benchmark.cc b/be/src/benchmarks/bloom-filter-benchmark.cc
index 1e4938d..a3aa2f7 100644
--- a/be/src/benchmarks/bloom-filter-benchmark.cc
+++ b/be/src/benchmarks/bloom-filter-benchmark.cc
@@ -21,6 +21,7 @@
 #include <iostream>
 #include <vector>
 
+#include "gen-cpp/data_stream_service.pb.h"
 #include "kudu/rpc/rpc_controller.h"
 #include "runtime/bufferpool/buffer-allocator.h"
 #include "runtime/bufferpool/reservation-tracker.h"
diff --git a/be/src/benchmarks/overflow-benchmark.cc b/be/src/benchmarks/overflow-benchmark.cc
index c14b74c..72b6547 100644
--- a/be/src/benchmarks/overflow-benchmark.cc
+++ b/be/src/benchmarks/overflow-benchmark.cc
@@ -24,6 +24,8 @@
 #include "runtime/string-value.h"
 #include "util/benchmark.h"
 #include "util/cpu-info.h"
+#include "util/decimal-constants.h"
+#include "util/decimal-util.h"
 #include "util/string-parser.h"
 
 #include "common/names.h"
@@ -101,14 +103,14 @@ static bool AdjustToSameScale(const Decimal16Value& x, int x_scale,
     *y_scaled = y.value();
   } else if (delta_scale > 0) {
     if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION &&
-        DecimalUtil::MAX_UNSCALED_DECIMAL16 / scale_factor < abs(y.value())) {
+        MAX_UNSCALED_DECIMAL16 / scale_factor < abs(y.value())) {
       return true;
     }
     *x_scaled = x.value();
     *y_scaled = y.value() * scale_factor;
   } else {
     if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION &&
-        DecimalUtil::MAX_UNSCALED_DECIMAL16 / scale_factor < abs(x.value())) {
+        MAX_UNSCALED_DECIMAL16 / scale_factor < abs(x.value())) {
       return true;
     }
     *x_scaled = x.value() * scale_factor;
@@ -150,7 +152,7 @@ DecimalValue<RESULT_T> BuiltinAdd(const Decimal16Value& val, int this_scale,
   if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION) {
     RESULT_T result = 0;
     *overflow |= __builtin_add_overflow(x, y, &result);
-    *overflow |= abs(result) > DecimalUtil::MAX_UNSCALED_DECIMAL16;
+    *overflow |= abs(result) > MAX_UNSCALED_DECIMAL16;
     return DecimalValue<RESULT_T>(result);
   } else {
     DCHECK(!*overflow) << "Cannot overflow unless result is Decimal16Value";
@@ -174,7 +176,7 @@ DecimalValue<RESULT_T> AddLookupTbl(const Decimal16Value& val, int this_scale,
         result_precision == ColumnType::MAX_PRECISION) {
       // Can only overflow if the signs are the same and result precision reaches
       // max precision.
-      *overflow |= DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(x) < abs(y);
+      *overflow |= MAX_UNSCALED_DECIMAL16 - abs(x) < abs(y);
       // TODO: faster to return here? We don't care at all about the perf on
       // the overflow case but what makes the normal path faster?
     }
@@ -199,7 +201,7 @@ DecimalValue<RESULT_T> Add(const Decimal16Value& val, int this_scale,
         result_precision == ColumnType::MAX_PRECISION) {
       // Can only overflow if the signs are the same and result precision reaches
       // max precision.
-      *overflow |= DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(x) < abs(y);
+      *overflow |= MAX_UNSCALED_DECIMAL16 - abs(x) < abs(y);
       // TODO: faster to return here? We don't care at all about the perf on
       // the overflow case but what makes the normal path faster?
     }
@@ -263,7 +265,7 @@ DecimalValue<RESULT_T> BuiltinMultiply(const Decimal16Value& val, int this_scale
   if (sizeof(RESULT_T) == 16 && result_precision == ColumnType::MAX_PRECISION) {
     // Check overflow
     *overflow |= __builtin_mul_overflow(x, y, &result);
-    *overflow |= abs(result) > DecimalUtil::MAX_UNSCALED_DECIMAL16;
+    *overflow |= abs(result) > MAX_UNSCALED_DECIMAL16;
   } else {
     result = x * y;
   }
@@ -299,7 +301,7 @@ DecimalValue<RESULT_T> MultiplyCheckMSB(const Decimal16Value& val, int this_scal
     // Check overflow
     if (result_precision == ColumnType::MAX_PRECISION &&
         DecimalUtil::Clz(abs(x)) + DecimalUtil::Clz(abs(y)) < 130) {
-      *overflow |= DecimalUtil::MAX_UNSCALED_DECIMAL16 / abs(y) < abs(x);
+      *overflow |= MAX_UNSCALED_DECIMAL16 / abs(y) < abs(x);
     }
   }
   RESULT_T result = x * y;
@@ -332,7 +334,7 @@ DecimalValue<RESULT_T> Multiply(const Decimal16Value& val, int this_scale,
   if (sizeof(RESULT_T) == 16) {
     // Check overflow
     if (result_precision == ColumnType::MAX_PRECISION) {
-      *overflow |= DecimalUtil::MAX_UNSCALED_DECIMAL16 / abs(y) < abs(x);
+      *overflow |= MAX_UNSCALED_DECIMAL16 / abs(y) < abs(x);
     }
   }
   RESULT_T result = x * y;
diff --git a/be/src/codegen/codegen-anyval.cc b/be/src/codegen/codegen-anyval.cc
index c702315..66d79e7 100644
--- a/be/src/codegen/codegen-anyval.cc
+++ b/be/src/codegen/codegen-anyval.cc
@@ -18,6 +18,7 @@
 #include "codegen/codegen-anyval.h"
 
 #include "codegen/codegen-util.h"
+#include "runtime/multi-precision.h"
 #include "runtime/raw-value.h"
 #include "common/names.h"
 
@@ -408,7 +409,7 @@ void CodegenAnyVal::SetVal(int64_t val) {
   SetVal(builder_->getInt64(val));
 }
 
-void CodegenAnyVal::SetVal(int128_t val) {
+void CodegenAnyVal::SetVal(__int128_t val) {
   DCHECK_EQ(type_.type, TYPE_DECIMAL);
   vector<uint64_t> vals({LowBits(val), HighBits(val)});
   llvm::Value* ir_val =
diff --git a/be/src/codegen/codegen-anyval.h b/be/src/codegen/codegen-anyval.h
index 91b5594..ded24a5 100644
--- a/be/src/codegen/codegen-anyval.h
+++ b/be/src/codegen/codegen-anyval.h
@@ -20,7 +20,6 @@
 
 #include "codegen/llvm-codegen.h"
 #include "runtime/descriptors.h"
-#include "runtime/multi-precision.h"
 
 namespace llvm {
 class Type;
@@ -166,7 +165,7 @@ class CodegenAnyVal {
   void SetVal(int16_t val);
   void SetVal(int32_t val);
   void SetVal(int64_t val);
-  void SetVal(int128_t val);
+  void SetVal(__int128_t val);
   void SetVal(float val);
   void SetVal(double val);
 
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index 4975192..0d14bdf 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -18,7 +18,6 @@
 #include "codegen/llvm-codegen.h"
 
 #include <fstream>
-#include <iostream>
 #include <sstream>
 #include <unordered_set>
 
@@ -75,6 +74,7 @@
 #include "runtime/string-value.h"
 #include "runtime/timestamp-value.h"
 #include "util/cpu-info.h"
+#include "util/debug-util.h"
 #include "util/hdfs-util.h"
 #include "util/path-builder.h"
 #include "util/runtime-profile-counters.h"
diff --git a/be/src/common/init.cc b/be/src/common/init.cc
index 022406b..e59c11b 100644
--- a/be/src/common/init.cc
+++ b/be/src/common/init.cc
@@ -42,7 +42,6 @@
 #include "util/cgroup-util.h"
 #include "util/cpu-info.h"
 #include "util/debug-util.h"
-#include "util/decimal-util.h"
 #include "util/disk-info.h"
 #include "util/jni-util.h"
 #include "util/logging-support.h"
diff --git a/be/src/common/logging.cc b/be/src/common/logging.cc
index f4a1de8..7e9e4f7 100644
--- a/be/src/common/logging.cc
+++ b/be/src/common/logging.cc
@@ -21,6 +21,7 @@
 #include <cerrno>
 #include <ctime>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
 #include <map>
 #include <mutex>
@@ -32,6 +33,7 @@
 
 #include "common/logging.h"
 #include "service/impala-server.h"
+#include "util/debug-util.h"
 #include "util/error-util.h"
 #include "util/logging-support.h"
 #include "util/redactor.h"
@@ -221,3 +223,43 @@ void impala::CheckAndRotateAuditEventLogFiles(int max_log_files) {
 
   impala::LoggingSupport::DeleteOldLogs(fname, max_log_files);
 }
+
+static const uint32_t ONE_BILLION = 1000000000;
+
+// Print the value in base 10 by converting v into parts that are base
+// 1 billion (large multiple of 10 that's easy to work with).
+ostream& impala::operator<<(ostream& os, const __int128_t& val) {
+  __int128_t v = val;
+  if (v == 0) {
+    os << "0";
+    return os;
+  }
+
+  if (v < 0) {
+    v = -v;
+    os << "-";
+  }
+
+  // 1B^5 covers the range for __int128_t
+  // parts[0] is the least significant place.
+  uint32_t parts[5];
+  int index = 0;
+  while (v > 0) {
+    parts[index++] = v % ONE_BILLION;
+    v /= ONE_BILLION;
+  }
+  --index;
+
+  // Accumulate into a temporary stringstream so format options on 'os' do
+  // not mess up printing val.
+  // TODO: This is likely pretty expensive with the string copies. We don't
+  // do this in paths we care about currently but might need to revisit.
+  stringstream ss;
+  ss << parts[index];
+  for (int i = index - 1; i >= 0; --i) {
+    // The remaining parts need to be padded with leading zeros.
+    ss << setfill('0') << setw(9) << parts[i];
+  }
+  os << ss.str();
+  return os;
+}
diff --git a/be/src/common/logging.h b/be/src/common/logging.h
index dc926e3..d23d391 100644
--- a/be/src/common/logging.h
+++ b/be/src/common/logging.h
@@ -68,10 +68,10 @@
   DCHECK(a == b) << "[ " #a " = " << static_cast<int>(a) << " , " #b " = " \
                  << static_cast<int>(b) << " ]"
 
+namespace impala {
 /// IR modules don't use these methods, and can't see the google namespace used in
 /// GetFullLogFilename()'s prototype.
 #ifndef IR_COMPILE
-namespace impala {
 
 /// glog doesn't allow multiple invocations of InitGoogleLogging(). This method
 /// conditionally calls InitGoogleLogging() only if it hasn't been called before.
@@ -96,8 +96,13 @@ void CheckAndRotateLogFiles(int max_log_files);
 /// directory and removes the oldest ones given an upper bound of number of audit event
 /// logfiles to keep.
 void CheckAndRotateAuditEventLogFiles(int max_log_files);
-}
 
 #endif // IR_COMPILE
 
+/// Prints 'val' in base 10.
+/// Defined here so that __int128_t can be used in log messages (the C++ standard library
+/// does not provide support for __int128_t by default).
+std::ostream& operator<<(std::ostream& os, const __int128_t& val);
+
+} // namespace impala
 #endif
diff --git a/be/src/common/status.cc b/be/src/common/status.cc
index eca2104..7e8febc 100644
--- a/be/src/common/status.cc
+++ b/be/src/common/status.cc
@@ -15,15 +15,18 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <boost/algorithm/string/join.hpp>
+#include "common/status.h"
+
 #include <ostream>
 
-#include "common/status.h"
+#include <boost/algorithm/string/join.hpp>
 
+#include "gen-cpp/ErrorCodes_types.h"
+#include "gen-cpp/common.pb.h"
 #include "util/debug-util.h"
+#include "util/error-util.h"
+
 #include "common/names.h"
-#include "gen-cpp/common.pb.h"
-#include "gen-cpp/ErrorCodes_types.h"
 
 using namespace apache::hive::service::cli::thrift;
 
diff --git a/be/src/common/thread-debug-info-test.cc b/be/src/common/thread-debug-info-test.cc
index 7b1ee52..33a02de 100644
--- a/be/src/common/thread-debug-info-test.cc
+++ b/be/src/common/thread-debug-info-test.cc
@@ -20,6 +20,7 @@
 #include "common/thread-debug-info.h"
 #include "testutil/gtest-util.h"
 #include "util/container-util.h"
+#include "util/debug-util.h"
 #include "util/thread.h"
 
 #include "common/names.h"
diff --git a/be/src/common/thread-debug-info.h b/be/src/common/thread-debug-info.h
index 4b19db3..a2be4f3 100644
--- a/be/src/common/thread-debug-info.h
+++ b/be/src/common/thread-debug-info.h
@@ -18,19 +18,17 @@
 #ifndef IMPALA_COMMON_THREAD_DEBUG_INFO_H
 #define IMPALA_COMMON_THREAD_DEBUG_INFO_H
 
+#include <cstdint>
 #include <string>
-#include <sys/syscall.h>
+#include <syscall.h>
 #include <unistd.h>
 
-#include "glog/logging.h"
+#include "gen-cpp/Types_types.h"
 #include "gutil/macros.h"
 #include "gutil/strings/util.h"
-#include "util/debug-util.h"
 
 namespace impala {
 
-class ScopedThreadContext;
-
 /// Stores information about the current thread that can be useful in a debug session.
 /// An object of this class needs to be allocated on the stack in order to include
 /// it in minidumps. While this object is alive, it is available through the global
diff --git a/be/src/exec/aggregator.cc b/be/src/exec/aggregator.cc
index 1347670..705102b 100644
--- a/be/src/exec/aggregator.cc
+++ b/be/src/exec/aggregator.cc
@@ -25,6 +25,7 @@
 #include "exprs/agg-fn-evaluator.h"
 #include "exprs/expr-value.h"
 #include "exprs/scalar-expr.h"
+#include "exprs/scalar-expr-evaluator.h"
 #include "gutil/strings/substitute.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem-pool.h"
diff --git a/be/src/exec/blocking-plan-root-sink.cc b/be/src/exec/blocking-plan-root-sink.cc
index 5db0ebe..714f105 100644
--- a/be/src/exec/blocking-plan-root-sink.cc
+++ b/be/src/exec/blocking-plan-root-sink.cc
@@ -21,7 +21,9 @@
 #include "runtime/row-batch.h"
 #include "runtime/tuple-row.h"
 #include "service/query-result-set.h"
+#include "util/debug-util.h"
 #include "util/pretty-printer.h"
+#include "util/runtime-profile-counters.h"
 
 #include <memory>
 #include <mutex>
diff --git a/be/src/exec/buffered-plan-root-sink.cc b/be/src/exec/buffered-plan-root-sink.cc
index c365f69..277eac6 100644
--- a/be/src/exec/buffered-plan-root-sink.cc
+++ b/be/src/exec/buffered-plan-root-sink.cc
@@ -17,6 +17,8 @@
 
 #include "exec/buffered-plan-root-sink.h"
 #include "service/query-result-set.h"
+#include "util/debug-util.h"
+#include "util/runtime-profile-counters.h"
 
 #include "common/names.h"
 
diff --git a/be/src/exec/catalog-op-executor.cc b/be/src/exec/catalog-op-executor.cc
index 1ed2ef0..a0566fd 100644
--- a/be/src/exec/catalog-op-executor.cc
+++ b/be/src/exec/catalog-op-executor.cc
@@ -28,6 +28,7 @@
 #include "service/frontend.h"
 #include "service/impala-server.h"
 #include "service/hs2-util.h"
+#include "util/debug-util.h"
 #include "util/runtime-profile-counters.h"
 #include "util/string-parser.h"
 #include "util/test-info.h"
diff --git a/be/src/exec/data-sink.cc b/be/src/exec/data-sink.cc
index 9ce4574..a49df19 100644
--- a/be/src/exec/data-sink.cc
+++ b/be/src/exec/data-sink.cc
@@ -32,6 +32,7 @@
 #include "exec/partitioned-hash-join-builder.h"
 #include "exec/plan-root-sink.h"
 #include "exprs/scalar-expr.h"
+#include "exprs/scalar-expr-evaluator.h"
 #include "gen-cpp/ImpalaInternalService_constants.h"
 #include "gen-cpp/ImpalaInternalService_types.h"
 #include "gutil/strings/substitute.h"
diff --git a/be/src/exec/data-sink.h b/be/src/exec/data-sink.h
index 8736692..9445ff2 100644
--- a/be/src/exec/data-sink.h
+++ b/be/src/exec/data-sink.h
@@ -24,13 +24,13 @@
 
 #include "common/status.h"
 #include "runtime/runtime-state.h"  // for PartitionStatusMap
-#include "runtime/mem-tracker.h"
 #include "gen-cpp/Exprs_types.h"
 
 namespace impala {
 
 class DataSink;
 class MemPool;
+class MemTracker;
 class ObjectPool;
 class RowBatch;
 class RuntimeProfile;
diff --git a/be/src/exec/exec-node.cc b/be/src/exec/exec-node.cc
index 54e3e04..55589e5 100644
--- a/be/src/exec/exec-node.cc
+++ b/be/src/exec/exec-node.cc
@@ -460,7 +460,7 @@ bool ExecNode::CheckLimitAndTruncateRowBatchIfNeededShared(
 bool ExecNode::EvalConjuncts(
     ScalarExprEvaluator* const* evals, int num_conjuncts, TupleRow* row) {
   for (int i = 0; i < num_conjuncts; ++i) {
-    if (!EvalPredicate(evals[i], row)) return false;
+    if (!evals[i]->EvalPredicate(row)) return false;
   }
   return true;
 }
diff --git a/be/src/exec/exec-node.h b/be/src/exec/exec-node.h
index ec74ebc..ec91276 100644
--- a/be/src/exec/exec-node.h
+++ b/be/src/exec/exec-node.h
@@ -22,7 +22,6 @@
 #include <vector>
 
 #include "common/status.h"
-#include "exprs/scalar-expr-evaluator.h"
 #include "gen-cpp/PlanNodes_types.h"
 #include "gutil/threading/thread_collision_warner.h"
 #include "runtime/bufferpool/buffer-pool.h"
@@ -32,6 +31,10 @@
 #include "util/runtime-profile-counters.h"
 #include "util/runtime-profile.h"
 
+namespace llvm {
+class Function;
+}
+
 namespace impala {
 
 class DataSink;
@@ -41,6 +44,7 @@ class ObjectPool;
 class RowBatch;
 class RuntimeState;
 class ScalarExpr;
+class ScalarExprEvaluator;
 class SubplanNode;
 class SubplanPlanNode;
 class TPlan;
@@ -218,9 +222,6 @@ class ExecNode {
   /// Collect all scan node types.
   void CollectScanNodes(std::vector<ExecNode*>* nodes);
 
-  /// Evaluates the predicate in 'eval' over 'row' and returns the result.
-  static bool EvalPredicate(ScalarExprEvaluator* eval, TupleRow* row);
-
   /// Evaluate the conjuncts in 'evaluators' over 'row'.
   /// Returns true if all exprs return true.
   static bool EvalConjuncts(
@@ -515,10 +516,4 @@ class ExecNode {
   /// reservations pool in Close().
   ReservationManager reservation_manager_;
 };
-
-inline bool ExecNode::EvalPredicate(ScalarExprEvaluator* eval, TupleRow* row) {
-  BooleanVal v = eval->GetBooleanVal(row);
-  if (v.is_null || !v.val) return false;
-  return true;
-}
 } // namespace impala
diff --git a/be/src/exec/filter-context.cc b/be/src/exec/filter-context.cc
index 8bc0f3c..32d78cc 100644
--- a/be/src/exec/filter-context.cc
+++ b/be/src/exec/filter-context.cc
@@ -18,6 +18,7 @@
 #include "exec/filter-context.h"
 
 #include "codegen/codegen-anyval.h"
+#include "exprs/scalar-expr-evaluator.h"
 #include "runtime/runtime-filter.inline.h"
 #include "runtime/tuple-row.h"
 #include "util/min-max-filter.h"
diff --git a/be/src/exec/filter-context.h b/be/src/exec/filter-context.h
index e20f7f2..c84a730 100644
--- a/be/src/exec/filter-context.h
+++ b/be/src/exec/filter-context.h
@@ -20,16 +20,21 @@
 #define IMPALA_EXEC_FILTER_CONTEXT_H
 
 #include <boost/unordered_map.hpp>
-#include "exprs/scalar-expr-evaluator.h"
 #include "runtime/runtime-filter.h"
 #include "util/runtime-profile.h"
 
+namespace llvm {
+class Function;
+}
+
 namespace impala {
 
 class BloomFilter;
 class LlvmCodeGen;
 class MinMaxFilter;
+class RuntimeState;
 class ScalarExpr;
+class ScalarExprEvaluator;
 class TupleRow;
 
 /// Container struct for per-filter statistics, with statistics for each granularity of
diff --git a/be/src/exec/grouping-aggregator.cc b/be/src/exec/grouping-aggregator.cc
index 14aa6c1..bed5c94 100644
--- a/be/src/exec/grouping-aggregator.cc
+++ b/be/src/exec/grouping-aggregator.cc
@@ -33,6 +33,7 @@
 #include "runtime/mem-tracker.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
+#include "runtime/string-value.h"
 #include "runtime/tuple-row.h"
 #include "runtime/tuple.h"
 #include "util/runtime-profile-counters.h"
diff --git a/be/src/exec/hash-table-test.cc b/be/src/exec/hash-table-test.cc
index 1a98804..2a0c1ec 100644
--- a/be/src/exec/hash-table-test.cc
+++ b/be/src/exec/hash-table-test.cc
@@ -19,7 +19,6 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <iostream>
 #include <limits>
 #include <vector>
 
diff --git a/be/src/exec/hdfs-avro-scanner-ir.cc b/be/src/exec/hdfs-avro-scanner-ir.cc
index 1d8ec60..5e2599d 100644
--- a/be/src/exec/hdfs-avro-scanner-ir.cc
+++ b/be/src/exec/hdfs-avro-scanner-ir.cc
@@ -309,7 +309,7 @@ bool HdfsAvroScanner::ReadAvroDecimal(int slot_byte_size, uint8_t** data,
         break;
       }
       case 16: {
-        int128_t* decimal = reinterpret_cast<int128_t*>(slot);
+        __int128_t* decimal = reinterpret_cast<__int128_t*>(slot);
         *decimal >>= bytes_to_fill * 8;
         break;
       }
diff --git a/be/src/exec/hdfs-columnar-scanner-ir.cc b/be/src/exec/hdfs-columnar-scanner-ir.cc
index 6b160d9..2c2a6a6 100644
--- a/be/src/exec/hdfs-columnar-scanner-ir.cc
+++ b/be/src/exec/hdfs-columnar-scanner-ir.cc
@@ -16,6 +16,8 @@
 // under the License.
 
 #include "exec/hdfs-columnar-scanner.h"
+#include "runtime/row-batch.h"
+#include "exec/scratch-tuple-batch.h"
 
 namespace impala {
 
diff --git a/be/src/exec/hdfs-columnar-scanner.cc b/be/src/exec/hdfs-columnar-scanner.cc
index 06e9295..b7a33fa 100644
--- a/be/src/exec/hdfs-columnar-scanner.cc
+++ b/be/src/exec/hdfs-columnar-scanner.cc
@@ -20,6 +20,10 @@
 #include <algorithm>
 
 #include "codegen/llvm-codegen.h"
+#include "exec/hdfs-scan-node-base.h"
+#include "exec/scratch-tuple-batch.h"
+#include "runtime/row-batch.h"
+#include "runtime/runtime-state.h"
 
 namespace impala {
 
diff --git a/be/src/exec/hdfs-columnar-scanner.h b/be/src/exec/hdfs-columnar-scanner.h
index 5c627bf..a5c747c 100644
--- a/be/src/exec/hdfs-columnar-scanner.h
+++ b/be/src/exec/hdfs-columnar-scanner.h
@@ -21,14 +21,14 @@
 
 #include <boost/scoped_ptr.hpp>
 
-#include "codegen/impala-ir.h"
-#include "exec/hdfs-scan-node-base.h"
-#include "exec/scratch-tuple-batch.h"
-#include "runtime/row-batch.h"
-#include "runtime/runtime-state.h"
-
 namespace impala {
 
+class HdfsScanNodeBase;
+class HdfsScanPlanNode;
+class RowBatch;
+class RuntimeState;
+struct ScratchTupleBatch;
+
 /// Parent class for scanners that read values into a scratch batch before applying
 /// conjuncts and runtime filters.
 class HdfsColumnarScanner : public HdfsScanner {
diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc
index d1be330..3308818 100644
--- a/be/src/exec/hdfs-orc-scanner.cc
+++ b/be/src/exec/hdfs-orc-scanner.cc
@@ -21,10 +21,12 @@
 
 #include "exec/orc-column-readers.h"
 #include "exec/scanner-context.inline.h"
+#include "exec/scratch-tuple-batch.h"
 #include "exprs/expr.h"
 #include "runtime/collection-value-builder.h"
 #include "runtime/exec-env.h"
 #include "runtime/io/request-context.h"
+#include "runtime/mem-tracker.h"
 #include "runtime/runtime-filter.inline.h"
 #include "runtime/timestamp-value.inline.h"
 #include "runtime/tuple-row.h"
diff --git a/be/src/exec/hdfs-scan-node.cc b/be/src/exec/hdfs-scan-node.cc
index 4e19736..8601599 100644
--- a/be/src/exec/hdfs-scan-node.cc
+++ b/be/src/exec/hdfs-scan-node.cc
@@ -26,6 +26,7 @@
 #include "exec/exec-node-util.h"
 #include "exec/hdfs-scanner.h"
 #include "exec/scanner-context.h"
+#include "exprs/scalar-expr-evaluator.h"
 #include "runtime/blocking-row-batch-queue.h"
 #include "runtime/descriptors.h"
 #include "runtime/fragment-instance-state.h"
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index ee6eb5b..e1e998a 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -24,6 +24,7 @@
 #include "exec/hdfs-scan-node-mt.h"
 #include "exec/read-write-util.h"
 #include "exec/text-converter.inline.h"
+#include "exprs/scalar-expr-evaluator.h"
 #include "runtime/collection-value-builder.h"
 #include "runtime/hdfs-fs-cache.h"
 #include "runtime/runtime-filter.inline.h"
diff --git a/be/src/exec/hdfs-sequence-scanner.cc b/be/src/exec/hdfs-sequence-scanner.cc
index 2cc08a0..9520dcd 100644
--- a/be/src/exec/hdfs-sequence-scanner.cc
+++ b/be/src/exec/hdfs-sequence-scanner.cc
@@ -17,20 +17,46 @@
 
 #include "exec/hdfs-sequence-scanner.h"
 
-#include "codegen/llvm-codegen.h"
+#include <string.h>
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#include "common/compiler-util.h"
+#include "common/logging.h"
+#include "exec/delimited-text-parser.h"
 #include "exec/delimited-text-parser.inline.h"
-#include "exec/hdfs-scan-node.h"
+#include "exec/hdfs-scan-node-base.h"
+#include "exec/read-write-util.h"
+#include "exec/scanner-context.h"
 #include "exec/scanner-context.inline.h"
-#include "exec/text-converter.inline.h"
+#include "exec/text-converter.h"
+#include "gen-cpp/ErrorCodes_types.h"
 #include "runtime/descriptors.h"
+#include "runtime/mem-pool.h"
+#include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
 #include "runtime/tuple.h"
-#include "runtime/tuple-row.h"
 #include "util/codec.h"
+#include "util/error-util.h"
 #include "util/runtime-profile-counters.h"
+#include "util/stopwatch.h"
 
 #include "common/names.h"
 
+namespace impala {
+class LlvmCodeGen;
+class ScalarExpr;
+class TupleRow;
+}
+
+namespace llvm {
+class Function;
+}
+
 using namespace impala;
 
 const char* const HdfsSequenceScanner::SEQFILE_VALUE_CLASS_NAME =
diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc
index acd7f54..e996f0b 100644
--- a/be/src/exec/hdfs-text-scanner.cc
+++ b/be/src/exec/hdfs-text-scanner.cc
@@ -17,26 +17,54 @@
 
 #include "exec/hdfs-text-scanner.h"
 
+#include <string.h>
+#include <algorithm>
+#include <map>
 #include <memory>
+#include <ostream>
+#include <utility>
 
-#include "codegen/llvm-codegen.h"
+#include <boost/algorithm/string/case_conv.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "common/compiler-util.h"
+#include "common/logging.h"
 #include "exec/delimited-text-parser.h"
 #include "exec/delimited-text-parser.inline.h"
 #include "exec/hdfs-plugin-text-scanner.h"
+#include "exec/hdfs-scan-node-base.h"
 #include "exec/hdfs-scan-node.h"
+#include "exec/scanner-context.h"
 #include "exec/scanner-context.inline.h"
 #include "exec/text-converter.h"
 #include "exec/text-converter.inline.h"
+#include "gen-cpp/ErrorCodes_types.h"
+#include "gutil/strings/substitute.h"
+#include "runtime/descriptors.h"
+#include "runtime/io/request-context.h"
+#include "runtime/io/request-ranges.h"
+#include "runtime/mem-pool.h"
+#include "runtime/mem-tracker.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
 #include "runtime/tuple-row.h"
+#include "runtime/tuple.h"
 #include "util/codec.h"
-#include "util/decompress.h"
-#include "util/cpu-info.h"
-#include "util/debug-util.h"
+#include "util/error-util.h"
+#include "util/runtime-profile-counters.h"
+#include "util/stopwatch.h"
 
 #include "common/names.h"
 
+namespace impala {
+class LlvmCodeGen;
+class ScalarExpr;
+}
+
+namespace llvm {
+class Function;
+}
+
 using boost::algorithm::ends_with;
 using boost::algorithm::to_lower;
 using namespace impala;
diff --git a/be/src/exec/join-builder.cc b/be/src/exec/join-builder.cc
index a85a210..504c7db 100644
--- a/be/src/exec/join-builder.cc
+++ b/be/src/exec/join-builder.cc
@@ -17,6 +17,9 @@
 
 #include "exec/join-builder.h"
 
+#include "util/debug-util.h"
+#include "util/runtime-profile-counters.h"
+
 #include "common/names.h"
 
 namespace impala {
diff --git a/be/src/exec/kudu-scan-node.cc b/be/src/exec/kudu-scan-node.cc
index de71ace..ad31a5e 100644
--- a/be/src/exec/kudu-scan-node.cc
+++ b/be/src/exec/kudu-scan-node.cc
@@ -33,6 +33,7 @@
 #include "runtime/scanner-mem-limiter.h"
 #include "runtime/thread-resource-mgr.h"
 #include "runtime/tuple-row.h"
+#include "util/debug-util.h"
 #include "util/runtime-profile-counters.h"
 
 #include "common/names.h"
diff --git a/be/src/exec/kudu-scanner.cc b/be/src/exec/kudu-scanner.cc
index 6c96c21..a2c5b49 100644
--- a/be/src/exec/kudu-scanner.cc
+++ b/be/src/exec/kudu-scanner.cc
@@ -38,6 +38,7 @@
 #include "runtime/tuple-row.h"
 #include "gutil/gscoped_ptr.h"
 #include "gutil/strings/substitute.h"
+#include "util/debug-util.h"
 #include "util/jni-util.h"
 #include "util/min-max-filter.h"
 #include "util/periodic-counter-updater.h"
diff --git a/be/src/exec/kudu-table-sink.cc b/be/src/exec/kudu-table-sink.cc
index 1372476..1b2e80d 100644
--- a/be/src/exec/kudu-table-sink.cc
+++ b/be/src/exec/kudu-table-sink.cc
@@ -28,6 +28,7 @@
 #include "exprs/scalar-expr-evaluator.h"
 #include "gen-cpp/ImpalaInternalService_constants.h"
 #include "gutil/gscoped_ptr.h"
+#include "runtime/descriptors.h"
 #include "runtime/exec-env.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/row-batch.h"
diff --git a/be/src/exec/kudu-table-sink.h b/be/src/exec/kudu-table-sink.h
index 5cde878..99fd7af 100644
--- a/be/src/exec/kudu-table-sink.h
+++ b/be/src/exec/kudu-table-sink.h
@@ -25,11 +25,11 @@
 #include "common/status.h"
 #include "exec/kudu-util.h"
 #include "exec/data-sink.h"
-#include "exprs/scalar-expr.h"
-#include "exprs/scalar-expr-evaluator.h"
 
 namespace impala {
 
+class KuduTableDescriptor;
+
 class KuduTableSinkConfig : public DataSinkConfig {
  public:
   DataSink* CreateSink(const TPlanFragmentCtx& fragment_ctx,
diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 40f4418..872282e 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -20,6 +20,9 @@
 #include <queue>
 
 #include "runtime/collection-value-builder.h"
+#include "runtime/date-value.h"
+#include "runtime/decimal-value.h"
+#include "runtime/string-value.inline.h"
 #include "runtime/timestamp-value.inline.h"
 #include "common/names.h"
 
@@ -248,13 +251,13 @@ Status OrcDecimal16ColumnReader::ReadValue(int row_idx, Tuple* tuple, MemPool* p
   orc::Int128 orc_val = batch_->values.data()[row_idx];
 
   DCHECK_EQ(slot_desc_->type().GetByteSize(), 16);
-  int128_t val = orc_val.getHighBits();
+  __int128_t val = orc_val.getHighBits();
   val <<= 64;
   val |= orc_val.getLowBits();
   // Use memcpy to avoid gcc generating unaligned instructions like movaps
   // for int128_t. They will raise SegmentFault when addresses are not
   // aligned to 16 bytes.
-  memcpy(GetSlot(tuple), &val, sizeof(int128_t));
+  memcpy(GetSlot(tuple), &val, sizeof(__int128_t));
   return Status::OK();
 }
 
diff --git a/be/src/exec/orc-column-readers.h b/be/src/exec/orc-column-readers.h
index bff154c..0e6a0fc 100644
--- a/be/src/exec/orc-column-readers.h
+++ b/be/src/exec/orc-column-readers.h
@@ -22,6 +22,7 @@
 #include <queue>
 
 #include "exec/hdfs-orc-scanner.h"
+#include "exec/scratch-tuple-batch.h"
 
 namespace impala {
 
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index 7b3f4d1..1fdffaf 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -29,6 +29,8 @@
 #include "exec/parquet/parquet-collection-column-reader.h"
 #include "exec/parquet/parquet-column-readers.h"
 #include "exec/scanner-context.inline.h"
+#include "exec/scratch-tuple-batch.h"
+#include "exprs/scalar-expr-evaluator.h"
 #include "rpc/thrift-util.h"
 #include "runtime/collection-value-builder.h"
 #include "runtime/exec-env.h"
@@ -36,6 +38,7 @@
 #include "runtime/io/request-context.h"
 #include "runtime/runtime-filter.inline.h"
 #include "runtime/runtime-state.h"
+#include "runtime/scoped-buffer.h"
 #include "util/dict-encoding.h"
 #include "util/pretty-printer.h"
 #include "util/scope-exit-trigger.h"
@@ -536,7 +539,7 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
     if (stats_read) {
       TupleRow row;
       row.SetTuple(0, min_max_tuple_);
-      if (!ExecNode::EvalPredicate(eval, &row)) {
+      if (!eval->EvalPredicate(&row)) {
         *skip_row_group = true;
         break;
       }
@@ -766,7 +769,7 @@ Status HdfsParquetScanner::EvaluatePageIndex(bool* filter_pages) {
       if (!is_null_page && !value_read) continue;
       TupleRow row;
       row.SetTuple(0, min_max_tuple_);
-      if (is_null_page || !ExecNode::EvalPredicate(eval, &row)) {
+      if (is_null_page || !eval->EvalPredicate(&row)) {
         BaseScalarColumnReader* scalar_reader = scalar_reader_map_[col_idx];
         RETURN_IF_ERROR(page_index_.DeserializeOffsetIndex(col_chunk,
             &scalar_reader->offset_index_));
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.h b/be/src/exec/parquet/hdfs-parquet-scanner.h
index 9ed3cd2..674441c 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.h
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.h
@@ -24,7 +24,6 @@
 #include "exec/parquet/parquet-common.h"
 #include "exec/parquet/parquet-metadata-utils.h"
 #include "exec/parquet/parquet-page-index.h"
-#include "runtime/scoped-buffer.h"
 #include "util/runtime-profile-counters.h"
 
 namespace impala {
diff --git a/be/src/exec/parquet/parquet-column-chunk-reader.cc b/be/src/exec/parquet/parquet-column-chunk-reader.cc
index 573fd04..0dec59f 100644
--- a/be/src/exec/parquet/parquet-column-chunk-reader.cc
+++ b/be/src/exec/parquet/parquet-column-chunk-reader.cc
@@ -21,6 +21,7 @@
 
 #include "runtime/mem-pool.h"
 #include "runtime/runtime-state.h"
+#include "runtime/scoped-buffer.h"
 #include "util/codec.h"
 
 #include "common/names.h"
diff --git a/be/src/exec/parquet/parquet-column-chunk-reader.h b/be/src/exec/parquet/parquet-column-chunk-reader.h
index 3dca706..3eb0693 100644
--- a/be/src/exec/parquet/parquet-column-chunk-reader.h
+++ b/be/src/exec/parquet/parquet-column-chunk-reader.h
@@ -24,8 +24,9 @@
 
 namespace impala {
 
-class MemPool;
 class Codec;
+class MemPool;
+class ScopedBuffer;
 
 /// A class to read data from Parquet pages. It handles the page headers, decompression
 /// and the possible copying of the data buffers.
diff --git a/be/src/exec/parquet/parquet-column-readers.cc b/be/src/exec/parquet/parquet-column-readers.cc
index 0a4a753..512632b 100644
--- a/be/src/exec/parquet/parquet-column-readers.cc
+++ b/be/src/exec/parquet/parquet-column-readers.cc
@@ -24,8 +24,11 @@
 #include "exec/parquet/parquet-bool-decoder.h"
 #include "exec/parquet/parquet-level-decoder.h"
 #include "exec/parquet/parquet-metadata-utils.h"
+#include "exec/scratch-tuple-batch.h"
 #include "parquet-collection-column-reader.h"
 #include "runtime/runtime-state.h"
+#include "runtime/scoped-buffer.h"
+#include "runtime/string-value.inline.h"
 #include "runtime/tuple.h"
 #include "util/debug-util.h"
 #include "util/dict-encoding.h"
diff --git a/be/src/exec/parquet/parquet-common.h b/be/src/exec/parquet/parquet-common.h
index 8859ea8..a295f58 100644
--- a/be/src/exec/parquet/parquet-common.h
+++ b/be/src/exec/parquet/parquet-common.h
@@ -19,6 +19,8 @@
 #ifndef IMPALA_EXEC_PARQUET_COMMON_H
 #define IMPALA_EXEC_PARQUET_COMMON_H
 
+#include <boost/preprocessor/repetition/repeat_from_to.hpp>
+
 #include "common/compiler-util.h"
 #include "gen-cpp/Descriptors_types.h"
 #include "gen-cpp/parquet_types.h"
diff --git a/be/src/exec/parquet/parquet-version-test.cc b/be/src/exec/parquet/parquet-version-test.cc
index 28a2d71..355ed68 100644
--- a/be/src/exec/parquet/parquet-version-test.cc
+++ b/be/src/exec/parquet/parquet-version-test.cc
@@ -18,7 +18,6 @@
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <iostream>
 
 #include "exec/parquet/parquet-metadata-utils.h"
 #include "testutil/gtest-util.h"
diff --git a/be/src/exec/partitioned-hash-join-builder-ir.cc b/be/src/exec/partitioned-hash-join-builder-ir.cc
index 51d2239..e791a36 100644
--- a/be/src/exec/partitioned-hash-join-builder-ir.cc
+++ b/be/src/exec/partitioned-hash-join-builder-ir.cc
@@ -17,16 +17,22 @@
 
 #include "exec/partitioned-hash-join-builder.h"
 
-#include "codegen/impala-ir.h"
+#include "common/compiler-util.h"
+#include "common/logging.h"
+#include "common/status.h"
+#include "exec/filter-context.h"
+#include "exec/hash-table.h"
 #include "exec/hash-table.inline.h"
-#include "runtime/buffered-tuple-stream.inline.h"
-#include "runtime/raw-value.inline.h"
+#include "gen-cpp/Types_types.h"
+#include "runtime/buffered-tuple-stream.h"
 #include "runtime/row-batch.h"
-#include "runtime/runtime-filter.h"
-#include "util/bloom-filter.h"
 
 #include "common/names.h"
 
+namespace impala {
+class TupleRow;
+}
+
 using namespace impala;
 
 inline bool PhjBuilder::AppendRow(
diff --git a/be/src/exec/partitioned-hash-join-builder.cc b/be/src/exec/partitioned-hash-join-builder.cc
index 2c41545..1bee0e9 100644
--- a/be/src/exec/partitioned-hash-join-builder.cc
+++ b/be/src/exec/partitioned-hash-join-builder.cc
@@ -17,6 +17,7 @@
 
 #include "exec/partitioned-hash-join-builder.h"
 
+#include <iomanip>
 #include <numeric>
 
 #include <gutil/strings/substitute.h>
diff --git a/be/src/exec/plan-root-sink.cc b/be/src/exec/plan-root-sink.cc
index 6013842..43568d1 100644
--- a/be/src/exec/plan-root-sink.cc
+++ b/be/src/exec/plan-root-sink.cc
@@ -24,7 +24,9 @@
 #include "runtime/row-batch.h"
 #include "runtime/tuple-row.h"
 #include "service/query-result-set.h"
+#include "util/debug-util.h"
 #include "util/pretty-printer.h"
+#include "util/runtime-profile-counters.h"
 
 #include <memory>
 #include <mutex>
diff --git a/be/src/exec/read-write-util-test.cc b/be/src/exec/read-write-util-test.cc
index a044886..8b68f53 100644
--- a/be/src/exec/read-write-util-test.cc
+++ b/be/src/exec/read-write-util-test.cc
@@ -17,7 +17,6 @@
 
 #include <stdlib.h>
 #include <stdio.h>
-#include <iostream>
 #include <limits.h>
 
 #include "exec/read-write-util.h"
diff --git a/be/src/exec/row-batch-cache.h b/be/src/exec/row-batch-cache.h
index ceb0260..0b7012b 100644
--- a/be/src/exec/row-batch-cache.h
+++ b/be/src/exec/row-batch-cache.h
@@ -23,7 +23,6 @@
 #include <vector>
 
 #include "runtime/row-batch.h"
-#include "util/debug-util.h"
 
 namespace impala {
 
diff --git a/be/src/exec/row-batch-list-test.cc b/be/src/exec/row-batch-list-test.cc
index 22aa97a..05ed808 100644
--- a/be/src/exec/row-batch-list-test.cc
+++ b/be/src/exec/row-batch-list-test.cc
@@ -17,7 +17,6 @@
 
 #include <cstdlib>
 #include <cstdio>
-#include <iostream>
 #include <vector>
 #include <boost/scoped_ptr.hpp>
 
diff --git a/be/src/exec/scan-node.cc b/be/src/exec/scan-node.cc
index 267f624..1615cad 100644
--- a/be/src/exec/scan-node.cc
+++ b/be/src/exec/scan-node.cc
@@ -25,6 +25,7 @@
 #include "exec/kudu-scan-node-mt.h"
 #include "exec/kudu-scan-node.h"
 #include "exprs/scalar-expr.h"
+#include "exprs/scalar-expr-evaluator.h"
 #include "runtime/blocking-row-batch-queue.h"
 #include "runtime/io/disk-io-mgr.h"
 #include "runtime/query-state.h"
@@ -32,6 +33,7 @@
 #include "runtime/runtime-filter.inline.h"
 #include "runtime/runtime-state.h"
 #include "runtime/scanner-mem-limiter.h"
+#include "util/debug-util.h"
 #include "util/disk-info.h"
 #include "util/pretty-printer.h"
 #include "util/runtime-profile-counters.h"
diff --git a/be/src/exec/topn-node.cc b/be/src/exec/topn-node.cc
index 5f7db08..3108040 100644
--- a/be/src/exec/topn-node.cc
+++ b/be/src/exec/topn-node.cc
@@ -22,6 +22,7 @@
 #include "codegen/llvm-codegen.h"
 #include "exec/exec-node-util.h"
 #include "exprs/scalar-expr.h"
+#include "exprs/scalar-expr-evaluator.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
diff --git a/be/src/exec/zigzag-test.cc b/be/src/exec/zigzag-test.cc
index 601d7a7..99e2d62 100644
--- a/be/src/exec/zigzag-test.cc
+++ b/be/src/exec/zigzag-test.cc
@@ -17,7 +17,6 @@
 
 #include <stdlib.h>
 #include <stdio.h>
-#include <iostream>
 #include <limits.h>
 #include "common/status.h"
 #include "exec/read-write-util.h"
diff --git a/be/src/exprs/agg-fn-evaluator.cc b/be/src/exprs/agg-fn-evaluator.cc
index 584abbe..7eafa1a 100644
--- a/be/src/exprs/agg-fn-evaluator.cc
+++ b/be/src/exprs/agg-fn-evaluator.cc
@@ -17,31 +17,32 @@
 
 #include "exprs/agg-fn-evaluator.h"
 
+#include <endian.h>
+#include <string.h>
+#include <cstdint>
 #include <sstream>
+#include <utility>
 
-#include "codegen/llvm-codegen.h"
+#include "common/compiler-util.h"
 #include "common/logging.h"
-#include "exprs/aggregate-functions.h"
+#include "common/object-pool.h"
 #include "exprs/anyval-util.h"
-#include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
-#include "exprs/scalar-fn-call.h"
+#include "exprs/scalar-expr.h"
 #include "gutil/strings/substitute.h"
 #include "runtime/date-value.h"
-#include "runtime/lib-cache.h"
+#include "runtime/descriptors.h"
 #include "runtime/raw-value.h"
-#include "runtime/runtime-state.h"
-#include "runtime/string-value.inline.h"
+#include "runtime/string-value.h"
+#include "runtime/timestamp-value.h"
+#include "runtime/tuple.h"
+#include "runtime/types.h"
 #include "udf/udf-internal.h"
-#include "util/debug-util.h"
-
-#include <thrift/protocol/TDebugProtocol.h>
 
 #include "common/names.h"
 
 using namespace impala;
 using namespace impala_udf;
-using std::move;
 
 // typedef for builtin aggregate functions. Unfortunately, these type defs don't
 // really work since the actual builtin is implemented not in terms of the base
diff --git a/be/src/exprs/agg-fn.h b/be/src/exprs/agg-fn.h
index 9c8dbe1..c97effe 100644
--- a/be/src/exprs/agg-fn.h
+++ b/be/src/exprs/agg-fn.h
@@ -23,6 +23,10 @@
 #include "runtime/descriptors.h"
 #include "udf/udf.h"
 
+namespace llvm {
+class Function;
+}
+
 namespace impala {
 
 using impala_udf::FunctionContext;
diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc
index 699e80e..5c3c877 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -32,6 +32,7 @@
 #include "exprs/hll-bias.h"
 #include "runtime/date-value.h"
 #include "runtime/decimal-value.inline.h"
+#include "runtime/multi-precision.h"
 #include "runtime/runtime-state.h"
 #include "runtime/string-value.inline.h"
 #include "runtime/timestamp-value.h"
@@ -422,20 +423,20 @@ IR_ALWAYS_INLINE void AggregateFunctions::DecimalAvgAddOrRemove(FunctionContext*
     case 4:
       avg->sum_val16 += m * src.val4;
       if (UNLIKELY(decimal_v2 &&
-          abs(avg->sum_val16) > DecimalUtil::MAX_UNSCALED_DECIMAL16)) {
+          abs(avg->sum_val16) > MAX_UNSCALED_DECIMAL16)) {
         ctx->SetError("Avg computation overflowed");
       }
       break;
     case 8:
       avg->sum_val16 += m * src.val8;
       if (UNLIKELY(decimal_v2 &&
-          abs(avg->sum_val16) > DecimalUtil::MAX_UNSCALED_DECIMAL16)) {
+          abs(avg->sum_val16) > MAX_UNSCALED_DECIMAL16)) {
         ctx->SetError("Avg computation overflowed");
       }
       break;
     case 16:
       if (UNLIKELY(decimal_v2 && (avg->sum_val16 >= 0) == (src.val16 >= 0) &&
-          abs(avg->sum_val16) > DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(src.val16))) {
+          abs(avg->sum_val16) > MAX_UNSCALED_DECIMAL16 - abs(src.val16))) {
         // We can't check for overflow after performing the addition like in the other
         // cases because the result may not fit into int128.
         ctx->SetError("Avg computation overflowed");
@@ -463,7 +464,7 @@ void AggregateFunctions::DecimalAvgMerge(FunctionContext* ctx,
   bool decimal_v2 = ctx->impl()->GetConstFnAttr(FunctionContextImpl::DECIMAL_V2);
   bool overflow = decimal_v2 &&
       abs(dst_struct->sum_val16) >
-      DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(src_struct->sum_val16);
+      MAX_UNSCALED_DECIMAL16 - abs(src_struct->sum_val16);
   if (UNLIKELY(overflow)) ctx->SetError("Avg computation overflowed");
   dst_struct->sum_val16 =
       ArithmeticUtil::AsUnsigned<std::plus>(dst_struct->sum_val16, src_struct->sum_val16);
@@ -559,18 +560,18 @@ IR_ALWAYS_INLINE void AggregateFunctions::SumDecimalAddOrSubtract(FunctionContex
   if (precision <= 9) {
     dst->val16 += m * src.val4;
     if (UNLIKELY(decimal_v2 &&
-        abs(dst->val16) > DecimalUtil::MAX_UNSCALED_DECIMAL16)) {
+        abs(dst->val16) > MAX_UNSCALED_DECIMAL16)) {
       ctx->SetError("Sum computation overflowed");
     }
   } else if (precision <= 19) {
     dst->val16 += m * src.val8;
     if (UNLIKELY(decimal_v2 &&
-        abs(dst->val16) > DecimalUtil::MAX_UNSCALED_DECIMAL16)) {
+        abs(dst->val16) > MAX_UNSCALED_DECIMAL16)) {
       ctx->SetError("Sum computation overflowed");
     }
   } else {
     if (UNLIKELY(decimal_v2 && (dst->val16 >= 0) == (src.val16 >= 0) &&
-        abs(dst->val16) > DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(src.val16))) {
+        abs(dst->val16) > MAX_UNSCALED_DECIMAL16 - abs(src.val16))) {
       // We can't check for overflow after performing the addition like in the other
       // cases because the result may not fit into int128.
       ctx->SetError("Sum computation overflowed");
@@ -585,7 +586,7 @@ void AggregateFunctions::SumDecimalMerge(FunctionContext* ctx,
   if (dst->is_null) InitZero<DecimalVal>(ctx, dst);
   bool decimal_v2 = ctx->impl()->GetConstFnAttr(FunctionContextImpl::DECIMAL_V2);
   bool overflow = decimal_v2 &&
-      abs(dst->val16) > DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(src.val16);
+      abs(dst->val16) > MAX_UNSCALED_DECIMAL16 - abs(src.val16);
   if (UNLIKELY(overflow)) ctx->SetError("Sum computation overflowed");
   dst->val16 = ArithmeticUtil::AsUnsigned<std::plus>(dst->val16, src.val16);
 }
diff --git a/be/src/exprs/aggregate-functions-test.cc b/be/src/exprs/aggregate-functions-test.cc
index c0857e0..76ffb67 100644
--- a/be/src/exprs/aggregate-functions-test.cc
+++ b/be/src/exprs/aggregate-functions-test.cc
@@ -23,11 +23,10 @@
 #include <boost/accumulators/statistics/variance.hpp>
 
 #include "exprs/aggregate-functions.h"
-#include "runtime/multi-precision.h"
 #include "testutil/gtest-util.h"
 #include "udf/udf.h"
 #include "udf/uda-test-harness.h"
-#include "util/decimal-util.h"
+#include "util/decimal-constants.h"
 
 #include "common/names.h"
 
@@ -124,7 +123,7 @@ TEST(HistogramTest, TestDecimal) {
   // All input values are x, result should be constant.
   {
     vector<DecimalVal> input;
-    int128_t val = DecimalUtil::MAX_UNSCALED_DECIMAL16;
+    __int128_t val = MAX_UNSCALED_DECIMAL16;
     stringstream ss;
     for (int i = 0; i < INPUT_SIZE; ++i) input.push_back(DecimalVal(val));
     for (int i = 0; i < NUM_BUCKETS; ++i) {
diff --git a/be/src/exprs/anyval-util.cc b/be/src/exprs/anyval-util.cc
index 78d1eb0..8f4f927 100644
--- a/be/src/exprs/anyval-util.cc
+++ b/be/src/exprs/anyval-util.cc
@@ -16,10 +16,7 @@
 // under the License.
 
 #include "exprs/anyval-util.h"
-#include "codegen/llvm-codegen.h"
 
-#include "common/object-pool.h"
-#include "gutil/strings/substitute.h"
 #include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
 
diff --git a/be/src/exprs/anyval-util.h b/be/src/exprs/anyval-util.h
index 92aa9e7..c930012 100644
--- a/be/src/exprs/anyval-util.h
+++ b/be/src/exprs/anyval-util.h
@@ -25,7 +25,6 @@
 #include "runtime/string-value.inline.h"
 #include "runtime/timestamp-value.h"
 #include "udf/udf-internal.h"
-#include "util/decimal-util.h"
 #include "util/hash-util.h"
 
 namespace impala {
diff --git a/be/src/exprs/decimal-functions-ir.cc b/be/src/exprs/decimal-functions-ir.cc
index 015a518..abb2cb2 100644
--- a/be/src/exprs/decimal-functions-ir.cc
+++ b/be/src/exprs/decimal-functions-ir.cc
@@ -19,12 +19,15 @@
 
 #include "codegen/impala-ir.h"
 #include "exprs/anyval-util.h"
+#include "runtime/multi-precision.h"
 
-#include <ctype.h>
-#include <math.h>
+#include <cctype>
+#include <cmath>
 
 #include "common/names.h"
 
+using std::abs;
+
 namespace impala {
 
 IntVal DecimalFunctions::Precision(FunctionContext* ctx, const DecimalVal& val) {
diff --git a/be/src/exprs/decimal-operators-ir.cc b/be/src/exprs/decimal-operators-ir.cc
index 07087a6..b0dbc54 100644
--- a/be/src/exprs/decimal-operators-ir.cc
+++ b/be/src/exprs/decimal-operators-ir.cc
@@ -437,7 +437,7 @@ IR_ALWAYS_INLINE DecimalVal DecimalOperators::RoundDecimalNegativeScale(
       int128_t delta = d * base - (val16.value() % base);
       // Need to check for overflow. This can't happen in the other cases since the
       // FE should have picked a high enough precision.
-      if (DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(delta) < abs(val16.value())) {
+      if (MAX_UNSCALED_DECIMAL16 - abs(delta) < abs(val16.value())) {
         ctx->AddWarning("Expression overflowed, returning NULL");
         return DecimalVal::null();
       }
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 665bf36..097dc8a 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -50,6 +50,7 @@
 #include "runtime/date-value.h"
 #include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
+#include "runtime/multi-precision.h"
 #include "runtime/raw-value.inline.h"
 #include "runtime/runtime-state.h"
 #include "runtime/string-value.h"
@@ -65,6 +66,7 @@
 #include "udf/udf-test-harness.h"
 #include "util/asan.h"
 #include "util/debug-util.h"
+#include "util/decimal-util.h"
 #include "util/metrics.h"
 #include "util/string-parser.h"
 #include "util/string-util.h"
@@ -3034,7 +3036,7 @@ void TestScaleBy() {
           // overflows in all cases.
           EXPECT_TRUE((-scaled_up_dividend) / scale_multiplier ==
               ConvertToInt256(-dividend));
-          int256_t max_divisor = ConvertToInt256(DecimalUtil::MAX_UNSCALED_DECIMAL16);
+          int256_t max_divisor = ConvertToInt256(MAX_UNSCALED_DECIMAL16);
           EXPECT_TRUE(scaled_up_dividend / max_divisor > max_divisor);
           EXPECT_TRUE((-scaled_up_dividend) / max_divisor < -max_divisor);
         } else {
diff --git a/be/src/exprs/expr-value.h b/be/src/exprs/expr-value.h
index c40a38c..549b7d3 100644
--- a/be/src/exprs/expr-value.h
+++ b/be/src/exprs/expr-value.h
@@ -23,7 +23,7 @@
 #include "runtime/decimal-value.h"
 #include "runtime/string-value.inline.h"
 #include "runtime/timestamp-value.h"
-#include "util/decimal-util.h"
+#include "util/decimal-constants.h"
 
 namespace impala {
 
@@ -133,13 +133,13 @@ struct ExprValue {
       case TYPE_DECIMAL:
         switch (type.GetByteSize()) {
           case 4:
-            decimal4_val = -DecimalUtil::MAX_UNSCALED_DECIMAL4;
+            decimal4_val = -MAX_UNSCALED_DECIMAL4;
             return &decimal4_val;
           case 8:
-            decimal8_val = -DecimalUtil::MAX_UNSCALED_DECIMAL8;
+            decimal8_val = -MAX_UNSCALED_DECIMAL8;
             return &decimal8_val;
           case 16:
-            decimal16_val = -DecimalUtil::MAX_UNSCALED_DECIMAL16;
+            decimal16_val = -MAX_UNSCALED_DECIMAL16;
             return &decimal16_val;
         }
       case TYPE_FLOAT:
@@ -182,13 +182,13 @@ struct ExprValue {
       case TYPE_DECIMAL:
         switch (type.GetByteSize()) {
           case 4:
-            decimal4_val = DecimalUtil::MAX_UNSCALED_DECIMAL4;
+            decimal4_val = MAX_UNSCALED_DECIMAL4;
             return &decimal4_val;
           case 8:
-            decimal8_val = DecimalUtil::MAX_UNSCALED_DECIMAL8;
+            decimal8_val = MAX_UNSCALED_DECIMAL8;
             return &decimal8_val;
           case 16:
-            decimal16_val = DecimalUtil::MAX_UNSCALED_DECIMAL16;
+            decimal16_val = MAX_UNSCALED_DECIMAL16;
             return &decimal16_val;
         }
       case TYPE_FLOAT:
diff --git a/be/src/exprs/hive-udf-call.cc b/be/src/exprs/hive-udf-call.cc
index dd7c914..f8a851f 100644
--- a/be/src/exprs/hive-udf-call.cc
+++ b/be/src/exprs/hive-udf-call.cc
@@ -21,7 +21,6 @@
 #include <sstream>
 #include <string>
 
-#include "codegen/llvm-codegen.h"
 #include "exprs/anyval-util.h"
 #include "exprs/scalar-expr-evaluator.h"
 #include "rpc/jni-thrift-util.h"
@@ -40,6 +39,8 @@ const char* EXECUTOR_CLOSE_SIGNATURE = "()V";
 
 namespace impala {
 
+class LlvmCodeGen;
+
 jclass HiveUdfCall::executor_cl_ = NULL;
 jmethodID HiveUdfCall::executor_ctor_id_ = NULL;
 jmethodID HiveUdfCall::executor_evaluate_id_ = NULL;
diff --git a/be/src/exprs/literal.cc b/be/src/exprs/literal.cc
index f1bd986..86d2d82 100644
--- a/be/src/exprs/literal.cc
+++ b/be/src/exprs/literal.cc
@@ -28,6 +28,7 @@
 #include "runtime/decimal-value.inline.h"
 #include "runtime/runtime-state.h"
 #include "runtime/timestamp-parse-util.h"
+#include "util/decimal-util.h"
 
 #include "common/names.h"
 
diff --git a/be/src/exprs/math-functions-ir.cc b/be/src/exprs/math-functions-ir.cc
index 527b51b..49a9f8b 100644
--- a/be/src/exprs/math-functions-ir.cc
+++ b/be/src/exprs/math-functions-ir.cc
@@ -27,6 +27,7 @@
 #include "exprs/math-functions.h"
 #include "util/string-parser.h"
 #include "runtime/decimal-value.inline.h"
+#include "runtime/multi-precision.h"
 #include "runtime/runtime-state.h"
 #include "runtime/string-value.inline.h"
 #include "thirdparty/pcg-cpp-0.98/include/pcg_random.hpp"
@@ -474,7 +475,7 @@ DoubleVal MathFunctions::FmodDouble(FunctionContext* ctx, const DoubleVal& a,
 // dist_from_min * num_buckets
 //
 // For all the above cases we use a bigger integer type provided by the
-// BitUtil::DoubleWidth<> metafunction.
+// DoubleWidth<> metafunction.
 template <class  T1>
 BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx,
     const T1& expr, const T1& min_range,
@@ -508,7 +509,7 @@ BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx,
   if (max_range_val >= 0 && min_range_val < 0) {
     if (static_cast<UnsignedType<ActualType>>(max_range_val) +
         static_cast<UnsignedType<ActualType>>(abs(min_range_val)) >=
-        static_cast<UnsignedType<ActualType>>(BitUtil::Max<ActualType>())) {
+        static_cast<UnsignedType<ActualType>>(ArithmeticUtil::Max<ActualType>())) {
       bigger_type_needed = true;
     }
   }
@@ -517,7 +518,7 @@ BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx,
     DCHECK(lhs > 0 && rhs > 0);
     using ActualType = decltype(lhs);
     return BitUtil::CountLeadingZeros(lhs) + BitUtil::CountLeadingZeros(rhs) <=
-        BitUtil::UnsignedWidth<ActualType>() + 1;
+        ArithmeticUtil::UnsignedWidth<ActualType>() + 1;
   };
 
   // It is likely that this can be evaluated during codegen:
diff --git a/be/src/exprs/scalar-expr-evaluator.h b/be/src/exprs/scalar-expr-evaluator.h
index cee3ad9..069214e 100644
--- a/be/src/exprs/scalar-expr-evaluator.h
+++ b/be/src/exprs/scalar-expr-evaluator.h
@@ -165,6 +165,14 @@ class ScalarExprEvaluator {
   DecimalVal GetDecimalVal(const TupleRow* row);
   DateVal GetDateVal(const TupleRow* row);
 
+  /// Evaluates this expression on 'row' with predicate semantics: returns true only if
+  /// GetBooleanVal() yields a non-NULL value that is true. NULL is treated as false.
+  bool EvalPredicate(TupleRow* row) {
+    BooleanVal v = GetBooleanVal(row);
+    if (v.is_null || !v.val) return false;
+    return true;
+  }
+
   /// Returns an error status if there was any error in evaluating the expression
   /// or its sub-expressions. 'start_idx' and 'end_idx' correspond to the range
   /// within the vector of FunctionContext for the sub-expressions of interest.
diff --git a/be/src/exprs/timestamp-functions-ir.cc b/be/src/exprs/timestamp-functions-ir.cc
index 664609d..5ffab1f 100644
--- a/be/src/exprs/timestamp-functions-ir.cc
+++ b/be/src/exprs/timestamp-functions-ir.cc
@@ -17,15 +17,16 @@
 
 #include "exprs/timestamp-functions.h"
 
+#include <ctime>
+#include <iomanip>
+
 #include <boost/date_time/compiler_config.hpp>
 #include <boost/date_time/posix_time/posix_time_types.hpp>
 #include <boost/date_time/gregorian/gregorian_types.hpp>
-#include <ctime>
 #include <gutil/strings/substitute.h>
 
 #include "exprs/anyval-util.h"
 #include "runtime/datetime-simple-date-format-parser.h"
-#include "runtime/string-value.inline.h"
 #include "runtime/timestamp-value.inline.h"
 #include "runtime/timestamp-value.h"
 #include "udf/udf.h"
diff --git a/be/src/exprs/timezone_db.cc b/be/src/exprs/timezone_db.cc
index ad50e6b..e3acd12 100644
--- a/be/src/exprs/timezone_db.cc
+++ b/be/src/exprs/timezone_db.cc
@@ -17,12 +17,14 @@
 
 #include "exprs/timezone_db.h"
 
-#include <libgen.h>
-
-#include <iostream>
-#include <string>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
 #include <regex>
-#include <boost/algorithm/string.hpp>
+#include <string>
+#include <vector>
+
+#include <boost/algorithm/string/trim.hpp>
 
 #include "common/compiler-util.h"
 #include "common/logging.h"
diff --git a/be/src/exprs/udf-builtins.cc b/be/src/exprs/udf-builtins.cc
index 362f16b..5fea3d5 100644
--- a/be/src/exprs/udf-builtins.cc
+++ b/be/src/exprs/udf-builtins.cc
@@ -20,6 +20,17 @@
 
 #include "exprs/udf-builtins.h"
 
+#include <boost/date_time/date.hpp>
+#include <boost/date_time/gregorian/greg_calendar.hpp>
+#include <boost/date_time/gregorian/greg_date.hpp>
+#include <boost/date_time/gregorian/greg_duration.hpp>
+#include <boost/date_time/gregorian_calendar.hpp>
+#include <boost/date_time/posix_time/posix_time_config.hpp>
+#include <boost/date_time/posix_time/posix_time_duration.hpp>
+#include <boost/date_time/posix_time/ptime.hpp>
+#include <boost/date_time/time.hpp>
+#include <boost/date_time/time_duration.hpp>
+
 #include <gutil/walltime.h>
 
 #include "gen-cpp/Exprs_types.h"
diff --git a/be/src/rpc/thrift-util-test.cc b/be/src/rpc/thrift-util-test.cc
index c9980ed..a4dd78b 100644
--- a/be/src/rpc/thrift-util-test.cc
+++ b/be/src/rpc/thrift-util-test.cc
@@ -15,9 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <stdlib.h>
 #include <stdio.h>
-#include <iostream>
 
 #include "rpc/thrift-util.h"
 #include "testutil/gtest-util.h"
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index eda4af9..b6550e2 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -57,7 +57,6 @@ add_library(Runtime
   lib-cache.cc
   mem-tracker.cc
   mem-pool.cc
-  multi-precision.cc
   query-exec-mgr.cc
   query-state.cc
   test-env.cc
diff --git a/be/src/runtime/bufferpool/buffer-allocator.h b/be/src/runtime/bufferpool/buffer-allocator.h
index 244a039..6bab7ac 100644
--- a/be/src/runtime/bufferpool/buffer-allocator.h
+++ b/be/src/runtime/bufferpool/buffer-allocator.h
@@ -20,9 +20,11 @@
 
 #include <boost/scoped_ptr.hpp>
 
+#include "common/atomic.h"
 #include "runtime/bufferpool/buffer-pool-internal.h"
 #include "runtime/bufferpool/free-list.h"
 #include "util/aligned-new.h"
+#include "util/spinlock.h"
 
 namespace impala {
 
diff --git a/be/src/runtime/bufferpool/buffer-pool-internal.h b/be/src/runtime/bufferpool/buffer-pool-internal.h
index 2b2d31c..8224179 100644
--- a/be/src/runtime/bufferpool/buffer-pool-internal.h
+++ b/be/src/runtime/bufferpool/buffer-pool-internal.h
@@ -82,6 +82,8 @@
 #include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/bufferpool/reservation-tracker.h"
 #include "util/condition-variable.h"
+#include "util/internal-queue.h"
+#include "util/spinlock.h"
 
 // Ensure that DCheckConsistency() function calls get removed in release builds.
 #ifndef NDEBUG
@@ -92,11 +94,16 @@
 
 namespace impala {
 
+class TmpFileGroup;
+class TmpWriteHandle;
+
 /// The internal representation of a page, which can be pinned or unpinned. See the
 /// class comment for explanation of the different page states.
 struct BufferPool::Page : public InternalList<Page>::Node {
-  Page(Client* client, int64_t len)
-    : client(client), len(len), pin_count(0), pin_in_flight(false) {}
+  // Define constructor and destructor out-of-line to avoid include of TmpWriteHandle
+  // body in header.
+  Page(Client* client, int64_t len);
+  ~Page();
 
   std::string DebugString();
 
@@ -119,7 +126,7 @@ struct BufferPool::Page : public InternalList<Page>::Node {
   bool pin_in_flight;
 
   /// Non-null if there is a write in flight, the page is clean, or the page is evicted.
-  std::unique_ptr<TmpFileMgr::WriteHandle> write_handle;
+  std::unique_ptr<TmpWriteHandle> write_handle;
 
   /// Condition variable signalled when a write for this page completes. Protected by
   /// client->lock_.
@@ -192,7 +199,7 @@ class BufferPool::PageList {
 /// The internal state for the client.
 class BufferPool::Client {
  public:
-  Client(BufferPool* pool, TmpFileMgr::FileGroup* file_group, const string& name,
+  Client(BufferPool* pool, TmpFileGroup* file_group, const string& name,
       ReservationTracker* parent_reservation, MemTracker* mem_tracker,
       MemLimit mem_limit_mode, int64_t reservation_limit, RuntimeProfile* profile);
 
@@ -342,7 +349,7 @@ class BufferPool::Client {
 
   /// The file group that should be used for allocating scratch space. If NULL, spilling
   /// is disabled.
-  TmpFileMgr::FileGroup* const file_group_;
+  TmpFileGroup* const file_group_;
 
   /// A name identifying the client.
   const std::string name_;
diff --git a/be/src/runtime/bufferpool/buffer-pool-test.cc b/be/src/runtime/bufferpool/buffer-pool-test.cc
index b6d6f91..15b3b71 100644
--- a/be/src/runtime/bufferpool/buffer-pool-test.cc
+++ b/be/src/runtime/bufferpool/buffer-pool-test.cc
@@ -33,8 +33,10 @@
 #include "runtime/bufferpool/buffer-pool-internal.h"
 #include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/bufferpool/reservation-tracker.h"
+#include "runtime/mem-tracker.h"
 #include "runtime/query-state.h"
 #include "runtime/test-env.h"
+#include "runtime/tmp-file-mgr.h"
 #include "service/fe-support.h"
 #include "testutil/cpu-util.h"
 #include "testutil/death-test-util.h"
@@ -65,7 +67,6 @@ namespace impala {
 
 using BufferHandle = BufferPool::BufferHandle;
 using ClientHandle = BufferPool::ClientHandle;
-using FileGroup = TmpFileMgr::FileGroup;
 using PageHandle = BufferPool::PageHandle;
 
 class BufferPoolTest : public ::testing::Test {
@@ -84,7 +85,7 @@ class BufferPoolTest : public ::testing::Test {
       ReservationTracker* tracker = entry.second;
       tracker->Close();
     }
-    for (TmpFileMgr::FileGroup* file_group : file_groups_) {
+    for (TmpFileGroup* file_group : file_groups_) {
       file_group->Close();
     }
     global_reservations_.Close();
@@ -109,7 +110,7 @@ class BufferPoolTest : public ::testing::Test {
       int64_t initial_query_reservation, int64_t query_reservation_limit, mt19937* rng);
 
   /// Create and destroy a page multiple times.
-  void CreatePageLoop(BufferPool* pool, TmpFileMgr::FileGroup* file_group,
+  void CreatePageLoop(BufferPool* pool, TmpFileGroup* file_group,
       ReservationTracker* parent_tracker, int num_ops);
 
  protected:
@@ -149,9 +150,9 @@ class BufferPoolTest : public ::testing::Test {
   }
 
   /// Create a new file group with the default configs.
-  TmpFileMgr::FileGroup* NewFileGroup() {
-    TmpFileMgr::FileGroup* file_group =
-        obj_pool_.Add(new TmpFileMgr::FileGroup(test_env_->tmp_file_mgr(),
+  TmpFileGroup* NewFileGroup() {
+    TmpFileGroup* file_group =
+        obj_pool_.Add(new TmpFileGroup(test_env_->tmp_file_mgr(),
             test_env_->exec_env()->disk_io_mgr(), NewProfile(), TUniqueId()));
     file_groups_.push_back(file_group);
     return file_group;
@@ -380,7 +381,7 @@ class BufferPoolTest : public ::testing::Test {
   void TestRandomInternalSingle(int64_t buffer_len, bool multiple_pins);
   void TestRandomInternalMulti(int num_threads, int64_t buffer_len, bool multiple_pins);
   static const int SINGLE_THREADED_TID = -1;
-  void TestRandomInternalImpl(BufferPool* pool, FileGroup* file_group,
+  void TestRandomInternalImpl(BufferPool* pool, TmpFileGroup* file_group,
       MemTracker* parent_mem_tracker, mt19937* rng, int tid, bool multiple_pins);
 
   ObjectPool obj_pool_;
@@ -392,7 +393,7 @@ class BufferPoolTest : public ::testing::Test {
   mt19937 rng_;
 
   /// The file groups created - closed at end of each test.
-  vector<TmpFileMgr::FileGroup*> file_groups_;
+  vector<TmpFileGroup*> file_groups_;
 
   /// Paths of temporary directories created during tests - deleted at end of test.
   vector<string> created_tmp_dirs_;
@@ -1067,7 +1068,7 @@ TEST_F(BufferPoolTest, ConcurrentPageCreation) {
 
   BufferPool pool(test_env_->metrics(), TEST_BUFFER_LEN, total_mem, total_mem);
   // Share a file group between the threads.
-  TmpFileMgr::FileGroup* file_group = NewFileGroup();
+  TmpFileGroup* file_group = NewFileGroup();
 
   // Launch threads, each with a different set of query IDs.
   thread_group workers;
@@ -1087,7 +1088,7 @@ TEST_F(BufferPoolTest, ConcurrentPageCreation) {
   global_reservations_.Close();
 }
 
-void BufferPoolTest::CreatePageLoop(BufferPool* pool, TmpFileMgr::FileGroup* file_group,
+void BufferPoolTest::CreatePageLoop(BufferPool* pool, TmpFileGroup* file_group,
     ReservationTracker* parent_tracker, int num_ops) {
   BufferPool::ClientHandle client;
   ASSERT_OK(pool->RegisterClient("test client", file_group, parent_tracker, NULL,
@@ -1119,7 +1120,7 @@ TEST_F(BufferPoolTest, SpillingDisabledDcheck) {
   ASSERT_OK(pool.Pin(&client, &handle));
   // It's ok to Unpin() if the pin count remains positive.
   pool.Unpin(&client, &handle);
-  // We didn't pass in a FileGroup, so spilling is disabled and we can't bring the
+  // We didn't pass in a TmpFileGroup, so spilling is disabled and we can't bring the
   // pin count to 0.
   IMPALA_ASSERT_DEBUG_DEATH(pool.Unpin(&client, &handle), "");
 
@@ -1643,7 +1644,7 @@ TEST_F(BufferPoolTest, WriteErrorBlacklist) {
   const int64_t MEM_PER_QUERY = PAGES_PER_QUERY * TEST_BUFFER_LEN;
   BufferPool pool(test_env_->metrics(), TEST_BUFFER_LEN, TOTAL_MEM, TOTAL_MEM);
   global_reservations_.InitRootTracker(NewProfile(), TOTAL_MEM);
-  vector<FileGroup*> file_groups;
+  vector<TmpFileGroup*> file_groups;
   vector<ClientHandle> clients(TOTAL_QUERIES);
   for (int i = 0; i < INITIAL_QUERIES; ++i) {
     file_groups.push_back(NewFileGroup());
@@ -1964,7 +1965,7 @@ void BufferPoolTest::TestRandomInternalMulti(
   BufferPool pool(&tmp_metrics, min_buffer_len, TOTAL_MEM, TOTAL_MEM);
   global_reservations_.InitRootTracker(NewProfile(), TOTAL_MEM);
   MemTracker global_tracker(TOTAL_MEM);
-  FileGroup* shared_file_group = NewFileGroup();
+  TmpFileGroup* shared_file_group = NewFileGroup();
   thread_group workers;
   vector<mt19937> rngs = RandTestUtil::CreateThreadLocalRngs(num_threads, &rng_);
   for (int i = 0; i < num_threads; ++i) {
@@ -1994,7 +1995,7 @@ void BufferPoolTest::TestRandomInternalMulti(
 /// 'multiple_pins' is true, pages can be pinned multiple times (useful to test this
 /// functionality). Otherwise they are only pinned once (useful to test the case when
 /// memory is more committed).
-void BufferPoolTest::TestRandomInternalImpl(BufferPool* pool, FileGroup* file_group,
+void BufferPoolTest::TestRandomInternalImpl(BufferPool* pool, TmpFileGroup* file_group,
     MemTracker* parent_mem_tracker, mt19937* rng, int tid, bool multiple_pins) {
   // Encrypting and decrypting is expensive - reduce iterations when encryption is on.
   int num_iterations = FLAGS_disk_spill_encryption ? 5000 : 50000;
diff --git a/be/src/runtime/bufferpool/buffer-pool.cc b/be/src/runtime/bufferpool/buffer-pool.cc
index c56b6cd..e568224 100644
--- a/be/src/runtime/bufferpool/buffer-pool.cc
+++ b/be/src/runtime/bufferpool/buffer-pool.cc
@@ -24,6 +24,7 @@
 #include "common/names.h"
 #include "gutil/strings/substitute.h"
 #include "runtime/bufferpool/buffer-allocator.h"
+#include "runtime/tmp-file-mgr.h"
 #include "util/bit-util.h"
 #include "util/cpu-info.h"
 #include "util/debug-util.h"
@@ -117,7 +118,7 @@ BufferPool::BufferPool(MetricGroup* metrics, int64_t min_buffer_len,
 
 BufferPool::~BufferPool() {}
 
-Status BufferPool::RegisterClient(const string& name, TmpFileMgr::FileGroup* file_group,
+Status BufferPool::RegisterClient(const string& name, TmpFileGroup* file_group,
     ReservationTracker* parent_reservation, MemTracker* mem_tracker,
     int64_t reservation_limit, RuntimeProfile* profile, ClientHandle* client,
     MemLimit mem_limit_mode) {
@@ -405,7 +406,7 @@ void BufferPool::SubReservation::Close() {
   tracker_.reset();
 }
 
-BufferPool::Client::Client(BufferPool* pool, TmpFileMgr::FileGroup* file_group,
+BufferPool::Client::Client(BufferPool* pool, TmpFileGroup* file_group,
     const string& name, ReservationTracker* parent_reservation, MemTracker* mem_tracker,
     MemLimit mem_limit_mode, int64_t reservation_limit, RuntimeProfile* profile)
   : pool_(pool),
@@ -829,6 +830,11 @@ string BufferPool::PageHandle::DebugString() const {
   }
 }
 
+BufferPool::Page::Page(Client* client, int64_t len)
+    : client(client), len(len), pin_count(0), pin_in_flight(false) {}
+
+BufferPool::Page::~Page() {}  // Out-of-line: the header forward-declares TmpWriteHandle.
+
 string BufferPool::Page::DebugString() {
   return Substitute("<BufferPool::Page> $0 len: $1 pin_count: $2 buf: $3", this, len,
       pin_count, buffer.DebugString());
diff --git a/be/src/runtime/bufferpool/buffer-pool.h b/be/src/runtime/bufferpool/buffer-pool.h
index ffa97a9..a35d8f3 100644
--- a/be/src/runtime/bufferpool/buffer-pool.h
+++ b/be/src/runtime/bufferpool/buffer-pool.h
@@ -20,20 +20,15 @@
 
 #include <stdint.h>
 #include <string>
-#include <vector>
 #include <boost/scoped_ptr.hpp>
 
-#include "common/atomic.h"
 #include "common/compiler-util.h"
 #include "common/object-pool.h"
 #include "common/status.h"
 #include "gutil/macros.h"
 #include "runtime/mem-tracker-types.h"
-#include "runtime/tmp-file-mgr.h"
 #include "util/aligned-new.h"
-#include "util/internal-queue.h"
 #include "util/mem-range.h"
-#include "util/spinlock.h"
 
 namespace impala {
 
@@ -41,6 +36,7 @@ class MetricGroup;
 class ReservationTracker;
 class RuntimeProfile;
 class SystemAllocator;
+class TmpFileGroup;
 
 /// A buffer pool that manages memory buffers for all queries in an Impala daemon.
 /// The buffer pool enforces buffer reservations, limits, and implements policies
@@ -177,7 +173,7 @@ class BufferPool : public CacheLineAligned {
   /// 'reservation_limit' and associated with MemTracker 'mem_tracker'. The initial
   /// reservation is 0 bytes. 'mem_limit_mode' determines whether reservation
   /// increases are checked against the soft or hard limit of 'mem_tracker'.
-  Status RegisterClient(const std::string& name, TmpFileMgr::FileGroup* file_group,
+  Status RegisterClient(const std::string& name, TmpFileGroup* file_group,
       ReservationTracker* parent_reservation, MemTracker* mem_tracker,
       int64_t reservation_limit, RuntimeProfile* profile, ClientHandle* client,
       MemLimit mem_limit_mode = MemLimit::SOFT) WARN_UNUSED_RESULT;
diff --git a/be/src/runtime/client-cache.h b/be/src/runtime/client-cache.h
index 2c6a089..59a2217 100644
--- a/be/src/runtime/client-cache.h
+++ b/be/src/runtime/client-cache.h
@@ -17,27 +17,38 @@
 
 #pragma once
 
+#include <cstdint>
 #include <list>
+#include <map>
+#include <memory>
 #include <mutex>
+#include <ostream>
 #include <string>
-#include <vector>
+#include <typeinfo>
+
 #include <boost/bind.hpp>
+#include <boost/function.hpp>
 #include <boost/unordered_map.hpp>
-#include <gutil/strings/substitute.h>
+#include <thrift/Thrift.h>
+#include <thrift/transport/TTransportException.h>
 
-#include "catalog/catalog-service-client-wrapper.h"
+#include "common/logging.h"
+#include "common/status.h"
+#include "gen-cpp/ErrorCodes_types.h"
+#include "gen-cpp/Types_types.h"
+#include "gutil/strings/substitute.h"
 #include "rpc/thrift-client.h"
 #include "rpc/thrift-util.h"
-#include "runtime/client-cache-types.h"
-#include "util/debug-util.h"
+#include "util/container-util.h"
 #include "util/metrics-fwd.h"
 #include "util/network-util.h"
 #include "util/time.h"
 
-#include "common/status.h"
-
 namespace impala {
 
+class MetricGroup;
+template <class T> class ClientCache;
+
 /// Opaque pointer type which allows users of ClientCache to refer to particular client
 /// instances without requiring that we parameterise ClientCacheHelper by type.
 typedef void* ClientKey;
diff --git a/be/src/runtime/coordinator-backend-state.cc b/be/src/runtime/coordinator-backend-state.cc
index 5d2223d..6be9c36 100644
--- a/be/src/runtime/coordinator-backend-state.cc
+++ b/be/src/runtime/coordinator-backend-state.cc
@@ -37,6 +37,7 @@
 #include "runtime/exec-env.h"
 #include "runtime/fragment-instance-state.h"
 #include "runtime/krpc-data-stream-sender.h"
+#include "runtime/mem-tracker.h"
 #include "service/control-service.h"
 #include "service/data-stream-service.h"
 #include "util/counting-barrier.h"
diff --git a/be/src/runtime/datetime-iso-sql-format-tokenizer.cc b/be/src/runtime/datetime-iso-sql-format-tokenizer.cc
index b0f4912..a7eaef3 100644
--- a/be/src/runtime/datetime-iso-sql-format-tokenizer.cc
+++ b/be/src/runtime/datetime-iso-sql-format-tokenizer.cc
@@ -17,6 +17,16 @@
 
 #include "runtime/datetime-iso-sql-format-tokenizer.h"
 
+#include <strings.h>
+#include <algorithm>
+#include <cstring>
+#include <utility>
+#include <vector>
+
+#include <boost/algorithm/string/case_conv.hpp>
+
+#include "common/logging.h"
+
 namespace impala {
 
 namespace datetime_parse_util {
diff --git a/be/src/runtime/datetime-parser-common.cc b/be/src/runtime/datetime-parser-common.cc
index 1cf1474..4186fe7 100644
--- a/be/src/runtime/datetime-parser-common.cc
+++ b/be/src/runtime/datetime-parser-common.cc
@@ -21,6 +21,7 @@
 #include <boost/algorithm/string/trim.hpp>
 #include <boost/date_time/gregorian/gregorian.hpp>
 
+#include "exprs/timestamp-functions.h"
 #include "gutil/strings/ascii_ctype.h"
 #include "runtime/datetime-iso-sql-format-tokenizer.h"
 #include "runtime/string-value.h"
diff --git a/be/src/runtime/datetime-parser-common.h b/be/src/runtime/datetime-parser-common.h
index 9cbc58f..ddac490 100644
--- a/be/src/runtime/datetime-parser-common.h
+++ b/be/src/runtime/datetime-parser-common.h
@@ -29,6 +29,8 @@
 
 namespace impala {
 
+class TimestampValue;
+
 using impala_udf::FunctionContext;
 using impala_udf::StringVal;
 
diff --git a/be/src/runtime/decimal-test.cc b/be/src/runtime/decimal-test.cc
index 4d4bad3..9a2f7a8 100644
--- a/be/src/runtime/decimal-test.cc
+++ b/be/src/runtime/decimal-test.cc
@@ -26,6 +26,7 @@
 #include "runtime/raw-value.h"
 #include "runtime/types.h"
 #include "testutil/gtest-util.h"
+#include "util/decimal-util.h"
 #include "util/string-parser.h"
 
 #include "common/names.h"
@@ -437,7 +438,7 @@ TEST(DecimalTest, StringToDecimalLarge) {
   VerifyParse("01000000000000000000", 18, 0,
       Decimal8Value(0), StringParser::PARSE_OVERFLOW);
 
-  int128_t result = DecimalUtil::MAX_UNSCALED_DECIMAL16;
+  int128_t result = MAX_UNSCALED_DECIMAL16;
   VerifyParse("99999999999999999999999999999999999999",
       38, 0, Decimal16Value(result), StringParser::PARSE_SUCCESS);
   VerifyParse("99999999999999999999999999999999999999e1",
@@ -524,7 +525,7 @@ TEST(DecimalTest, Overflow) {
   bool overflow = false;
 
   Decimal16Value result;
-  Decimal16Value d_max(DecimalUtil::MAX_UNSCALED_DECIMAL16);
+  Decimal16Value d_max(MAX_UNSCALED_DECIMAL16);
   Decimal16Value two(2);
   Decimal16Value one(1);
   Decimal16Value zero(0);
@@ -655,16 +656,16 @@ TEST(DecimalTest, Overflow) {
   EXPECT_TRUE(overflow);
 
   // Add 37 9's (with scale 0)
-  Decimal16Value d3(DecimalUtil::MAX_UNSCALED_DECIMAL16 / 10);
+  Decimal16Value d3(MAX_UNSCALED_DECIMAL16 / 10);
   overflow = false;
   result = d3.Add<int128_t>(0, zero, 1, 38, 1, false, &overflow);
   EXPECT_FALSE(overflow);
-  EXPECT_EQ(result.value(), DecimalUtil::MAX_UNSCALED_DECIMAL16 - 9);
+  EXPECT_EQ(result.value(), MAX_UNSCALED_DECIMAL16 - 9);
 
   overflow = false;
   result = d3.Add<int128_t>(0, one, 1, 38, 1, false, &overflow);
   EXPECT_FALSE(overflow);
-  EXPECT_EQ(result.value(), DecimalUtil::MAX_UNSCALED_DECIMAL16 - 8);
+  EXPECT_EQ(result.value(), MAX_UNSCALED_DECIMAL16 - 8);
 
   // Mod
   overflow = false;
@@ -678,7 +679,7 @@ TEST(DecimalTest, Overflow) {
   result = d3.Mod<int128_t>(0, two, 0, 38, 0, false, &is_nan, &overflow);
   EXPECT_FALSE(overflow);
   EXPECT_FALSE(is_nan);
-  EXPECT_EQ(result.value(), DecimalUtil::MAX_UNSCALED_DECIMAL16 % 2);
+  EXPECT_EQ(result.value(), MAX_UNSCALED_DECIMAL16 % 2);
 
   result = d3.Mod<int128_t>(0, zero, 1, 38, 1, false, &is_nan, &overflow);
   EXPECT_TRUE(is_nan);
diff --git a/be/src/runtime/decimal-value.h b/be/src/runtime/decimal-value.h
index ea2d126..761d474 100644
--- a/be/src/runtime/decimal-value.h
+++ b/be/src/runtime/decimal-value.h
@@ -21,9 +21,9 @@
 
 #include <ostream>
 
+#include "common/logging.h"
 #include "gen-cpp/Data_types.h"
 #include "gen-cpp/data_stream_service.pb.h"
-#include "runtime/multi-precision.h"
 #include "runtime/types.h"
 
 #ifndef __has_builtin
@@ -219,7 +219,7 @@ typedef DecimalValue<int32_t> Decimal4Value;
 typedef DecimalValue<int64_t> Decimal8Value;
 /// TODO: should we support Decimal12Value? We pad it to 16 bytes in the tuple
 /// anyway.
-typedef DecimalValue<int128_t> Decimal16Value;
+typedef DecimalValue<__int128_t> Decimal16Value;
 
 inline std::ostream& operator<<(std::ostream& os, const Decimal4Value& d) {
   return os << d.value();
diff --git a/be/src/runtime/decimal-value.inline.h b/be/src/runtime/decimal-value.inline.h
index ea1bee2..5ece6a7 100644
--- a/be/src/runtime/decimal-value.inline.h
+++ b/be/src/runtime/decimal-value.inline.h
@@ -25,8 +25,10 @@
 #include <limits>
 
 #include "common/logging.h"
+#include "runtime/multi-precision.h"
 #include "util/arithmetic-util.h"
 #include "util/bit-util.h"
+#include "util/decimal-constants.h"
 #include "util/decimal-util.h"
 #include "util/hash-util.h"
 
@@ -115,7 +117,7 @@ inline typename RESULT_T::underlying_type_t DecimalValue<T>::ToInt(int scale,
     if (abs(remainder) >= divisor >> 1) {
       // Round away from zero.
       // Bias at zero must be corrected by sign of dividend.
-      result += BitUtil::Sign(v);
+      result += Sign(v);
     }
   }
   *overflow |=
@@ -255,14 +257,14 @@ inline int128_t AddLarge(int128_t x, int x_scale, int128_t y, int y_scale,
   // it is not necessary, because doing that is equivalent to doing nothing.
   DCHECK(right <= DecimalUtil::GetScaleMultiplier<int128_t>(result_scale));
 
-  *overflow |= x_left > DecimalUtil::MAX_UNSCALED_DECIMAL16 - y_left - carry_to_left;
+  *overflow |= x_left > MAX_UNSCALED_DECIMAL16 - y_left - carry_to_left;
   left = ArithmeticUtil::AsUnsigned<std::plus>(
       ArithmeticUtil::AsUnsigned<std::plus>(x_left, y_left),
       static_cast<int128_t>(carry_to_left));
 
   int128_t mult = DecimalUtil::GetScaleMultiplier<int128_t>(result_scale);
   if (UNLIKELY(!*overflow &&
-      left > (DecimalUtil::MAX_UNSCALED_DECIMAL16 - right) / mult)) {
+      left > (MAX_UNSCALED_DECIMAL16 - right) / mult)) {
     *overflow = true;
   }
   return ArithmeticUtil::AsUnsigned<std::plus>(
@@ -287,8 +289,8 @@ inline int128_t SubtractLarge(int128_t x, int x_scale, int128_t y, int y_scale,
   right = x_right + y_right;
   // Overflow is not possible because one number is positive and the other one is
   // negative.
-  DCHECK(abs(left) <= DecimalUtil::MAX_UNSCALED_DECIMAL16);
-  DCHECK(abs(right) <= DecimalUtil::MAX_UNSCALED_DECIMAL16);
+  DCHECK(abs(left) <= MAX_UNSCALED_DECIMAL16);
+  DCHECK(abs(right) <= MAX_UNSCALED_DECIMAL16);
   // If the whole and fractional parts have different signs, then we need to make the
   // fractional part have the same sign as the whole part. If either left or right is
   // zero, then nothing needs to be done.
@@ -316,7 +318,7 @@ inline int128_t SubtractLarge(int128_t x, int x_scale, int128_t y, int y_scale,
   DCHECK(abs(right) <= DecimalUtil::GetScaleMultiplier<int128_t>(result_scale));
 
   int128_t mult = DecimalUtil::GetScaleMultiplier<int128_t>(result_scale);
-  if (UNLIKELY(abs(left) > (DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(right)) / mult)) {
+  if (UNLIKELY(abs(left) > (MAX_UNSCALED_DECIMAL16 - abs(right)) / mult)) {
     *overflow = true;
   }
   return DecimalUtil::SafeMultiply(left, mult, *overflow) + right;
@@ -358,7 +360,7 @@ inline DecimalValue<RESULT_T> DecimalValue<T>::Add(int this_scale,
     RESULT_T x = 0;
     RESULT_T y = 0;
     AdjustToSameScale(*this, this_scale, other, other_scale, result_precision, &x, &y);
-    DCHECK(abs(x) <= DecimalUtil::MAX_UNSCALED_DECIMAL16 - abs(y));
+    DCHECK(abs(x) <= MAX_UNSCALED_DECIMAL16 - abs(y));
     x += y;
     if (result_scale_decrease > 0) {
       // After first adjusting x and y to the same scale and adding them together, we now
@@ -414,7 +416,7 @@ DecimalValue<RESULT_T> DecimalValue<T>::Multiply(int this_scale,
     // converting to 256 bits is necessary, when it's not actually the case.
     needs_int256 = total_leading_zeros <= 128;
     if (UNLIKELY(needs_int256 && delta_scale == 0)) {
-      if (LIKELY(abs(x) > DecimalUtil::MAX_UNSCALED_DECIMAL16 / abs(y))) {
+      if (LIKELY(abs(x) > MAX_UNSCALED_DECIMAL16 / abs(y))) {
         // If the intermediate value does not fit into 128 bits, we indicate overflow
         // because the final value would also not fit into 128 bits since delta_scale is
         // zero.
@@ -433,13 +435,13 @@ DecimalValue<RESULT_T> DecimalValue<T>::Multiply(int this_scale,
       intermediate_result = DecimalUtil::ScaleDownAndRound<int256_t>(
           intermediate_result, delta_scale, round);
       result = ConvertToInt128(
-          intermediate_result, DecimalUtil::MAX_UNSCALED_DECIMAL16, overflow);
+          intermediate_result, MAX_UNSCALED_DECIMAL16, overflow);
     }
   } else {
     if (delta_scale == 0) {
       result = DecimalUtil::SafeMultiply(x, y, false);
       if (UNLIKELY(result_precision == ColumnType::MAX_PRECISION &&
-          abs(result) > DecimalUtil::MAX_UNSCALED_DECIMAL16)) {
+          abs(result) > MAX_UNSCALED_DECIMAL16)) {
         // An overflow is possible here, if, for example, x = (2^64 - 1) and
         // y = (2^63 - 1).
         *overflow = true;
@@ -466,7 +468,7 @@ DecimalValue<RESULT_T> DecimalValue<T>::Multiply(int this_scale,
       result = 0;
     }
   }
-  DCHECK(*overflow || abs(result) <= DecimalUtil::MAX_UNSCALED_DECIMAL16);
+  DCHECK(*overflow || abs(result) <= MAX_UNSCALED_DECIMAL16);
   return DecimalValue<RESULT_T>(result);
 }
 
@@ -497,7 +499,7 @@ inline DecimalValue<RESULT_T> DecimalValue<T>::Divide(int this_scale,
     *overflow |= ovf;
     int128_t y_sp = other.value();
     int256_t y = ConvertToInt256(y_sp);
-    int128_t r = ConvertToInt128(x / y, DecimalUtil::MAX_UNSCALED_DECIMAL16, overflow);
+    int128_t r = ConvertToInt128(x / y, MAX_UNSCALED_DECIMAL16, overflow);
     if (round) {
       int256_t remainder = x % y;
       // The following is fraught with apparent difficulty, as there is only 1 bit
@@ -509,12 +511,12 @@ inline DecimalValue<RESULT_T> DecimalValue<T>::Divide(int this_scale,
       // This will need to be fixed if we optimize to get back a 128-bit signed value.
       if (abs(2 * remainder) >= abs(y)) {
         // Bias at zero must be corrected by sign of divisor and dividend.
-        r += (BitUtil::Sign(x_sp) ^ BitUtil::Sign(y_sp)) + 1;
+        r += (Sign(x_sp) ^ Sign(y_sp)) + 1;
       }
     }
     // Check overflow again after rounding since +/-1 could cause decimal overflow
     if (result_precision == ColumnType::MAX_PRECISION) {
-      *overflow |= abs(r) > DecimalUtil::MAX_UNSCALED_DECIMAL16;
+      *overflow |= abs(r) > MAX_UNSCALED_DECIMAL16;
     }
     return DecimalValue<RESULT_T>(r);
   } else {
@@ -533,12 +535,12 @@ inline DecimalValue<RESULT_T> DecimalValue<T>::Divide(int this_scale,
         // In addition, we know the dividend is non-zero, since there was a remainder.
         // The two conditions combined mean that the result must also be non-zero.
         DCHECK(r != 0);
-        r += BitUtil::Sign(r);
+        r += Sign(r);
       }
     }
-    DCHECK(abs(r) <= DecimalUtil::MAX_UNSCALED_DECIMAL16 &&
-        (sizeof(RESULT_T) > 8 || abs(r) <= DecimalUtil::MAX_UNSCALED_DECIMAL8) &&
-        (sizeof(RESULT_T) > 4 || abs(r) <= DecimalUtil::MAX_UNSCALED_DECIMAL4));
+    DCHECK(abs(r) <= MAX_UNSCALED_DECIMAL16 &&
+        (sizeof(RESULT_T) > 8 || abs(r) <= MAX_UNSCALED_DECIMAL8) &&
+        (sizeof(RESULT_T) > 4 || abs(r) <= MAX_UNSCALED_DECIMAL4));
     return DecimalValue<RESULT_T>(static_cast<RESULT_T>(r));
   }
 }
@@ -557,7 +559,7 @@ inline DecimalValue<RESULT_T> DecimalValue<T>::Mod(int this_scale,
     case 4: {
       int64_t x, y;
       AdjustToSameScale(*this, this_scale, other, other_scale, result_precision, &x, &y);
-      DCHECK(abs(x % y) <= DecimalUtil::MAX_UNSCALED_DECIMAL4);
+      DCHECK(abs(x % y) <= MAX_UNSCALED_DECIMAL4);
       result = x % y;
       break;
     }
@@ -567,13 +569,13 @@ inline DecimalValue<RESULT_T> DecimalValue<T>::Mod(int this_scale,
         int64_t x, y;
         AdjustToSameScale(*this, this_scale, other, other_scale,
             result_precision, &x, &y);
-        DCHECK(abs(x % y) <= DecimalUtil::MAX_UNSCALED_DECIMAL8);
+        DCHECK(abs(x % y) <= MAX_UNSCALED_DECIMAL8);
         result = x % y;
       } else {
         int128_t x, y;
         AdjustToSameScale(*this, this_scale, other, other_scale,
             result_precision, &x, &y);
-        DCHECK(abs(x % y) <= DecimalUtil::MAX_UNSCALED_DECIMAL8);
+        DCHECK(abs(x % y) <= MAX_UNSCALED_DECIMAL8);
         result = x % y;
       }
       break;
@@ -584,7 +586,7 @@ inline DecimalValue<RESULT_T> DecimalValue<T>::Mod(int this_scale,
         int128_t x, y;
         AdjustToSameScale(*this, this_scale, other, other_scale,
             result_precision, &x, &y);
-        DCHECK(abs(x % y) <= DecimalUtil::MAX_UNSCALED_DECIMAL16);
+        DCHECK(abs(x % y) <= MAX_UNSCALED_DECIMAL16);
         result = x % y;
       } else {
         int256_t x_256 = ConvertToInt256(value());
@@ -597,7 +599,7 @@ inline DecimalValue<RESULT_T> DecimalValue<T>::Mod(int this_scale,
         int256_t intermediate_result = x_256 % y_256;
         bool ovf = false;
         result = ConvertToInt128(intermediate_result,
-            DecimalUtil::MAX_UNSCALED_DECIMAL16, &ovf);
+            MAX_UNSCALED_DECIMAL16, &ovf);
         DCHECK(!ovf);
       }
       break;
diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h
index c056b18..c4e1dc9 100644
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -17,23 +17,23 @@
 
 #pragma once
 
+#include <cstdint>
 #include <iosfwd>
+#include <map>
 #include <unordered_map>
+#include <utility>
 #include <vector>
-#include <boost/scoped_ptr.hpp>
 
 #include "codegen/impala-ir.h"
 #include "common/global-types.h"
 #include "common/status.h"
 #include "runtime/types.h"
 
-#include "gen-cpp/Descriptors_types.h"  // for TTupleId
+#include "gen-cpp/CatalogObjects_types.h"
 #include "gen-cpp/Types_types.h"
 
 namespace llvm {
   class Constant;
-  class Function;
-  class PointerType;
   class StructType;
   class Value;
 };
@@ -43,14 +43,14 @@ namespace impala {
 class LlvmBuilder;
 class LlvmCodeGen;
 class ObjectPool;
-class RuntimeState;
-class ScalarExpr;
 class ScalarExprEvaluator;
+class TColumnDescriptor;
 class TDescriptorTable;
+class TDescriptorTableSerialized;
+class TExpr;
 class TSlotDescriptor;
-class TTable;
-class TTupleDescriptor;
 class TTableDescriptor;
+class TTupleDescriptor;
 
 /// A path into a table schema (e.g. a vector of ColumnTypes) pointing to a particular
 /// column/field. The i-th element of the path is the ordinal position of the column/field
diff --git a/be/src/runtime/exec-env.cc b/be/src/runtime/exec-env.cc
index 65f12cd..a7bbaf7 100644
--- a/be/src/runtime/exec-env.cc
+++ b/be/src/runtime/exec-env.cc
@@ -24,6 +24,7 @@
 #include <gutil/strings/substitute.h>
 #include <kudu/client/client.h>
 
+#include "catalog/catalog-service-client-wrapper.h"
 #include "common/logging.h"
 #include "common/object-pool.h"
 #include "exec/kudu-util.h"
diff --git a/be/src/runtime/io/data-cache-test.cc b/be/src/runtime/io/data-cache-test.cc
index 416dc2a..b335677 100644
--- a/be/src/runtime/io/data-cache-test.cc
+++ b/be/src/runtime/io/data-cache-test.cc
@@ -19,7 +19,6 @@
 #include <boost/bind.hpp>
 #include <fstream>
 #include <gflags/gflags.h>
-#include <iostream>
 #include <rapidjson/document.h>
 #include <sys/sysinfo.h>
 
diff --git a/be/src/runtime/io/disk-io-mgr-test.cc b/be/src/runtime/io/disk-io-mgr-test.cc
index 71d2fa6..2cf4642 100644
--- a/be/src/runtime/io/disk-io-mgr-test.cc
+++ b/be/src/runtime/io/disk-io-mgr-test.cc
@@ -20,8 +20,6 @@
 #include <boost/thread/thread.hpp>
 #include <sys/stat.h>
 
-#include "codegen/llvm-codegen.h"
-#include "common/init.h"
 #include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/bufferpool/reservation-tracker.h"
 #include "runtime/io/cache-reader-test-stub.h"
@@ -30,14 +28,11 @@
 #include "runtime/io/disk-io-mgr.h"
 #include "runtime/io/request-context.h"
 #include "runtime/test-env.h"
-#include "runtime/thread-resource-mgr.h"
-#include "service/fe-support.h"
 #include "testutil/gtest-util.h"
 #include "testutil/rand-util.h"
 #include "testutil/scoped-flag-setter.h"
 #include "util/condition-variable.h"
-#include "util/cpu-info.h"
-#include "util/disk-info.h"
+#include "util/debug-util.h"
 #include "util/thread.h"
 #include "util/time.h"
 
diff --git a/be/src/runtime/mem-tracker.h b/be/src/runtime/mem-tracker.h
index a58f2a2..fc9c98e 100644
--- a/be/src/runtime/mem-tracker.h
+++ b/be/src/runtime/mem-tracker.h
@@ -17,19 +17,24 @@
 
 #pragma once
 
-#include <stdint.h>
-#include <map>
+#include <cstdint>
+#include <functional>
+#include <list>
 #include <memory>
 #include <mutex>
+#include <ostream>
 #include <queue>
+#include <string>
+#include <utility>
 #include <vector>
+
 #include <boost/unordered_map.hpp>
 
-#include "common/logging.h"
 #include "common/atomic.h"
+#include "common/compiler-util.h"
+#include "common/logging.h"
+#include "common/status.h"
 #include "runtime/mem-tracker-types.h"
-#include "util/debug-util.h"
-#include "util/internal-queue.h"
 #include "util/metrics-fwd.h"
 #include "util/runtime-profile-counters.h"
 #include "util/spinlock.h"
@@ -38,9 +43,10 @@
 
 namespace impala {
 
+class MetricGroup;
 class ObjectPool;
 struct ReservationTrackerCounters;
-class TQueryOptions;
+class RuntimeState;
 
 /// A MemTracker tracks memory consumption; it contains an optional limit
 /// and can be arranged into a tree structure such that the consumption tracked
diff --git a/be/src/runtime/multi-precision.cc b/be/src/runtime/multi-precision.cc
deleted file mode 100644
index e7c2cf3..0000000
--- a/be/src/runtime/multi-precision.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/multi-precision.h"
-
-#include "common/logging.h"
-
-#include "common/names.h"
-
-namespace impala {
-
-static const uint32_t ONE_BILLION = 1000000000;
-
-// Print the value in base 10 by converting v into parts that are base
-// 1 billion (large multiple of 10 that's easy to work with).
-ostream& operator<<(ostream& os, const int128_t& val) {
-  int128_t v = val;
-  if (v == 0) {
-    os << "0";
-    return os;
-  }
-
-  if (v < 0) {
-    v = -v;
-    os << "-";
-  }
-
-  // 1B^5 covers the range for int128_t
-  // parts[0] is the least significant place.
-  uint32_t parts[5];
-  int index = 0;
-  while (v > 0) {
-    parts[index++] = v % ONE_BILLION;
-    v /= ONE_BILLION;
-  }
-  --index;
-
-  // Accumulate into a temporary stringstream so format options on 'os' do
-  // not mess up printing val.
-  // TODO: This is likely pretty expensive with the string copies. We don't
-  // do this in paths we care about currently but might need to revisit.
-  stringstream ss;
-  ss << parts[index];
-  for (int i = index - 1; i >= 0; --i) {
-    // The remaining parts need to be padded with leading zeros.
-    ss << setfill('0') << setw(9) << parts[i];
-  }
-  os << ss.str();
-  return os;
-}
-
-}
diff --git a/be/src/runtime/multi-precision.h b/be/src/runtime/multi-precision.h
index 5abc6a3..3645d0d 100644
--- a/be/src/runtime/multi-precision.h
+++ b/be/src/runtime/multi-precision.h
@@ -128,8 +128,40 @@ inline uint64_t LowBits(int128_t x) {
   return x & 0xffffffffffffffff;
 }
 
-/// Prints v in base 10.
-std::ostream& operator<<(std::ostream& os, const int128_t& val);
+// Doubles the width of integer types (e.g. int32_t -> int64_t).
+// Currently only works with a few signed types.
+// Feel free to extend it to other types as well.
+template <typename T>
+struct DoubleWidth {};
+
+template <>
+struct DoubleWidth<int32_t> {
+  using type = int64_t;
+};
+
+template <>
+struct DoubleWidth<int64_t> {
+  using type = int128_t;
+};
+
+template <>
+struct DoubleWidth<int128_t> {
+  using type = int256_t;
+};
+
+/// Return an integer signifying the sign of the value, returning +1 for
+/// positive integers (and zero), -1 for negative integers.
+/// The extra shift is to silence GCC warnings about full width shift on
+/// unsigned types. It compiles out in optimized builds into the expected increment.
+template<typename T>
+constexpr static inline T Sign(T value) {
+  return 1 | ((value >> (ArithmeticUtil::UnsignedWidth<T>() - 1)) >> 1);
+}
+
+template<>
+inline int256_t Sign(int256_t value) {
+  return value < 0 ? -1 : 1;
+}
 
 }
 
diff --git a/be/src/runtime/query-state.cc b/be/src/runtime/query-state.cc
index 432d43f..a07e737 100644
--- a/be/src/runtime/query-state.cc
+++ b/be/src/runtime/query-state.cc
@@ -40,6 +40,7 @@
 #include "runtime/runtime-filter-bank.h"
 #include "runtime/runtime-state.h"
 #include "runtime/scanner-mem-limiter.h"
+#include "runtime/tmp-file-mgr.h"
 #include "service/control-service.h"
 #include "service/data-stream-service.h"
 #include "util/container-util.h"
@@ -251,7 +252,7 @@ Status QueryState::InitBufferPoolState() {
 
   if (query_options().scratch_limit != 0 && !query_ctx_.disable_spilling) {
     file_group_ = obj_pool_.Add(
-        new TmpFileMgr::FileGroup(exec_env->tmp_file_mgr(), exec_env->disk_io_mgr(),
+        new TmpFileGroup(exec_env->tmp_file_mgr(), exec_env->disk_io_mgr(),
             host_profile_, query_id(), query_options().scratch_limit));
   }
   return Status::OK();
diff --git a/be/src/runtime/query-state.h b/be/src/runtime/query-state.h
index 43c204b..eb6938c 100644
--- a/be/src/runtime/query-state.h
+++ b/be/src/runtime/query-state.h
@@ -32,7 +32,6 @@
 #include "gen-cpp/control_service.pb.h"
 #include "gutil/macros.h"
 #include "gutil/threading/thread_collision_warner.h" // for DFAKE_*
-#include "runtime/tmp-file-mgr.h"
 #include "util/counting-barrier.h"
 #include "util/spinlock.h"
 #include "util/unique-id-hash.h"
@@ -56,6 +55,7 @@ class RuntimeFilterBank;
 class RuntimeProfile;
 class RuntimeState;
 class ScannerMemLimiter;
+class TmpFileGroup;
 class TRuntimeProfileForest;
 
 /// Central class for all backend execution state (example: the FragmentInstanceStates
@@ -156,7 +156,7 @@ class QueryState {
     return buffer_reservation_;
   }
   InitialReservations* initial_reservations() const { return initial_reservations_; }
-  TmpFileMgr::FileGroup* file_group() const {
+  TmpFileGroup* file_group() const {
     DCHECK_GT(backend_resource_refcnt_.Load(), 0);
     return file_group_;
   }
@@ -350,7 +350,7 @@ class QueryState {
 
   /// Temporary files for this query (owned by obj_pool_). Non-null if spilling is
   /// enabled. Set in Prepare().
-  TmpFileMgr::FileGroup* file_group_ = nullptr;
+  TmpFileGroup* file_group_ = nullptr;
 
   /// Manages runtime filters that are either produced or consumed (or both!) by plan
   /// nodes on this backend.
diff --git a/be/src/runtime/runtime-filter-bank.cc b/be/src/runtime/runtime-filter-bank.cc
index 5b02ceb..f641585 100644
--- a/be/src/runtime/runtime-filter-bank.cc
+++ b/be/src/runtime/runtime-filter-bank.cc
@@ -40,6 +40,7 @@
 #include "service/impala-server.h"
 #include "util/bit-util.h"
 #include "util/bloom-filter.h"
+#include "util/debug-util.h"
 #include "util/min-max-filter.h"
 #include "util/pretty-printer.h"
 #include "util/uid-util.h"
diff --git a/be/src/runtime/sorted-run-merger.cc b/be/src/runtime/sorted-run-merger.cc
index 64feeb7..2331ab7 100644
--- a/be/src/runtime/sorted-run-merger.cc
+++ b/be/src/runtime/sorted-run-merger.cc
@@ -18,6 +18,7 @@
 #include "runtime/sorter.h"
 #include "runtime/tuple-row.h"
 #include "util/runtime-profile-counters.h"
+#include "util/tuple-row-compare.h"
 
 #include "common/names.h"
 
diff --git a/be/src/runtime/sorted-run-merger.h b/be/src/runtime/sorted-run-merger.h
index 7170e8f..7834b9c 100644
--- a/be/src/runtime/sorted-run-merger.h
+++ b/be/src/runtime/sorted-run-merger.h
@@ -21,13 +21,13 @@
 #include <boost/scoped_ptr.hpp>
 
 #include "common/object-pool.h"
-#include "util/tuple-row-compare.h"
+#include "util/runtime-profile.h"
 
 namespace impala {
 
 class RowBatch;
 class RowDescriptor;
-class RuntimeProfile;
+class TupleRowComparator;
 
 /// SortedRunMerger is used to merge multiple sorted runs of tuples. A run is a sorted
 /// sequence of row batches, which are fetched from a RunBatchSupplierFn function object.
diff --git a/be/src/runtime/sorter.cc b/be/src/runtime/sorter.cc
index 006e409..339e0b9 100644
--- a/be/src/runtime/sorter.cc
+++ b/be/src/runtime/sorter.cc
@@ -22,6 +22,7 @@
 #include <boost/random/uniform_int.hpp>
 #include <gutil/strings/substitute.h>
 
+#include "exprs/scalar-expr-evaluator.h"
 #include "runtime/bufferpool/reservation-tracker.h"
 #include "runtime/exec-env.h"
 #include "runtime/mem-tracker.h"
diff --git a/be/src/runtime/sorter.h b/be/src/runtime/sorter.h
index 3b56b2c..f70fa1e 100644
--- a/be/src/runtime/sorter.h
+++ b/be/src/runtime/sorter.h
@@ -21,12 +21,12 @@
 #include <deque>
 
 #include "runtime/bufferpool/buffer-pool.h"
+#include "util/runtime-profile.h"
 #include "util/tuple-row-compare.h"
 
 namespace impala {
 
 class SortedRunMerger;
-class RuntimeProfile;
 class RowBatch;
 
 /// Sorter contains the external sort implementation. Its purpose is to sort arbitrarily
diff --git a/be/src/runtime/test-env.cc b/be/src/runtime/test-env.cc
index e1c20cb..321529c 100644
--- a/be/src/runtime/test-env.cc
+++ b/be/src/runtime/test-env.cc
@@ -22,6 +22,8 @@
 
 #include "gutil/strings/substitute.h"
 #include "rpc/rpc-mgr.h"
+#include "runtime/fragment-instance-state.h"
+#include "runtime/mem-tracker.h"
 #include "runtime/query-exec-mgr.h"
 #include "runtime/query-state.h"
 #include "runtime/tmp-file-mgr.h"
diff --git a/be/src/runtime/test-env.h b/be/src/runtime/test-env.h
index d146c8c..ce5e354 100644
--- a/be/src/runtime/test-env.h
+++ b/be/src/runtime/test-env.h
@@ -20,8 +20,6 @@
 
 #include "runtime/io/disk-io-mgr.h"
 #include "runtime/exec-env.h"
-#include "runtime/fragment-instance-state.h"
-#include "runtime/mem-tracker.h"
 #include "runtime/runtime-state.h"
 
 namespace impala {
diff --git a/be/src/runtime/timestamp-parse-util.cc b/be/src/runtime/timestamp-parse-util.cc
index 9c6bac0..63340ea 100644
--- a/be/src/runtime/timestamp-parse-util.cc
+++ b/be/src/runtime/timestamp-parse-util.cc
@@ -17,7 +17,25 @@
 
 #include "runtime/timestamp-parse-util.h"
 
-#include "common/names.h"
+#include <algorithm>
+#include <cctype>
+#include <cstdint>
+#include <ostream>
+#include <vector>
+
+#include <boost/date_time/date.hpp>
+#include <boost/date_time/gregorian/greg_calendar.hpp>
+#include <boost/date_time/gregorian/greg_duration.hpp>
+#include <boost/date_time/gregorian_calendar.hpp>
+#include <boost/date_time/posix_time/posix_time_config.hpp>
+#include <boost/date_time/posix_time/posix_time_duration.hpp>
+#include <boost/date_time/posix_time/ptime.hpp>
+#include <boost/date_time/special_defs.hpp>
+#include <boost/date_time/time.hpp>
+#include <boost/date_time/time_duration.hpp>
+#include <boost/date_time/time_system_split.hpp>
+#include <boost/exception/exception.hpp>
+
 #include "runtime/datetime-iso-sql-format-parser.h"
 #include "runtime/datetime-simple-date-format-parser.h"
 #include "runtime/date-value.h"
@@ -26,7 +44,7 @@
 #include "udf/udf-internal.h"
 #include "util/string-parser.h"
 
-#include "cctype"
+#include "common/names.h"
 
 using boost::gregorian::date;
 using boost::gregorian::date_duration;
diff --git a/be/src/runtime/timestamp-test.cc b/be/src/runtime/timestamp-test.cc
index 463b8d3..c2de827 100644
--- a/be/src/runtime/timestamp-test.cc
+++ b/be/src/runtime/timestamp-test.cc
@@ -285,7 +285,7 @@ void TestFromSubSecondFunctions(int64_t seconds, int64_t millis, const char* exp
   }
 
   // Test UtcFromUnixTimeLimitedRangeNanos only for timestamps that fit to its range.
-  int128_t total_nanos = int128_t {seconds} * NANOS_PER_SEC + millis * 1000 * 1000;
+  __int128_t total_nanos = __int128_t {seconds} * NANOS_PER_SEC + millis * 1000 * 1000;
   if (std::numeric_limits<int64_t>::min() >= total_nanos &&
       std::numeric_limits<int64_t>::max() <= total_nanos) {
     EXPECT_EQ(from_millis,
diff --git a/be/src/runtime/timestamp-value.h b/be/src/runtime/timestamp-value.h
index bdb64d5..0863518 100644
--- a/be/src/runtime/timestamp-value.h
+++ b/be/src/runtime/timestamp-value.h
@@ -15,18 +15,23 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#pragma once
 
-#ifndef IMPALA_RUNTIME_TIMESTAMP_VALUE_H
-#define IMPALA_RUNTIME_TIMESTAMP_VALUE_H
-
-#include <boost/date_time/compiler_config.hpp>
-#include <boost/date_time/gregorian/gregorian.hpp>
-#include <boost/date_time/local_time/local_time.hpp>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
 #include <string>
 
+#include <boost/date_time/gregorian/greg_date.hpp>
+#include <boost/date_time/posix_time/posix_time_config.hpp>
+#include <boost/date_time/posix_time/ptime.hpp>
+#include <boost/date_time/special_defs.hpp>
+#include <boost/date_time/time_duration.hpp>
+
+#include "common/compiler-util.h"
 #include "common/global-types.h"
-#include "gen-cpp/Data_types.h"
-#include "gen-cpp/data_stream_service.pb.h"
+#include "common/logging.h"
+#include "gen-cpp/common.pb.h"
 #include "udf/udf.h"
 #include "util/hash-util.h"
 
@@ -379,5 +384,3 @@ inline std::size_t hash_value(const TimestampValue& v) {
 
 std::ostream& operator<<(std::ostream& os, const TimestampValue& timestamp_value);
 }
-
-#endif
diff --git a/be/src/runtime/tmp-file-mgr-internal.h b/be/src/runtime/tmp-file-mgr-internal.h
index 59d0163..9c94144 100644
--- a/be/src/runtime/tmp-file-mgr-internal.h
+++ b/be/src/runtime/tmp-file-mgr-internal.h
@@ -24,17 +24,18 @@
 
 namespace impala {
 
-/// File is a handle to a physical file in a temporary directory. File space
+/// TmpFile is a handle to a physical file in a temporary directory. File space
 /// can be allocated and files removed using AllocateSpace() and Remove(). Used
 /// internally by TmpFileMgr.
 ///
 /// Creation of the physical file in the file system is deferred until the file is
 /// written by DiskIoMgr.
 ///
-/// Methods of File are not thread-safe.
-class TmpFileMgr::File {
+/// Methods of TmpFile are not thread-safe.
+class TmpFile {
  public:
-  File(FileGroup* file_group, DeviceId device_id, const std::string& path);
+  TmpFile(TmpFileGroup* file_group, TmpFileMgr::DeviceId device_id,
+      const std::string& path);
 
   /// Allocates 'num_bytes' bytes in this file for a new block of data if there is
   /// free capacity in this temporary directory. If there is insufficient capacity,
@@ -72,14 +73,14 @@ class TmpFileMgr::File {
   /// directory. A warning is issued if available space is less than this threshold.
   const static uint64_t AVAILABLE_SPACE_THRESHOLD_MB;
 
-  /// The FileGroup this belongs to. Cannot be null.
-  FileGroup* const file_group_;
+  /// The TmpFileGroup this belongs to. Cannot be null.
+  TmpFileGroup* const file_group_;
 
   /// Path of the physical file in the filesystem.
   const std::string path_;
 
   /// The temporary device this file is stored on.
-  const DeviceId device_id_;
+  const TmpFileMgr::DeviceId device_id_;
 
   /// The id of the disk on which the physical file lies.
   const int disk_id_;
@@ -92,7 +93,7 @@ class TmpFileMgr::File {
   bool blacklisted_;
 
   /// Helper to get the TmpDir that this file is associated with.
-  TmpDir* GetDir();
+  TmpFileMgr::TmpDir* GetDir();
 };
 } // namespace impala
 
diff --git a/be/src/runtime/tmp-file-mgr-test.cc b/be/src/runtime/tmp-file-mgr-test.cc
index 70a59f2..7f217a6 100644
--- a/be/src/runtime/tmp-file-mgr-test.cc
+++ b/be/src/runtime/tmp-file-mgr-test.cc
@@ -130,11 +130,11 @@ class TmpFileMgrTest : public ::testing::Test {
   /// Helper to call the private CreateFiles() method and return
   /// the created files.
   static Status CreateFiles(
-      TmpFileMgr::FileGroup* group, vector<TmpFileMgr::File*>* files) {
+      TmpFileGroup* group, vector<TmpFile*>* files) {
     // The method expects the lock to be held.
     lock_guard<SpinLock> lock(group->lock_);
     RETURN_IF_ERROR(group->CreateFiles());
-    for (unique_ptr<TmpFileMgr::File>& file : group->tmp_files_) {
+    for (unique_ptr<TmpFile>& file : group->tmp_files_) {
       files->push_back(file.get());
     }
     return Status::OK();
@@ -146,38 +146,38 @@ class TmpFileMgrTest : public ::testing::Test {
   }
 
   /// Helper to call the private TmpFileMgr::NewFile() method.
-  static void NewFile(TmpFileMgr* mgr, TmpFileMgr::FileGroup* group,
-      TmpFileMgr::DeviceId device_id, unique_ptr<TmpFileMgr::File>* new_file) {
+  static void NewFile(TmpFileMgr* mgr, TmpFileGroup* group,
+      TmpFileMgr::DeviceId device_id, unique_ptr<TmpFile>* new_file) {
     mgr->NewFile(group, device_id, new_file);
   }
 
   /// Helper to call the private TmpFile::AllocateSpace() method.
   static void FileAllocateSpace(
-      TmpFileMgr::File* file, int64_t num_bytes, int64_t* offset) {
+      TmpFile* file, int64_t num_bytes, int64_t* offset) {
     file->AllocateSpace(num_bytes, offset);
   }
 
   /// Helper to call the private TmpFileGroup::AllocateSpace() method.
-  static Status GroupAllocateSpace(TmpFileMgr::FileGroup* group, int64_t num_bytes,
-      TmpFileMgr::File** file, int64_t* offset) {
+  static Status GroupAllocateSpace(TmpFileGroup* group, int64_t num_bytes,
+      TmpFile** file, int64_t* offset) {
     return group->AllocateSpace(num_bytes, file, offset);
   }
 
   /// Helper to set TmpFileGroup::next_allocation_index_.
-  static void SetNextAllocationIndex(TmpFileMgr::FileGroup* group, int value) {
+  static void SetNextAllocationIndex(TmpFileGroup* group, int value) {
     group->next_allocation_index_ = value;
   }
 
   /// Helper to cancel the FileGroup RequestContext.
-  static void CancelIoContext(TmpFileMgr::FileGroup* group) {
+  static void CancelIoContext(TmpFileGroup* group) {
     group->io_ctx_->Cancel();
   }
 
   /// Helper to get the # of bytes allocated by the group. Validates that the sum across
   /// all files equals this total.
-  static int64_t BytesAllocated(TmpFileMgr::FileGroup* group) {
+  static int64_t BytesAllocated(TmpFileGroup* group) {
     int64_t bytes_allocated = 0;
-    for (unique_ptr<TmpFileMgr::File>& file : group->tmp_files_) {
+    for (unique_ptr<TmpFile>& file : group->tmp_files_) {
       bytes_allocated += file->bytes_allocated_;
     }
     EXPECT_EQ(bytes_allocated, group->current_bytes_allocated_);
@@ -185,8 +185,8 @@ class TmpFileMgrTest : public ::testing::Test {
   }
 
   /// Helpers to call WriteHandle methods.
-  void Cancel(TmpFileMgr::WriteHandle* handle) { handle->Cancel(); }
-  void WaitForWrite(TmpFileMgr::WriteHandle* handle) {
+  void Cancel(TmpWriteHandle* handle) { handle->Cancel(); }
+  void WaitForWrite(TmpWriteHandle* handle) {
     handle->WaitForWrite();
   }
 
@@ -228,17 +228,17 @@ TEST_F(TmpFileMgrTest, TestFileAllocation) {
   TmpFileMgr tmp_file_mgr;
   ASSERT_OK(tmp_file_mgr.Init(metrics_.get()));
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(
+  TmpFileGroup file_group(
       &tmp_file_mgr, io_mgr(), profile_, id, 1024 * 1024 * 8);
 
   // Default configuration should give us one temporary device.
   EXPECT_EQ(1, tmp_file_mgr.NumActiveTmpDevices());
   vector<TmpFileMgr::DeviceId> tmp_devices = tmp_file_mgr.ActiveTmpDevices();
   EXPECT_EQ(1, tmp_devices.size());
-  vector<TmpFileMgr::File*> files;
+  vector<TmpFile*> files;
   ASSERT_OK(CreateFiles(&file_group, &files));
   EXPECT_EQ(1, files.size());
-  TmpFileMgr::File* file = files[0];
+  TmpFile* file = files[0];
   // Apply writes of variable sizes and check space was allocated correctly.
   int64_t write_sizes[] = {1, 10, 1024, 4, 1024 * 1024 * 8, 1024 * 1024 * 8, 16, 10};
   int num_write_sizes = sizeof(write_sizes) / sizeof(write_sizes[0]);
@@ -269,16 +269,16 @@ TEST_F(TmpFileMgrTest, TestOneDirPerDevice) {
   TmpFileMgr tmp_file_mgr;
   ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, true, metrics_.get()));
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
+  TmpFileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
 
   // Only the first directory should be used.
   EXPECT_EQ(1, tmp_file_mgr.NumActiveTmpDevices());
   vector<TmpFileMgr::DeviceId> devices = tmp_file_mgr.ActiveTmpDevices();
   EXPECT_EQ(1, devices.size());
-  vector<TmpFileMgr::File*> files;
+  vector<TmpFile*> files;
   ASSERT_OK(CreateFiles(&file_group, &files));
   EXPECT_EQ(1, files.size());
-  TmpFileMgr::File* file = files[0];
+  TmpFile* file = files[0];
   // Check the prefix is the expected temporary directory.
   EXPECT_EQ(0, file->path().find(tmp_dirs[0]));
   ASSERT_OK(FileSystemUtil::RemovePaths(tmp_dirs));
@@ -293,14 +293,14 @@ TEST_F(TmpFileMgrTest, TestMultiDirsPerDevice) {
   TmpFileMgr tmp_file_mgr;
   ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, false, metrics_.get()));
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
+  TmpFileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
 
   // Both directories should be used.
   EXPECT_EQ(2, tmp_file_mgr.NumActiveTmpDevices());
   vector<TmpFileMgr::DeviceId> devices = tmp_file_mgr.ActiveTmpDevices();
   EXPECT_EQ(2, devices.size());
 
-  vector<TmpFileMgr::File*> files;
+  vector<TmpFile*> files;
   ASSERT_OK(CreateFiles(&file_group, &files));
   EXPECT_EQ(2, files.size());
   for (int i = 0; i < 2; ++i) {
@@ -321,7 +321,7 @@ TEST_F(TmpFileMgrTest, TestReportError) {
   TmpFileMgr tmp_file_mgr;
   ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, false, metrics_.get()));
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
+  TmpFileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
 
   // Both directories should be used.
   vector<TmpFileMgr::DeviceId> devices = tmp_file_mgr.ActiveTmpDevices();
@@ -330,11 +330,11 @@ TEST_F(TmpFileMgrTest, TestReportError) {
 
   // Inject an error on one device so that we can validate it is handled correctly.
   int good_device = 0, bad_device = 1;
-  vector<TmpFileMgr::File*> files;
+  vector<TmpFile*> files;
   ASSERT_OK(CreateFiles(&file_group, &files));
   ASSERT_EQ(2, files.size());
-  TmpFileMgr::File* good_file = files[good_device];
-  TmpFileMgr::File* bad_file = files[bad_device];
+  TmpFile* good_file = files[good_device];
+  TmpFile* bad_file = files[bad_device];
   ErrorMsg errmsg(TErrorCode::GENERAL, "A fake error");
   bad_file->Blacklist(errmsg);
 
@@ -352,7 +352,7 @@ TEST_F(TmpFileMgrTest, TestReportError) {
   // The good device should still be usable.
   FileAllocateSpace(good_file, 128, &offset);
   // Attempts to allocate new files on bad device should succeed.
-  unique_ptr<TmpFileMgr::File> bad_file2;
+  unique_ptr<TmpFile> bad_file2;
   NewFile(&tmp_file_mgr, &file_group, bad_device, &bad_file2);
   ASSERT_OK(FileSystemUtil::RemovePaths(tmp_dirs));
   file_group.Close();
@@ -370,9 +370,9 @@ TEST_F(TmpFileMgrTest, TestAllocateNonWritable) {
   TmpFileMgr tmp_file_mgr;
   ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, false, metrics_.get()));
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
+  TmpFileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
 
-  vector<TmpFileMgr::File*> allocated_files;
+  vector<TmpFile*> allocated_files;
   ASSERT_OK(CreateFiles(&file_group, &allocated_files));
   int64_t offset;
   FileAllocateSpace(allocated_files[0], 1, &offset);
@@ -400,15 +400,15 @@ TEST_F(TmpFileMgrTest, TestScratchLimit) {
   // A power-of-two so that FileGroup allocates exactly this amount of scratch space.
   const int64_t ALLOC_SIZE = 64;
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id, LIMIT);
+  TmpFileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id, LIMIT);
 
-  vector<TmpFileMgr::File*> files;
+  vector<TmpFile*> files;
   ASSERT_OK(CreateFiles(&file_group, &files));
 
   // Test individual limit is enforced.
   Status status;
   int64_t offset;
-  TmpFileMgr::File* alloc_file;
+  TmpFile* alloc_file;
 
   // Alloc from file 1 should succeed.
   SetNextAllocationIndex(&file_group, 0);
@@ -441,7 +441,7 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
   ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dirs, false, metrics_.get()));
   TUniqueId id;
 
-  TmpFileMgr::FileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
+  TmpFileGroup file_group(&tmp_file_mgr, io_mgr(), profile_, id);
   int64_t expected_scratch_bytes_allocated = 0;
   // Test some different allocation sizes.
   checkHWMMetrics(0, 0);
@@ -456,7 +456,7 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
 
     WriteRange::WriteDoneCallback callback =
         bind(mem_fn(&TmpFileMgrTest::SignalCallback), this, _1);
-    vector<unique_ptr<TmpFileMgr::WriteHandle>> handles(BLOCKS);
+    vector<unique_ptr<TmpWriteHandle>> handles(BLOCKS);
     // 'file_group' should allocate extra scratch bytes for this 'alloc_size'.
     expected_scratch_bytes_allocated += alloc_size * BLOCKS;
     const int TEST_ITERS = 5;
@@ -493,7 +493,7 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
 // internal invariants of TmpFileMgr to be broken on error path.
 TEST_F(TmpFileMgrTest, TestProcessMemLimitExceeded) {
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(test_env_->tmp_file_mgr(), io_mgr(), profile_, id);
+  TmpFileGroup file_group(test_env_->tmp_file_mgr(), io_mgr(), profile_, id);
 
   const int DATA_SIZE = 64;
   vector<uint8_t> data(DATA_SIZE);
@@ -504,7 +504,7 @@ TEST_F(TmpFileMgrTest, TestProcessMemLimitExceeded) {
   // After this error, writing via the file group should fail.
   WriteRange::WriteDoneCallback callback =
       bind(mem_fn(&TmpFileMgrTest::SignalCallback), this, _1);
-  unique_ptr<TmpFileMgr::WriteHandle> handle;
+  unique_ptr<TmpWriteHandle> handle;
   Status status = file_group.Write(MemRange(data.data(), DATA_SIZE), callback, &handle);
   EXPECT_EQ(TErrorCode::CANCELLED_INTERNALLY, status.code());
   file_group.Close();
@@ -520,7 +520,7 @@ TEST_F(TmpFileMgrTest, TestEncryptionDuringCancellation) {
   FLAGS_stress_scratch_write_delay_ms = 1000;
 #endif
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(test_env_->tmp_file_mgr(), io_mgr(), profile_, id);
+  TmpFileGroup file_group(test_env_->tmp_file_mgr(), io_mgr(), profile_, id);
 
   // Make the data fairly large so that we have a better chance of cancelling while the
   // write is in flight
@@ -535,7 +535,7 @@ TEST_F(TmpFileMgrTest, TestEncryptionDuringCancellation) {
   }
 
   // Start a write in flight, which should encrypt the data and write it to disk.
-  unique_ptr<TmpFileMgr::WriteHandle> handle;
+  unique_ptr<TmpWriteHandle> handle;
   WriteRange::WriteDoneCallback callback =
       bind(mem_fn(&TmpFileMgrTest::SignalCallback), this, _1);
   ASSERT_OK(file_group.Write(data_mem_range, callback, &handle));
@@ -572,12 +572,12 @@ TEST_F(TmpFileMgrTest, TestBlockVerificationGcmDisabled) {
 void TmpFileMgrTest::TestBlockVerification() {
   FLAGS_disk_spill_encryption = true;
   TUniqueId id;
-  TmpFileMgr::FileGroup file_group(test_env_->tmp_file_mgr(), io_mgr(), profile_, id);
+  TmpFileGroup file_group(test_env_->tmp_file_mgr(), io_mgr(), profile_, id);
   string data = "the quick brown fox jumped over the lazy dog";
   MemRange data_mem_range(reinterpret_cast<uint8_t*>(&data[0]), data.size());
 
   // Start a write in flight, which should encrypt the data and write it to disk.
-  unique_ptr<TmpFileMgr::WriteHandle> handle;
+  unique_ptr<TmpWriteHandle> handle;
   WriteRange::WriteDoneCallback callback =
       bind(mem_fn(&TmpFileMgrTest::SignalCallback), this, _1);
   ASSERT_OK(file_group.Write(data_mem_range, callback, &handle));
@@ -626,17 +626,17 @@ TEST_F(TmpFileMgrTest, TestHWMMetric) {
   // A power-of-two so that FileGroup allocates exactly this amount of scratch space.
   const int64_t ALLOC_SIZE = 64;
   TUniqueId id_1;
-  TmpFileMgr::FileGroup file_group_1(&tmp_file_mgr, io_mgr(), profile_1, id_1, LIMIT);
+  TmpFileGroup file_group_1(&tmp_file_mgr, io_mgr(), profile_1, id_1, LIMIT);
   TUniqueId id_2;
-  TmpFileMgr::FileGroup file_group_2(&tmp_file_mgr, io_mgr(), profile_2, id_2, LIMIT);
+  TmpFileGroup file_group_2(&tmp_file_mgr, io_mgr(), profile_2, id_2, LIMIT);
 
-  vector<TmpFileMgr::File*> files;
+  vector<TmpFile*> files;
   ASSERT_OK(CreateFiles(&file_group_1, &files));
   ASSERT_OK(CreateFiles(&file_group_2, &files));
 
   Status status;
   int64_t offset;
-  TmpFileMgr::File* alloc_file;
+  TmpFile* alloc_file;
 
   // Alloc from file_group_1 and file_group_2 interleaving allocations.
   SetNextAllocationIndex(&file_group_1, 0);
@@ -680,12 +680,12 @@ TEST_F(TmpFileMgrTest, TestDirectoryLimits) {
   TmpFileMgr tmp_file_mgr;
   ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dir_specs, false, metrics_.get()));
 
-  TmpFileMgr::FileGroup file_group_1(
+  TmpFileGroup file_group_1(
       &tmp_file_mgr, io_mgr(), RuntimeProfile::Create(&obj_pool_, "p1"), TUniqueId());
-  TmpFileMgr::FileGroup file_group_2(
+  TmpFileGroup file_group_2(
       &tmp_file_mgr, io_mgr(), RuntimeProfile::Create(&obj_pool_, "p2"), TUniqueId());
 
-  vector<TmpFileMgr::File*> files;
+  vector<TmpFile*> files;
   ASSERT_OK(CreateFiles(&file_group_1, &files));
   ASSERT_OK(CreateFiles(&file_group_2, &files));
 
@@ -699,7 +699,7 @@ TEST_F(TmpFileMgrTest, TestDirectoryLimits) {
   // A power-of-two so that FileGroup allocates exactly this amount of scratch space.
   const int64_t ALLOC_SIZE = 512;
   int64_t offset;
-  TmpFileMgr::File* alloc_file;
+  TmpFile* alloc_file;
 
   // Allocate three times - once per directory. We expect these allocations to go through
   // so we should have one allocation in each directory.
@@ -757,12 +757,12 @@ TEST_F(TmpFileMgrTest, TestDirectoryLimitsExhausted) {
   TmpFileMgr tmp_file_mgr;
   ASSERT_OK(tmp_file_mgr.InitCustom(tmp_dir_specs, false, metrics_.get()));
 
-  TmpFileMgr::FileGroup file_group_1(
+  TmpFileGroup file_group_1(
       &tmp_file_mgr, io_mgr(), RuntimeProfile::Create(&obj_pool_, "p1"), TUniqueId());
-  TmpFileMgr::FileGroup file_group_2(
+  TmpFileGroup file_group_2(
       &tmp_file_mgr, io_mgr(), RuntimeProfile::Create(&obj_pool_, "p2"), TUniqueId());
 
-  vector<TmpFileMgr::File*> files;
+  vector<TmpFile*> files;
   ASSERT_OK(CreateFiles(&file_group_1, &files));
   ASSERT_OK(CreateFiles(&file_group_2, &files));
 
@@ -775,7 +775,7 @@ TEST_F(TmpFileMgrTest, TestDirectoryLimitsExhausted) {
   const int64_t ALLOC_SIZE = 512;
   const int64_t MAX_ALLOCATIONS = (DIR1_LIMIT + DIR2_LIMIT) / ALLOC_SIZE;
   int64_t offset;
-  TmpFileMgr::File* alloc_file;
+  TmpFile* alloc_file;
 
   // Allocate exactly the maximum total capacity of the directories.
   SetNextAllocationIndex(&file_group_1, 0);
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index 9875afa..d36d5c1 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -84,6 +84,10 @@ const string TMP_FILE_MGR_SCRATCH_SPACE_BYTES_USED =
 const string SCRATCH_DIR_BYTES_USED_FORMAT =
     "tmp-file-mgr.scratch-space-bytes-used.dir-$0";
 
+using DeviceId = TmpFileMgr::DeviceId;
+using TmpDir = TmpFileMgr::TmpDir;
+using WriteDoneCallback = TmpFileMgr::WriteDoneCallback;
+
 TmpFileMgr::TmpFileMgr()
   : initialized_(false),
     num_active_scratch_dirs_metric_(nullptr),
@@ -209,7 +213,7 @@ Status TmpFileMgr::InitCustom(const vector<string>& tmp_dir_specifiers,
 }
 
 void TmpFileMgr::NewFile(
-    FileGroup* file_group, DeviceId device_id, unique_ptr<File>* new_file) {
+    TmpFileGroup* file_group, DeviceId device_id, unique_ptr<TmpFile>* new_file) {
   DCHECK(initialized_);
   DCHECK_GE(device_id, 0);
   DCHECK_LT(device_id, tmp_dirs_.size());
@@ -221,7 +225,7 @@ void TmpFileMgr::NewFile(
   path new_file_path(tmp_dirs_[device_id].path);
   new_file_path /= file_name.str();
 
-  new_file->reset(new File(file_group, device_id, new_file_path.string()));
+  new_file->reset(new TmpFile(file_group, device_id, new_file_path.string()));
 }
 
 string TmpFileMgr::GetTmpDirPath(DeviceId device_id) const {
@@ -236,15 +240,15 @@ int TmpFileMgr::NumActiveTmpDevices() {
   return tmp_dirs_.size();
 }
 
-vector<TmpFileMgr::DeviceId> TmpFileMgr::ActiveTmpDevices() {
-  vector<TmpFileMgr::DeviceId> devices;
+vector<DeviceId> TmpFileMgr::ActiveTmpDevices() {
+  vector<DeviceId> devices;
   for (DeviceId device_id = 0; device_id < tmp_dirs_.size(); ++device_id) {
     devices.push_back(device_id);
   }
   return devices;
 }
 
-TmpFileMgr::File::File(FileGroup* file_group, DeviceId device_id, const string& path)
+TmpFile::TmpFile(TmpFileGroup* file_group, DeviceId device_id, const string& path)
   : file_group_(file_group),
     path_(path),
     device_id_(device_id),
@@ -254,7 +258,7 @@ TmpFileMgr::File::File(FileGroup* file_group, DeviceId device_id, const string&
   DCHECK(file_group != nullptr);
 }
 
-bool TmpFileMgr::File::AllocateSpace(int64_t num_bytes, int64_t* offset) {
+bool TmpFile::AllocateSpace(int64_t num_bytes, int64_t* offset) {
   DCHECK_GT(num_bytes, 0);
   TmpDir* dir = GetDir();
   // Increment optimistically and roll back if the limit is exceeded.
@@ -267,33 +271,33 @@ bool TmpFileMgr::File::AllocateSpace(int64_t num_bytes, int64_t* offset) {
   return true;
 }
 
-int TmpFileMgr::File::AssignDiskQueue() const {
+int TmpFile::AssignDiskQueue() const {
   return file_group_->io_mgr_->AssignQueue(path_.c_str(), disk_id_, false);
 }
 
-void TmpFileMgr::File::Blacklist(const ErrorMsg& msg) {
+void TmpFile::Blacklist(const ErrorMsg& msg) {
   LOG(ERROR) << "Error for temporary file '" << path_ << "': " << msg.msg();
   blacklisted_ = true;
 }
 
-Status TmpFileMgr::File::Remove() {
+Status TmpFile::Remove() {
   // Remove the file if present (it may not be present if no writes completed).
   Status status = FileSystemUtil::RemovePaths({path_});
   GetDir()->bytes_used_metric->Increment(-bytes_allocated_);
   return status;
 }
 
-TmpFileMgr::TmpDir* TmpFileMgr::File::GetDir() {
+TmpFileMgr::TmpDir* TmpFile::GetDir() {
   return &file_group_->tmp_file_mgr_->tmp_dirs_[device_id_];
 }
 
-string TmpFileMgr::File::DebugString() {
+string TmpFile::DebugString() {
   return Substitute("File $0 path '$1' device id $2 disk id $3 bytes allocated $4 "
       "blacklisted $5", this, path_, device_id_, disk_id_, bytes_allocated_,
       blacklisted_);
 }
 
-TmpFileMgr::FileGroup::FileGroup(TmpFileMgr* tmp_file_mgr, DiskIoMgr* io_mgr,
+TmpFileGroup::TmpFileGroup(TmpFileMgr* tmp_file_mgr, DiskIoMgr* io_mgr,
     RuntimeProfile* profile, const TUniqueId& unique_id, int64_t bytes_limit)
   : tmp_file_mgr_(tmp_file_mgr),
     io_mgr_(io_mgr),
@@ -315,19 +319,19 @@ TmpFileMgr::FileGroup::FileGroup(TmpFileMgr* tmp_file_mgr, DiskIoMgr* io_mgr,
   io_ctx_ = io_mgr_->RegisterContext();
 }
 
-TmpFileMgr::FileGroup::~FileGroup() {
+TmpFileGroup::~TmpFileGroup() {
   DCHECK_EQ(tmp_files_.size(), 0);
 }
 
-Status TmpFileMgr::FileGroup::CreateFiles() {
+Status TmpFileGroup::CreateFiles() {
   lock_.DCheckLocked();
   DCHECK(tmp_files_.empty());
   vector<DeviceId> tmp_devices = tmp_file_mgr_->ActiveTmpDevices();
   int files_allocated = 0;
   // Initialize the tmp files and the initial file to use.
   for (int i = 0; i < tmp_devices.size(); ++i) {
-    TmpFileMgr::DeviceId device_id = tmp_devices[i];
-    unique_ptr<TmpFileMgr::File> tmp_file;
+    DeviceId device_id = tmp_devices[i];
+    unique_ptr<TmpFile> tmp_file;
     tmp_file_mgr_->NewFile(this, device_id, &tmp_file);
     tmp_files_.emplace_back(std::move(tmp_file));
     ++files_allocated;
@@ -339,11 +343,11 @@ Status TmpFileMgr::FileGroup::CreateFiles() {
   return Status::OK();
 }
 
-void TmpFileMgr::FileGroup::Close() {
+void TmpFileGroup::Close() {
   // Cancel writes before deleting the files, since in-flight writes could re-create
   // deleted files.
   if (io_ctx_ != nullptr) io_mgr_->UnregisterContext(io_ctx_.get());
-  for (std::unique_ptr<TmpFileMgr::File>& file : tmp_files_) {
+  for (std::unique_ptr<TmpFile>& file : tmp_files_) {
     Status status = file->Remove();
     if (!status.ok()) {
       LOG(WARNING) << "Error removing scratch file '" << file->path()
@@ -356,8 +360,8 @@ void TmpFileMgr::FileGroup::Close() {
   tmp_files_.clear();
 }
 
-Status TmpFileMgr::FileGroup::AllocateSpace(
-    int64_t num_bytes, File** tmp_file, int64_t* file_offset) {
+Status TmpFileGroup::AllocateSpace(
+    int64_t num_bytes, TmpFile** tmp_file, int64_t* file_offset) {
   lock_guard<SpinLock> lock(lock_);
   int64_t scratch_range_bytes = max<int64_t>(1L, BitUtil::RoundUpToPowerOfTwo(num_bytes));
   int free_ranges_idx = BitUtil::Log2Ceiling64(scratch_range_bytes);
@@ -399,7 +403,7 @@ Status TmpFileMgr::FileGroup::AllocateSpace(
   return ScratchAllocationFailedStatus(at_capacity_dirs);
 }
 
-void TmpFileMgr::FileGroup::RecycleFileRange(unique_ptr<WriteHandle> handle) {
+void TmpFileGroup::RecycleFileRange(unique_ptr<TmpWriteHandle> handle) {
   int64_t scratch_range_bytes =
       max<int64_t>(1L, BitUtil::RoundUpToPowerOfTwo(handle->len()));
   int free_ranges_idx = BitUtil::Log2Ceiling64(scratch_range_bytes);
@@ -408,16 +412,16 @@ void TmpFileMgr::FileGroup::RecycleFileRange(unique_ptr<WriteHandle> handle) {
       handle->file_, handle->write_range_->offset());
 }
 
-Status TmpFileMgr::FileGroup::Write(
-    MemRange buffer, WriteDoneCallback cb, unique_ptr<TmpFileMgr::WriteHandle>* handle) {
+Status TmpFileGroup::Write(MemRange buffer, WriteDoneCallback cb,
+    unique_ptr<TmpWriteHandle>* handle) {
   DCHECK_GE(buffer.len(), 0);
 
-  File* tmp_file;
+  TmpFile* tmp_file;
   int64_t file_offset;
   RETURN_IF_ERROR(AllocateSpace(buffer.len(), &tmp_file, &file_offset));
 
-  unique_ptr<WriteHandle> tmp_handle(new WriteHandle(encryption_timer_, cb));
-  WriteHandle* tmp_handle_ptr = tmp_handle.get(); // Pass ptr by value into lambda.
+  unique_ptr<TmpWriteHandle> tmp_handle(new TmpWriteHandle(encryption_timer_, cb));
+  TmpWriteHandle* tmp_handle_ptr = tmp_handle.get(); // Pass ptr by value into lambda.
   WriteRange::WriteDoneCallback callback = [this, tmp_handle_ptr](
       const Status& write_status) { WriteComplete(tmp_handle_ptr, write_status); };
   RETURN_IF_ERROR(
@@ -428,12 +432,12 @@ Status TmpFileMgr::FileGroup::Write(
   return Status::OK();
 }
 
-Status TmpFileMgr::FileGroup::Read(WriteHandle* handle, MemRange buffer) {
+Status TmpFileGroup::Read(TmpWriteHandle* handle, MemRange buffer) {
   RETURN_IF_ERROR(ReadAsync(handle, buffer));
   return WaitForAsyncRead(handle, buffer);
 }
 
-Status TmpFileMgr::FileGroup::ReadAsync(WriteHandle* handle, MemRange buffer) {
+Status TmpFileGroup::ReadAsync(TmpWriteHandle* handle, MemRange buffer) {
   DCHECK(handle->write_range_ != nullptr);
   DCHECK(!handle->is_cancelled_);
   DCHECK_EQ(buffer.len(), handle->len());
@@ -460,7 +464,7 @@ Status TmpFileMgr::FileGroup::ReadAsync(WriteHandle* handle, MemRange buffer) {
   return Status::OK();
 }
 
-Status TmpFileMgr::FileGroup::WaitForAsyncRead(WriteHandle* handle, MemRange buffer) {
+Status TmpFileGroup::WaitForAsyncRead(TmpWriteHandle* handle, MemRange buffer) {
   DCHECK(handle->read_range_ != nullptr);
   // Don't grab handle->write_state_lock_, it is safe to touch all of handle's state
   // since the write is not in flight.
@@ -491,8 +495,8 @@ exit:
   return status;
 }
 
-Status TmpFileMgr::FileGroup::RestoreData(
-    unique_ptr<WriteHandle> handle, MemRange buffer) {
+Status TmpFileGroup::RestoreData(
+    unique_ptr<TmpWriteHandle> handle, MemRange buffer) {
   DCHECK_EQ(handle->write_range_->data(), buffer.data());
   DCHECK_EQ(handle->len(), buffer.len());
   DCHECK(!handle->write_in_flight_);
@@ -507,14 +511,14 @@ Status TmpFileMgr::FileGroup::RestoreData(
   return status;
 }
 
-void TmpFileMgr::FileGroup::DestroyWriteHandle(unique_ptr<WriteHandle> handle) {
+void TmpFileGroup::DestroyWriteHandle(unique_ptr<TmpWriteHandle> handle) {
   handle->Cancel();
   handle->WaitForWrite();
   RecycleFileRange(move(handle));
 }
 
-void TmpFileMgr::FileGroup::WriteComplete(
-    WriteHandle* handle, const Status& write_status) {
+void TmpFileGroup::WriteComplete(
+    TmpWriteHandle* handle, const Status& write_status) {
   Status status;
   if (!write_status.ok()) {
     status = RecoverWriteError(handle, write_status);
@@ -525,8 +529,8 @@ void TmpFileMgr::FileGroup::WriteComplete(
   handle->WriteComplete(status);
 }
 
-Status TmpFileMgr::FileGroup::RecoverWriteError(
-    WriteHandle* handle, const Status& write_status) {
+Status TmpFileGroup::RecoverWriteError(
+    TmpWriteHandle* handle, const Status& write_status) {
   DCHECK(!write_status.ok());
   DCHECK(handle->file_ != nullptr);
 
@@ -545,7 +549,7 @@ Status TmpFileMgr::FileGroup::RecoverWriteError(
   // Do not retry cancelled writes or propagate the error, simply return CANCELLED.
   if (handle->is_cancelled_) return Status::CancelledInternal("TmpFileMgr write");
 
-  TmpFileMgr::File* tmp_file;
+  TmpFile* tmp_file;
   int64_t file_offset;
   // Discard the scratch file range - we will not reuse ranges from a bad file.
   // Choose another file to try. Blacklisting ensures we don't retry the same file.
@@ -554,7 +558,7 @@ Status TmpFileMgr::FileGroup::RecoverWriteError(
   return handle->RetryWrite(io_ctx_.get(), tmp_file, file_offset);
 }
 
-Status TmpFileMgr::FileGroup::ScratchAllocationFailedStatus(
+Status TmpFileGroup::ScratchAllocationFailedStatus(
     const vector<int>& at_capacity_dirs) {
   vector<string> tmp_dir_paths;
   for (TmpDir& tmp_dir : tmp_file_mgr_->tmp_dirs_) {
@@ -575,10 +579,10 @@ Status TmpFileMgr::FileGroup::ScratchAllocationFailedStatus(
   return status;
 }
 
-string TmpFileMgr::FileGroup::DebugString() {
+string TmpFileGroup::DebugString() {
   lock_guard<SpinLock> lock(lock_);
   stringstream ss;
-  ss << "FileGroup " << this << " bytes limit " << bytes_limit_
+  ss << "TmpFileGroup " << this << " bytes limit " << bytes_limit_
      << " current bytes allocated " << current_bytes_allocated_
      << " next allocation index " << next_allocation_index_ << " writes "
      << write_counter_->value() << " bytes written " << bytes_written_counter_->value()
@@ -588,13 +592,13 @@ string TmpFileMgr::FileGroup::DebugString() {
      << disk_read_timer_->value() << " encryption timer " << encryption_timer_->value()
      << endl
      << "  " << tmp_files_.size() << " files:" << endl;
-  for (unique_ptr<File>& file : tmp_files_) {
+  for (unique_ptr<TmpFile>& file : tmp_files_) {
     ss << "    " << file->DebugString() << endl;
   }
   return ss.str();
 }
 
-TmpFileMgr::WriteHandle::WriteHandle(
+TmpWriteHandle::TmpWriteHandle(
     RuntimeProfile::Counter* encryption_timer, WriteDoneCallback cb)
   : cb_(cb),
     encryption_timer_(encryption_timer),
@@ -603,22 +607,22 @@ TmpFileMgr::WriteHandle::WriteHandle(
     is_cancelled_(false),
     write_in_flight_(false) {}
 
-TmpFileMgr::WriteHandle::~WriteHandle() {
+TmpWriteHandle::~TmpWriteHandle() {
   DCHECK(!write_in_flight_);
   DCHECK(read_range_ == nullptr);
 }
 
-string TmpFileMgr::WriteHandle::TmpFilePath() const {
+string TmpWriteHandle::TmpFilePath() const {
   if (file_ == nullptr) return "";
   return file_->path();
 }
 
-int64_t TmpFileMgr::WriteHandle::len() const {
+int64_t TmpWriteHandle::len() const {
   return write_range_->len();
 }
 
-Status TmpFileMgr::WriteHandle::Write(RequestContext* io_ctx,
-    File* file, int64_t offset, MemRange buffer,
+Status TmpWriteHandle::Write(RequestContext* io_ctx,
+    TmpFile* file, int64_t offset, MemRange buffer,
     WriteRange::WriteDoneCallback callback) {
   DCHECK(!write_in_flight_);
 
@@ -635,7 +639,7 @@ Status TmpFileMgr::WriteHandle::Write(RequestContext* io_ctx,
   if (!status.ok()) {
     // The write will not be in flight if we returned with an error.
     write_in_flight_ = false;
-    // We won't return this WriteHandle to the client of FileGroup, so it won't be
+    // We won't return this TmpWriteHandle to the client of TmpFileGroup, so it won't be
     // cancelled in the normal way. Mark the handle as cancelled so it can be
     // cleanly destroyed.
     is_cancelled_ = true;
@@ -644,8 +648,8 @@ Status TmpFileMgr::WriteHandle::Write(RequestContext* io_ctx,
   return Status::OK();
 }
 
-Status TmpFileMgr::WriteHandle::RetryWrite(
-    RequestContext* io_ctx, File* file, int64_t offset) {
+Status TmpWriteHandle::RetryWrite(
+    RequestContext* io_ctx, TmpFile* file, int64_t offset) {
   DCHECK(write_in_flight_);
   file_ = file;
   write_range_->SetRange(file->path(), offset, file->AssignDiskQueue());
@@ -658,7 +662,7 @@ Status TmpFileMgr::WriteHandle::RetryWrite(
   return Status::OK();
 }
 
-void TmpFileMgr::WriteHandle::WriteComplete(const Status& write_status) {
+void TmpWriteHandle::WriteComplete(const Status& write_status) {
   WriteDoneCallback cb;
   {
     lock_guard<mutex> lock(write_state_lock_);
@@ -677,7 +681,7 @@ void TmpFileMgr::WriteHandle::WriteComplete(const Status& write_status) {
   cb(write_status);
 }
 
-void TmpFileMgr::WriteHandle::Cancel() {
+void TmpWriteHandle::Cancel() {
   CancelRead();
   {
     unique_lock<mutex> lock(write_state_lock_);
@@ -687,19 +691,19 @@ void TmpFileMgr::WriteHandle::Cancel() {
   }
 }
 
-void TmpFileMgr::WriteHandle::CancelRead() {
+void TmpWriteHandle::CancelRead() {
   if (read_range_ != nullptr) {
     read_range_->Cancel(Status::CancelledInternal("TmpFileMgr read"));
     read_range_ = nullptr;
   }
 }
 
-void TmpFileMgr::WriteHandle::WaitForWrite() {
+void TmpWriteHandle::WaitForWrite() {
   unique_lock<mutex> lock(write_state_lock_);
   while (write_in_flight_) write_complete_cv_.Wait(lock);
 }
 
-Status TmpFileMgr::WriteHandle::EncryptAndHash(MemRange buffer) {
+Status TmpWriteHandle::EncryptAndHash(MemRange buffer) {
   DCHECK(FLAGS_disk_spill_encryption);
   SCOPED_TIMER(encryption_timer_);
   // Since we're using GCM/CTR/CFB mode, we must take care not to reuse a
@@ -713,7 +717,7 @@ Status TmpFileMgr::WriteHandle::EncryptAndHash(MemRange buffer) {
   return Status::OK();
 }
 
-Status TmpFileMgr::WriteHandle::CheckHashAndDecrypt(MemRange buffer) {
+Status TmpWriteHandle::CheckHashAndDecrypt(MemRange buffer) {
   DCHECK(FLAGS_disk_spill_encryption);
   DCHECK(write_range_ != nullptr);
   SCOPED_TIMER(encryption_timer_);
@@ -737,7 +741,7 @@ Status TmpFileMgr::WriteHandle::CheckHashAndDecrypt(MemRange buffer) {
   return Status::OK();
 }
 
-string TmpFileMgr::WriteHandle::DebugString() {
+string TmpWriteHandle::DebugString() {
   unique_lock<mutex> lock(write_state_lock_);
   stringstream ss;
   ss << "Write handle " << this << " file '" << file_->path() << "'"
diff --git a/be/src/runtime/tmp-file-mgr.h b/be/src/runtime/tmp-file-mgr.h
index 2599a07..6e72304 100644
--- a/be/src/runtime/tmp-file-mgr.h
+++ b/be/src/runtime/tmp-file-mgr.h
@@ -41,6 +41,9 @@ namespace io {
   class ScanRange;
   class WriteRange;
 }
+class TmpFile;
+class TmpFileGroup;
+class TmpWriteHandle;
 
 /// TmpFileMgr provides an abstraction for management of temporary (a.k.a. scratch) files
 /// on the filesystem and I/O to and from them. TmpFileMgr manages multiple scratch
@@ -50,39 +53,36 @@ namespace io {
 /// TmpFileMgr encrypts data written to disk if enabled by the --disk_spill_encryption
 /// command-line flag.
 ///
-/// FileGroups manage scratch space across multiple devices. To write to scratch space,
-/// first a FileGroup is created, then FileGroup::Write() is called to asynchronously
-/// write a memory buffer to one of the scratch files. FileGroup::Write() returns a
-/// WriteHandle, which is used by the caller to identify that write operation. The
-/// caller is notified when the asynchronous write completes via a callback, after which
-/// the caller can use the WriteHandle to read back the data.
+/// TmpFileGroups manage scratch space across multiple devices. To write to scratch space,
+/// first a TmpFileGroup is created, then TmpFileGroup::Write() is called to
+/// asynchronously write a memory buffer to one of the scratch files.
+/// TmpFileGroup::Write() returns a TmpWriteHandle, which identifies that write operation.
+/// The caller is notified when the asynchronous write completes via a callback, after
+/// which the caller can use the TmpWriteHandle to read back the data.
 ///
-/// Each WriteHandle is backed by a range of data in a scratch file. The first call to
-/// Write() will create files for the FileGroup with unique filenames on the configured
+/// Each TmpWriteHandle is backed by a range of data in a scratch file. The first call to
+/// Write() will create files for the TmpFileGroup with unique filenames on the configured
 /// temporary devices. At most one directory per device is used (unless overridden for
-/// testing). The file range of a WriteHandle can be replaced with a different one if
+/// testing). The file range of a TmpWriteHandle can be replaced with a different one if
 /// a write error is encountered and the data instead needs to be written to a different
 /// disk.
 ///
 /// Free Space Management:
-/// Free space is managed within a FileGroup: once a WriteHandle is destroyed, the file
-/// range backing it can be recycled for a different WriteHandle. Scratch file ranges
-/// are grouped into size classes, each for a power-of-two number of bytes. Free file
-/// ranges of each size class are managed separately (i.e. there is no splitting or
+/// Free space is managed within a TmpFileGroup: once a TmpWriteHandle is destroyed, the
+/// file range backing it can be recycled for a different TmpWriteHandle. Scratch file
+/// ranges are grouped into size classes, each for a power-of-two number of bytes. Free
+/// file ranges of each size class are managed separately (i.e. there is no splitting or
 /// coalescing of ranges).
 ///
 /// Resource Management:
 /// TmpFileMgr provides some basic support for managing local disk space consumption.
-/// A FileGroup can be created with a limit on the total number of bytes allocated across
-/// all files. Writes that would exceed the limit fail with an error status.
+/// A TmpFileGroup can be created with a limit on the total number of bytes allocated
+/// across all files. Writes that would exceed the limit fail with an error status.
 ///
 /// TODO: IMPALA-4683: we could implement smarter handling of failures, e.g. to
 /// temporarily blacklist devices that show I/O errors.
 class TmpFileMgr {
  public:
-  class File; // Needs to be public for TmpFileMgrTest.
-  class WriteHandle;
-
   /// DeviceId is an internal unique identifier for a temporary device managed by
   /// TmpFileMgr. DeviceIds in the range [0, num tmp devices) are allocated arbitrarily.
   /// Needs to be public for TmpFileMgrTest.
@@ -107,347 +107,351 @@ class TmpFileMgr {
     IntGauge* const bytes_used_metric;
   };
 
-  /// Represents a group of temporary files - one per disk with a scratch directory. The
-  /// total allocated bytes of the group can be bound by setting the space allocation
-  /// limit. The owner of the FileGroup object is responsible for calling the Close()
-  /// method to delete all the files in the group.
-  ///
-  /// Public methods of FileGroup and WriteHandle are safe to call concurrently from
-  /// multiple threads as long as different WriteHandle arguments are provided.
-  class FileGroup {
-   public:
-    /// Initialize a new file group, which will create files using 'tmp_file_mgr'
-    /// and perform I/O using 'io_mgr'. Adds counters to 'profile' to track scratch
-    /// space used. 'unique_id' is a unique ID that is used to prefix any scratch file
-    /// names. It is an error to create multiple FileGroups with the same 'unique_id'.
-    /// 'bytes_limit' is the limit on the total file space to allocate.
-    FileGroup(TmpFileMgr* tmp_file_mgr, io::DiskIoMgr* io_mgr, RuntimeProfile* profile,
-        const TUniqueId& unique_id, int64_t bytes_limit = -1);
-
-    ~FileGroup();
-
-    /// Asynchronously writes 'buffer' to a temporary file of this file group. If there
-    /// are multiple scratch files, this can write to any of them, and will attempt to
-    /// recover from I/O errors on one file by writing to a different file. The memory
-    /// referenced by 'buffer' must remain valid until the write completes. The callee
-    /// may rewrite the data in 'buffer' in-place (e.g. to do in-place encryption or
-    /// compression). The caller should not modify the data in 'buffer' until the write
-    /// completes or is cancelled, otherwise invalid data may be written to disk.
-    ///
-    /// Returns an error if the scratch space cannot be allocated or the write cannot
-    /// be started. Otherwise 'handle' is set and 'cb' will be called asynchronously from
-    /// a different thread when the write completes successfully or unsuccessfully or is
-    /// cancelled.
-    ///
-    /// 'handle' must be destroyed by passing the DestroyWriteHandle() or RestoreData().
-    Status Write(MemRange buffer, WriteDoneCallback cb,
-        std::unique_ptr<WriteHandle>* handle) WARN_UNUSED_RESULT;
-
-    /// Synchronously read the data referenced by 'handle' from the temporary file into
-    /// 'buffer'. buffer.len() must be the same as handle->len(). Can only be called
-    /// after a write successfully completes. Should not be called while an async read
-    /// is in flight. Equivalent to calling ReadAsync() then WaitForAsyncRead().
-    Status Read(WriteHandle* handle, MemRange buffer) WARN_UNUSED_RESULT;
-
-    /// Asynchronously read the data referenced by 'handle' from the temporary file into
-    /// 'buffer'. buffer.len() must be the same as handle->len(). Can only be called
-    /// after a write successfully completes. WaitForAsyncRead() must be called before the
-    /// data in the buffer is valid. Should not be called while an async read
-    /// is already in flight.
-    Status ReadAsync(WriteHandle* handle, MemRange buffer) WARN_UNUSED_RESULT;
-
-    /// Wait until the read started for 'handle' by ReadAsync() completes. 'buffer'
-    /// should be the same buffer passed into ReadAsync(). Returns an error if the
-    /// read fails. Retrying a failed read by calling ReadAsync() again is allowed.
-    Status WaitForAsyncRead(WriteHandle* handle, MemRange buffer) WARN_UNUSED_RESULT;
-
-    /// Restore the original data in the 'buffer' passed to Write(), decrypting or
-    /// decompressing as necessary. Returns an error if restoring the data fails.
-    /// The write must not be in-flight - the caller is responsible for waiting for
-    /// the write to complete.
-    Status RestoreData(
-        std::unique_ptr<WriteHandle> handle, MemRange buffer) WARN_UNUSED_RESULT;
-
-    /// Wait for the in-flight I/Os to complete and destroy resources associated with
-    /// 'handle'.
-    void DestroyWriteHandle(std::unique_ptr<WriteHandle> handle);
-
-    /// Calls Remove() on all the files in the group and deletes them.
-    void Close();
-
-    std::string DebugString();
-
-    const TUniqueId& unique_id() const { return unique_id_; }
-
-    TmpFileMgr* tmp_file_mgr() const { return tmp_file_mgr_; }
-
-   private:
-    friend class File;
-    friend class TmpFileMgrTest;
-
-    /// Initializes the file group with one temporary file per disk with a scratch
-    /// directory. Returns OK if at least one temporary file could be created.
-    /// Returns an error if no temporary files were successfully created. Must only be
-    /// called once. Must be called with 'lock_' held.
-    Status CreateFiles() WARN_UNUSED_RESULT;
-
-    /// Allocate 'num_bytes' bytes in a temporary file. Try multiple disks if error
-    /// occurs. Returns an error only if no temporary files are usable or the scratch
-    /// limit is exceeded. Must be called without 'lock_' held.
-    Status AllocateSpace(
-        int64_t num_bytes, File** tmp_file, int64_t* file_offset) WARN_UNUSED_RESULT;
-
-    /// Add the scratch range from 'handle' to 'free_ranges_' and destroy handle. Must be
-    /// called without 'lock_' held.
-    void RecycleFileRange(std::unique_ptr<WriteHandle> handle);
-
-    /// Called when the DiskIoMgr write completes for 'handle'. On error, will attempt
-    /// to retry the write. On success or if the write can't be retried, calls
-    /// handle->WriteComplete().
-    void WriteComplete(WriteHandle* handle, const Status& write_status);
-
-    /// Handles a write error. Logs the write error and blacklists the device for this
-    /// file group if the cause was an I/O error. Blacklisting limits the number of times
-    /// a write is retried because each device will only be tried once. Returns OK if it
-    /// successfully reissued the write. Returns an error status if the original error
-    /// was unrecoverable or an unrecoverable error is encountered when reissuing the
-    /// write. The error status will include all previous I/O errors in its details.
-    Status RecoverWriteError(
-        WriteHandle* handle, const Status& write_status) WARN_UNUSED_RESULT;
-
-    /// Return a SCRATCH_ALLOCATION_FAILED error with the appropriate information,
-    /// including scratch directories, the amount of scratch allocated and previous
-    /// errors that caused this failure. If some directories were at capacity,
-    /// but had not encountered an error, the indices of these directories in
-    /// tmp_file_mgr_->tmp_dir_ should be included in 'at_capacity_dirs'.
-    /// 'lock_' must be held by caller.
-    Status ScratchAllocationFailedStatus(const std::vector<int>& at_capacity_dirs);
-
-    /// The TmpFileMgr it is associated with.
-    TmpFileMgr* const tmp_file_mgr_;
-
-    /// DiskIoMgr used for all I/O to temporary files.
-    io::DiskIoMgr* const io_mgr_;
-
-    /// I/O context used for all reads and writes. Registered in constructor.
-    std::unique_ptr<io::RequestContext> io_ctx_;
-
-    /// Stores scan ranges allocated in Read(). Needed because ScanRange objects may be
-    /// touched by DiskIoMgr even after the scan is finished.
-    /// TODO: IMPALA-4249: remove once lifetime of ScanRange objects is better defined.
-    ObjectPool scan_range_pool_;
-
-    /// Unique across all FileGroups. Used to prefix file names.
-    const TUniqueId unique_id_;
-
-    /// Max write space allowed (-1 means no limit).
-    const int64_t bytes_limit_;
-
-    /// Number of write operations (includes writes started but not yet complete).
-    RuntimeProfile::Counter* const write_counter_;
-
-    /// Number of bytes written to disk (includes writes started but not yet complete).
-    RuntimeProfile::Counter* const bytes_written_counter_;
+  TmpFileMgr();
 
-    /// Number of read operations (includes reads started but not yet complete).
-    RuntimeProfile::Counter* const read_counter_;
-
-    /// Number of bytes read from disk (includes reads started but not yet complete).
-    RuntimeProfile::Counter* const bytes_read_counter_;
+  /// Creates the configured tmp directories. If multiple directories are specified per
+  /// disk, only one is created and used. Must be called after DiskInfo::Init().
+  Status Init(MetricGroup* metrics) WARN_UNUSED_RESULT;
 
-    /// Amount of scratch space allocated in bytes.
-    RuntimeProfile::Counter* const scratch_space_bytes_used_counter_;
+  /// Custom initialization - initializes with the provided list of directories.
+  /// If one_dir_per_device is true, only use one temporary directory per device.
+  /// This interface is intended for testing purposes. 'tmp_dir_specifiers'
+  /// use the command-line syntax, i.e. <path>[:<limit>]. The first variant takes
+  /// a comma-separated list, the second takes a vector.
+  Status InitCustom(const std::string& tmp_dirs_spec, bool one_dir_per_device,
+      MetricGroup* metrics) WARN_UNUSED_RESULT;
+  Status InitCustom(const std::vector<std::string>& tmp_dir_specifiers,
+      bool one_dir_per_device, MetricGroup* metrics) WARN_UNUSED_RESULT;
+
+  /// Return the scratch directory path for the device.
+  std::string GetTmpDirPath(DeviceId device_id) const;
 
-    /// Time spent waiting for disk reads.
-    RuntimeProfile::Counter* const disk_read_timer_;
+  /// Total number of devices with tmp directories that are active. There is one tmp
+  /// directory per device.
+  int NumActiveTmpDevices();
 
-    /// Time spent in disk spill encryption, decryption, and integrity checking.
-    RuntimeProfile::Counter* encryption_timer_;
+  /// Return vector with device ids of all tmp devices being actively used.
+  /// I.e. those that haven't been blacklisted.
+  std::vector<DeviceId> ActiveTmpDevices();
+
+ private:
+  friend class TmpFileMgrTest;
+  friend class TmpFile;
+  friend class TmpFileGroup;
 
-    /// Protects below members.
-    SpinLock lock_;
+  /// Return a new TmpFile handle with a path based on file_group->unique_id. The file is
+  /// associated with the 'file_group' and the file path is within the (single) scratch
+  /// directory on the specified device id. The caller owns the returned handle and is
+  /// responsible for deleting it. The file is not created - creation is deferred until
+  /// the file is written.
+  void NewFile(TmpFileGroup* file_group, DeviceId device_id,
+    std::unique_ptr<TmpFile>* new_file);
 
-    /// List of files representing the FileGroup.
-    std::vector<std::unique_ptr<File>> tmp_files_;
+  bool initialized_;
 
-    /// Total space allocated in this group's files.
-    int64_t current_bytes_allocated_;
+  /// The paths of the created tmp directories.
+  std::vector<TmpDir> tmp_dirs_;
 
-    /// Index into 'tmp_files' denoting the file to which the next temporary file range
-    /// should be allocated from. Used to implement round-robin allocation from temporary
-    /// files.
-    int next_allocation_index_;
+  /// Metrics to track active scratch directories.
+  IntGauge* num_active_scratch_dirs_metric_;
+  SetMetric<std::string>* active_scratch_dirs_metric_;
 
-    /// Each vector in free_ranges_[i] is a vector of File/offset pairs for free scratch
-    /// ranges of length 2^i bytes. Has 64 entries so that every int64_t length has a
-    /// valid list associated with it.
-    std::vector<std::vector<std::pair<File*, int64_t>>> free_ranges_;
-
-    /// Errors encountered when creating/writing scratch files. We store the history so
-    /// that we can report the original cause of the scratch errors if we run out of
-    /// devices to write to.
-    std::vector<Status> scratch_errors_;
-  };
+  /// Metrics to track the scratch space HWM.
+  AtomicHighWaterMarkGauge* scratch_bytes_used_metric_;
+};
 
-  /// A handle to a write operation, backed by a range of a temporary file. The operation
-  /// is either in-flight or has completed. If it completed with no error and wasn't
-  /// cancelled then the data is in the file and can be read back.
+/// Represents a group of temporary files - one per disk with a scratch directory. The
+/// total allocated bytes of the group can be bound by setting the space allocation
+/// limit. The owner of the TmpFileGroup object is responsible for calling the Close()
+/// method to delete all the files in the group.
+///
+/// Public methods of TmpFileGroup and TmpWriteHandle are safe to call concurrently from
+/// multiple threads as long as different TmpWriteHandle arguments are provided.
+class TmpFileGroup {
+ public:
+  /// Initialize a new file group, which will create files using 'tmp_file_mgr'
+  /// and perform I/O using 'io_mgr'. Adds counters to 'profile' to track scratch
+  /// space used. 'unique_id' is a unique ID that is used to prefix any scratch file
+  /// names. It is an error to create multiple TmpFileGroups with the same 'unique_id'.
+  /// 'bytes_limit' is the limit on the total file space to allocate.
+  TmpFileGroup(TmpFileMgr* tmp_file_mgr, io::DiskIoMgr* io_mgr, RuntimeProfile* profile,
+      const TUniqueId& unique_id, int64_t bytes_limit = -1);
+
+  ~TmpFileGroup();
+
+  /// Asynchronously writes 'buffer' to a temporary file of this file group. If there
+  /// are multiple scratch files, this can write to any of them, and will attempt to
+  /// recover from I/O errors on one file by writing to a different file. The memory
+  /// referenced by 'buffer' must remain valid until the write completes. The callee
+  /// may rewrite the data in 'buffer' in-place (e.g. to do in-place encryption or
+  /// compression). The caller should not modify the data in 'buffer' until the write
+  /// completes or is cancelled, otherwise invalid data may be written to disk.
   ///
-  /// WriteHandle is returned from FileGroup::Write(). After the write completes, the
-  /// handle can be passed to FileGroup::Read() to read back the data zero or more times.
-  /// FileGroup::DestroyWriteHandle() can be called at any time to destroy the handle and
-  /// allow reuse of the scratch file range written to. Alternatively,
-  /// FileGroup::RestoreData() can be called to reverse the effects of FileGroup::Write()
-  /// by destroying the handle and restoring the original data to the buffer, so long as
-  /// the data in the buffer was not modified by the caller.
+  /// Returns an error if the scratch space cannot be allocated or the write cannot
+  /// be started. Otherwise 'handle' is set and 'cb' will be called asynchronously from
+  /// a different thread when the write completes successfully or unsuccessfully or is
+  /// cancelled.
   ///
-  /// Public methods of WriteHandle are safe to call concurrently from multiple threads.
-  class WriteHandle {
-   public:
-    /// The write must be destroyed by passing it to FileGroup - destroying it before
-    /// the write completes is an error.
-    ~WriteHandle();
+  /// 'handle' must be destroyed by passing it to DestroyWriteHandle() or RestoreData().
+  Status Write(MemRange buffer, TmpFileMgr::WriteDoneCallback cb,
+      std::unique_ptr<TmpWriteHandle>* handle) WARN_UNUSED_RESULT;
 
-    /// Cancel any in-flight read synchronously.
-    void CancelRead();
+  /// Synchronously read the data referenced by 'handle' from the temporary file into
+  /// 'buffer'. buffer.len() must be the same as handle->len(). Can only be called
+  /// after a write successfully completes. Should not be called while an async read
+  /// is in flight. Equivalent to calling ReadAsync() then WaitForAsyncRead().
+  Status Read(TmpWriteHandle* handle, MemRange buffer) WARN_UNUSED_RESULT;
 
-    /// Path of temporary file backing the block. Intended for use in testing.
-    /// Returns empty string if no backing file allocated.
-    std::string TmpFilePath() const;
+  /// Asynchronously read the data referenced by 'handle' from the temporary file into
+  /// 'buffer'. buffer.len() must be the same as handle->len(). Can only be called
+  /// after a write successfully completes. WaitForAsyncRead() must be called before the
+  /// data in the buffer is valid. Should not be called while an async read
+  /// is already in flight.
+  Status ReadAsync(TmpWriteHandle* handle, MemRange buffer) WARN_UNUSED_RESULT;
 
-    /// The length of the write range in bytes.
-    int64_t len() const;
+  /// Wait until the read started for 'handle' by ReadAsync() completes. 'buffer'
+  /// should be the same buffer passed into ReadAsync(). Returns an error if the
+  /// read fails. Retrying a failed read by calling ReadAsync() again is allowed.
+  Status WaitForAsyncRead(TmpWriteHandle* handle, MemRange buffer) WARN_UNUSED_RESULT;
 
-    std::string DebugString();
+  /// Restore the original data in the 'buffer' passed to Write(), decrypting or
+  /// decompressing as necessary. Returns an error if restoring the data fails.
+  /// The write must not be in-flight - the caller is responsible for waiting for
+  /// the write to complete.
+  Status RestoreData(
+      std::unique_ptr<TmpWriteHandle> handle, MemRange buffer) WARN_UNUSED_RESULT;
 
-   private:
-    friend class FileGroup;
-    friend class TmpFileMgrTest;
+  /// Wait for the in-flight I/Os to complete and destroy resources associated with
+  /// 'handle'.
+  void DestroyWriteHandle(std::unique_ptr<TmpWriteHandle> handle);
 
-    WriteHandle(RuntimeProfile::Counter* encryption_timer, WriteDoneCallback cb);
+  /// Calls Remove() on all the files in the group and deletes them.
+  void Close();
 
-    /// Starts a write of 'buffer' to 'offset' of 'file'. 'write_in_flight_' must be false
-    /// before calling. After returning, 'write_in_flight_' is true on success or false on
-    /// failure and 'is_cancelled_' is set to true on failure.
-    Status Write(io::RequestContext* io_ctx, File* file,
-        int64_t offset, MemRange buffer,
-        WriteDoneCallback callback) WARN_UNUSED_RESULT;
+  std::string DebugString();
 
-    /// Retry the write after the initial write failed with an error, instead writing to
-    /// 'offset' of 'file'. 'write_in_flight_' must be true before calling.
-    /// After returning, 'write_in_flight_' is true on success or false on failure.
-    Status RetryWrite(io::RequestContext* io_ctx, File* file,
-        int64_t offset) WARN_UNUSED_RESULT;
+  const TUniqueId& unique_id() const { return unique_id_; }
 
-    /// Called when the write has completed successfully or not. Sets 'write_in_flight_'
-    /// then calls 'cb_'.
-    void WriteComplete(const Status& write_status);
+  TmpFileMgr* tmp_file_mgr() const { return tmp_file_mgr_; }
 
-    /// Cancels any in-flight writes or reads. Reads are cancelled synchronously and
-    /// writes are cancelled asynchronously. After Cancel() is called, writes are not
-    /// retried. The write callback may be called with a CANCELLED_INTERNALLY status
-    /// (unless it succeeded or encountered a different error first).
-    void Cancel();
+ private:
+  friend class TmpFile;
+  friend class TmpFileMgrTest;
 
-    /// Blocks until the write completes either successfully or unsuccessfully.
-    /// May return before the write callback has been called.
-    void WaitForWrite();
+  /// Initializes the file group with one temporary file per disk with a scratch
+  /// directory. Returns OK if at least one temporary file could be created.
+  /// Returns an error if no temporary files were successfully created. Must only be
+  /// called once. Must be called with 'lock_' held.
+  Status CreateFiles() WARN_UNUSED_RESULT;
 
-    /// Encrypts the data in 'buffer' in-place and computes 'hash_'.
-    Status EncryptAndHash(MemRange buffer) WARN_UNUSED_RESULT;
+  /// Allocate 'num_bytes' bytes in a temporary file. Try multiple disks if error
+  /// occurs. Returns an error only if no temporary files are usable or the scratch
+  /// limit is exceeded. Must be called without 'lock_' held.
+  Status AllocateSpace(
+      int64_t num_bytes, TmpFile** tmp_file, int64_t* file_offset) WARN_UNUSED_RESULT;
 
-    /// Verifies the integrity hash and decrypts the contents of 'buffer' in place.
-    Status CheckHashAndDecrypt(MemRange buffer) WARN_UNUSED_RESULT;
+  /// Add the scratch range from 'handle' to 'free_ranges_' and destroy handle. Must be
+  /// called without 'lock_' held.
+  void RecycleFileRange(std::unique_ptr<TmpWriteHandle> handle);
 
-    /// Callback to be called when the write completes.
-    WriteDoneCallback cb_;
+  /// Called when the DiskIoMgr write completes for 'handle'. On error, will attempt
+  /// to retry the write. On success or if the write can't be retried, calls
+  /// handle->WriteComplete().
+  void WriteComplete(TmpWriteHandle* handle, const Status& write_status);
 
-    /// Reference to the FileGroup's 'encryption_timer_'.
-    RuntimeProfile::Counter* encryption_timer_;
+  /// Handles a write error. Logs the write error and blacklists the device for this
+  /// file group if the cause was an I/O error. Blacklisting limits the number of times
+  /// a write is retried because each device will only be tried once. Returns OK if it
+  /// successfully reissued the write. Returns an error status if the original error
+  /// was unrecoverable or an unrecoverable error is encountered when reissuing the
+  /// write. The error status will include all previous I/O errors in its details.
+  Status RecoverWriteError(
+      TmpWriteHandle* handle, const Status& write_status) WARN_UNUSED_RESULT;
 
-    /// The DiskIoMgr write range for this write.
-    boost::scoped_ptr<io::WriteRange> write_range_;
+  /// Return a SCRATCH_ALLOCATION_FAILED error with the appropriate information,
+  /// including scratch directories, the amount of scratch allocated and previous
+  /// errors that caused this failure. If some directories were at capacity,
+  /// but had not encountered an error, the indices of these directories in
+  /// tmp_file_mgr_->tmp_dirs_ should be included in 'at_capacity_dirs'.
+  /// 'lock_' must be held by caller.
+  Status ScratchAllocationFailedStatus(const std::vector<int>& at_capacity_dirs);
 
-    /// The temporary file being written to.
-    File* file_;
+  /// The TmpFileMgr it is associated with.
+  TmpFileMgr* const tmp_file_mgr_;
 
-    /// If --disk_spill_encryption is on, a AES 256-bit key and initialization vector.
-    /// Regenerated for each write.
-    EncryptionKey key_;
+  /// DiskIoMgr used for all I/O to temporary files.
+  io::DiskIoMgr* const io_mgr_;
 
-    /// If --disk_spill_encryption is on, our hash of the data being written. Filled in
-    /// on writes; verified on reads. This is calculated _after_ encryption.
-    IntegrityHash hash_;
+  /// I/O context used for all reads and writes. Registered in constructor.
+  std::unique_ptr<io::RequestContext> io_ctx_;
 
-    /// The scan range for the read that is currently in flight. NULL when no read is in
-    /// flight.
-    io::ScanRange* read_range_;
+  /// Stores scan ranges allocated in Read(). Needed because ScanRange objects may be
+  /// touched by DiskIoMgr even after the scan is finished.
+  /// TODO: IMPALA-4249: remove once lifetime of ScanRange objects is better defined.
+  ObjectPool scan_range_pool_;
 
-    /// Protects all fields below while 'write_in_flight_' is true. At other times, it is
-    /// invalid to call WriteRange/FileGroup methods concurrently from multiple threads,
-    /// so no locking is required. This is a terminal lock and should not be held while
-    /// acquiring other locks or invoking 'cb_'.
-    std::mutex write_state_lock_;
+  /// Unique across all TmpFileGroups. Used to prefix file names.
+  const TUniqueId unique_id_;
 
-    /// True if the the write has been cancelled (but is not necessarily complete).
-    bool is_cancelled_;
+  /// Max write space allowed (-1 means no limit).
+  const int64_t bytes_limit_;
 
-    /// True if a write is in flight.
-    bool write_in_flight_;
+  /// Number of write operations (includes writes started but not yet complete).
+  RuntimeProfile::Counter* const write_counter_;
 
-    /// Signalled when the write completes and 'write_in_flight_' becomes false, before
-    /// 'cb_' is invoked.
-    ConditionVariable write_complete_cv_;
-  };
+  /// Number of bytes written to disk (includes writes started but not yet complete).
+  RuntimeProfile::Counter* const bytes_written_counter_;
 
-  TmpFileMgr();
+  /// Number of read operations (includes reads started but not yet complete).
+  RuntimeProfile::Counter* const read_counter_;
 
-  /// Creates the configured tmp directories. If multiple directories are specified per
-  /// disk, only one is created and used. Must be called after DiskInfo::Init().
-  Status Init(MetricGroup* metrics) WARN_UNUSED_RESULT;
+  /// Number of bytes read from disk (includes reads started but not yet complete).
+  RuntimeProfile::Counter* const bytes_read_counter_;
 
-  /// Custom initialization - initializes with the provided list of directories.
-  /// If one_dir_per_device is true, only use one temporary directory per device.
-  /// This interface is intended for testing purposes. 'tmp_dir_specifiers'
-  /// use the command-line syntax, i.e. <path>[:<limit>]. The first variant takes
-  /// a comma-separated list, the second takes a vector.
-  Status InitCustom(const std::string& tmp_dirs_spec, bool one_dir_per_device,
-      MetricGroup* metrics) WARN_UNUSED_RESULT;
-  Status InitCustom(const std::vector<std::string>& tmp_dir_specifiers,
-      bool one_dir_per_device, MetricGroup* metrics) WARN_UNUSED_RESULT;
+  /// Amount of scratch space allocated in bytes.
+  RuntimeProfile::Counter* const scratch_space_bytes_used_counter_;
 
-  /// Return the scratch directory path for the device.
-  std::string GetTmpDirPath(DeviceId device_id) const;
+  /// Time spent waiting for disk reads.
+  RuntimeProfile::Counter* const disk_read_timer_;
 
-  /// Total number of devices with tmp directories that are active. There is one tmp
-  /// directory per device.
-  int NumActiveTmpDevices();
+  /// Time spent in disk spill encryption, decryption, and integrity checking.
+  RuntimeProfile::Counter* encryption_timer_;
 
-  /// Return vector with device ids of all tmp devices being actively used.
-  /// I.e. those that haven't been blacklisted.
-  std::vector<DeviceId> ActiveTmpDevices();
+  /// Protects below members.
+  SpinLock lock_;
+
+  /// List of files representing the TmpFileGroup.
+  std::vector<std::unique_ptr<TmpFile>> tmp_files_;
+
+  /// Total space allocated in this group's files.
+  int64_t current_bytes_allocated_;
+
+  /// Index into 'tmp_files_' denoting the file from which the next temporary file
+  /// range should be allocated. Used to implement round-robin allocation from
+  /// temporary files.
+  int next_allocation_index_;
+
+  /// free_ranges_[i] is a vector of TmpFile/offset pairs for free scratch ranges of
+  /// length 2^i bytes. Has 64 entries so that every int64_t length has a valid list
+  /// associated with it.
+  std::vector<std::vector<std::pair<TmpFile*, int64_t>>> free_ranges_;
+
+  /// Errors encountered when creating/writing scratch files. We store the history so
+  /// that we can report the original cause of the scratch errors if we run out of
+  /// devices to write to.
+  std::vector<Status> scratch_errors_;
+};
+
+/// A handle to a write operation, backed by a range of a temporary file. The operation
+/// is either in-flight or has completed. If it completed with no error and wasn't
+/// cancelled then the data is in the file and can be read back.
+///
+/// TmpWriteHandle is returned from TmpFileGroup::Write(). After the write completes, the
+/// handle can be passed to TmpFileGroup::Read() to read back the data zero or more
+/// times. TmpFileGroup::DestroyWriteHandle() can be called at any time to destroy the
+/// handle and allow reuse of the scratch file range written to. Alternatively,
+/// TmpFileGroup::RestoreData() can be called to reverse the effects of
+/// TmpFileGroup::Write() by destroying the handle and restoring the original data to
+/// the buffer, so long as the data in the buffer was not modified by the caller.
+///
+/// Public methods of TmpWriteHandle are safe to call concurrently from multiple threads.
+class TmpWriteHandle {
+ public:
+  /// The write must be destroyed by passing it to TmpFileGroup - destroying it before
+  /// the write completes is an error.
+  ~TmpWriteHandle();
+
+  /// Cancel any in-flight read synchronously.
+  void CancelRead();
+
+  /// Path of temporary file backing the block. Intended for use in testing.
+  /// Returns empty string if no backing file allocated.
+  std::string TmpFilePath() const;
+
+  /// The length of the write range in bytes.
+  int64_t len() const;
+
+  std::string DebugString();
 
  private:
+  friend class TmpFileGroup;
   friend class TmpFileMgrTest;
 
-  /// Return a new File handle with a path based on file_group->unique_id. The file is
-  /// associated with the 'file_group' and the file path is within the (single) scratch
-  /// directory on the specified device id. The caller owns the returned handle and is
-  /// responsible for deleting it. The file is not created - creation is deferred until
-  /// the file is written.
-  void NewFile(FileGroup* file_group, DeviceId device_id,
-    std::unique_ptr<File>* new_file);
+  TmpWriteHandle(
+      RuntimeProfile::Counter* encryption_timer, TmpFileMgr::WriteDoneCallback cb);
 
-  bool initialized_;
+  /// Starts a write of 'buffer' to 'offset' of 'file'. 'write_in_flight_' must be false
+  /// before calling. After returning, 'write_in_flight_' is true on success or false on
+  /// failure and 'is_cancelled_' is set to true on failure.
+  Status Write(io::RequestContext* io_ctx, TmpFile* file,
+      int64_t offset, MemRange buffer,
+      TmpFileMgr::WriteDoneCallback callback) WARN_UNUSED_RESULT;
 
-  /// The paths of the created tmp directories.
-  std::vector<TmpDir> tmp_dirs_;
+  /// Retry the write after the initial write failed with an error, instead writing to
+  /// 'offset' of 'file'. 'write_in_flight_' must be true before calling.
+  /// After returning, 'write_in_flight_' is true on success or false on failure.
+  Status RetryWrite(io::RequestContext* io_ctx, TmpFile* file,
+      int64_t offset) WARN_UNUSED_RESULT;
 
-  /// Metrics to track active scratch directories.
-  IntGauge* num_active_scratch_dirs_metric_;
-  SetMetric<std::string>* active_scratch_dirs_metric_;
+  /// Called when the write has completed successfully or not. Sets 'write_in_flight_'
+  /// then calls 'cb_'.
+  void WriteComplete(const Status& write_status);
 
-  /// Metrics to track the scratch space HWM.
-  AtomicHighWaterMarkGauge* scratch_bytes_used_metric_;
+  /// Cancels any in-flight writes or reads. Reads are cancelled synchronously and
+  /// writes are cancelled asynchronously. After Cancel() is called, writes are not
+  /// retried. The write callback may be called with a CANCELLED_INTERNALLY status
+  /// (unless it succeeded or encountered a different error first).
+  void Cancel();
+
+  /// Blocks until the write completes either successfully or unsuccessfully.
+  /// May return before the write callback has been called.
+  void WaitForWrite();
+
+  /// Encrypts the data in 'buffer' in-place and computes 'hash_'.
+  Status EncryptAndHash(MemRange buffer) WARN_UNUSED_RESULT;
+
+  /// Verifies the integrity hash and decrypts the contents of 'buffer' in place.
+  Status CheckHashAndDecrypt(MemRange buffer) WARN_UNUSED_RESULT;
+
+  /// Callback to be called when the write completes.
+  TmpFileMgr::WriteDoneCallback cb_;
+
+  /// Reference to the TmpFileGroup's 'encryption_timer_'.
+  RuntimeProfile::Counter* encryption_timer_;
+
+  /// The DiskIoMgr write range for this write.
+  boost::scoped_ptr<io::WriteRange> write_range_;
+
+  /// The temporary file being written to.
+  TmpFile* file_;
+
+  /// If --disk_spill_encryption is on, an AES 256-bit key and initialization vector.
+  /// Regenerated for each write.
+  EncryptionKey key_;
+
+  /// If --disk_spill_encryption is on, our hash of the data being written. Filled in
+  /// on writes; verified on reads. This is calculated _after_ encryption.
+  IntegrityHash hash_;
+
+  /// The scan range for the read that is currently in flight. NULL when no read is in
+  /// flight.
+  io::ScanRange* read_range_;
+
+  /// Protects all fields below while 'write_in_flight_' is true. At other times, it is
+  /// invalid to call WriteRange/TmpFileGroup methods concurrently from multiple
+  /// threads,
+  /// so no locking is required. This is a terminal lock and should not be held while
+  /// acquiring other locks or invoking 'cb_'.
+  std::mutex write_state_lock_;
+
+  /// True if the write has been cancelled (but is not necessarily complete).
+  bool is_cancelled_;
+
+  /// True if a write is in flight.
+  bool write_in_flight_;
+
+  /// Signalled when the write completes and 'write_in_flight_' becomes false, before
+  /// 'cb_' is invoked.
+  ConditionVariable write_complete_cv_;
 };
 }
diff --git a/be/src/runtime/tuple.h b/be/src/runtime/tuple.h
index 1ead5e4..a7554d8 100644
--- a/be/src/runtime/tuple.h
+++ b/be/src/runtime/tuple.h
@@ -34,7 +34,10 @@ class Constant;
 namespace impala {
 
 struct CollectionValue;
+class RuntimeState;
 struct StringValue;
+class ScalarExpr;
+class ScalarExprEvaluator;
 class TupleDescriptor;
 class TupleRow;
 
diff --git a/be/src/service/client-request-state-map.cc b/be/src/service/client-request-state-map.cc
index 468e14a..aaf2d98 100644
--- a/be/src/service/client-request-state-map.cc
+++ b/be/src/service/client-request-state-map.cc
@@ -19,6 +19,7 @@
 
 #include "gutil/strings/substitute.h"
 #include "util/container-util.h"
+#include "util/debug-util.h"
 #include "util/uid-util.h"
 
 #include "common/names.h"
diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc
index c3c888b..27179a0 100644
--- a/be/src/service/client-request-state.cc
+++ b/be/src/service/client-request-state.cc
@@ -19,6 +19,7 @@
 
 #include <boost/algorithm/string/join.hpp>
 #include <boost/algorithm/string/predicate.hpp>
+#include <boost/algorithm/string/replace.hpp>
 #include <limits>
 #include <gutil/strings/substitute.h>
 #include <rapidjson/rapidjson.h>
@@ -26,6 +27,7 @@
 #include <rapidjson/writer.h>
 #include <rapidjson/error/en.h>
 
+#include "catalog/catalog-service-client-wrapper.h"
 #include "common/status.h"
 #include "exec/kudu-util.h"
 #include "kudu/rpc/rpc_controller.h"
diff --git a/be/src/service/client-request-state.h b/be/src/service/client-request-state.h
index 8d4c352..83badb4 100644
--- a/be/src/service/client-request-state.h
+++ b/be/src/service/client-request-state.h
@@ -21,7 +21,6 @@
 #include "common/object-pool.h"
 #include "common/status.h"
 #include "exec/catalog-op-executor.h"
-#include "runtime/timestamp-value.h"
 #include "scheduling/query-schedule.h"
 #include "service/child-query.h"
 #include "service/impala-server.h"
diff --git a/be/src/service/control-service.h b/be/src/service/control-service.h
index 1aa26b2..acd3ae0 100644
--- a/be/src/service/control-service.h
+++ b/be/src/service/control-service.h
@@ -22,7 +22,6 @@
 
 #include "kudu/rpc/rpc_context.h"
 #include "kudu/rpc/rpc_controller.h"
-#include "util/debug-util.h"
 
 #include "common/status.h"
 
diff --git a/be/src/service/impala-beeswax-server.cc b/be/src/service/impala-beeswax-server.cc
index 3c90cc2..827e58f 100644
--- a/be/src/service/impala-beeswax-server.cc
+++ b/be/src/service/impala-beeswax-server.cc
@@ -29,6 +29,7 @@
 #include "service/query-options.h"
 #include "service/query-result-set.h"
 #include "util/auth-util.h"
+#include "util/debug-util.h"
 #include "util/webserver.h"
 #include "util/runtime-profile.h"
 #include "util/runtime-profile-counters.h"
diff --git a/be/src/service/impala-http-handler.cc b/be/src/service/impala-http-handler.cc
index 3a5802b..3128708 100644
--- a/be/src/service/impala-http-handler.cc
+++ b/be/src/service/impala-http-handler.cc
@@ -41,6 +41,7 @@
 #include "service/impala-server.h"
 #include "thrift/protocol/TDebugProtocol.h"
 #include "util/coding-util.h"
+#include "util/debug-util.h"
 #include "util/logging-support.h"
 #include "util/pretty-printer.h"
 #include "util/redactor.h"
diff --git a/be/src/service/impala-internal-service.cc b/be/src/service/impala-internal-service.cc
index 3b7af8a..1db4bba 100644
--- a/be/src/service/impala-internal-service.cc
+++ b/be/src/service/impala-internal-service.cc
@@ -25,6 +25,7 @@
 #include "runtime/query-state.h"
 #include "runtime/fragment-instance-state.h"
 #include "runtime/exec-env.h"
+#include "util/debug-util.h"
 
 #include "common/names.h"
 
diff --git a/be/src/service/impala-server.h b/be/src/service/impala-server.h
index 97a01b9..64d127e 100644
--- a/be/src/service/impala-server.h
+++ b/be/src/service/impala-server.h
@@ -36,7 +36,6 @@
 #include "gen-cpp/control_service.pb.h"
 #include "kudu/util/random.h"
 #include "rpc/thrift-server.h"
-#include "runtime/timestamp-value.h"
 #include "runtime/types.h"
 #include "scheduling/query-schedule.h"
 #include "service/client-request-state-map.h"
@@ -74,6 +73,8 @@ class TGetExecSummaryReq;
 class ClientRequestState;
 class QuerySchedule;
 class SimpleLogger;
+class UpdateFilterParamsPB;
+class UpdateFilterResultPB;
 
 /// An ImpalaServer contains both frontend and backend functionality;
 /// it implements ImpalaService (Beeswax), ImpalaHiveServer2Service (HiveServer2)
diff --git a/be/src/statestore/statestore.h b/be/src/statestore/statestore.h
index a08d338..8b95ce7 100644
--- a/be/src/statestore/statestore.h
+++ b/be/src/statestore/statestore.h
@@ -36,7 +36,6 @@
 #include "gen-cpp/Types_types.h"
 #include "rpc/thrift-client.h"
 #include "runtime/client-cache.h"
-#include "runtime/timestamp-value.h"
 #include "statestore/failure-detector.h"
 #include "statestore/statestore-subscriber-client-wrapper.h"
 #include "util/aligned-new.h"
diff --git a/be/src/udf/uda-test.cc b/be/src/udf/uda-test.cc
index d589c75..485a704 100644
--- a/be/src/udf/uda-test.cc
+++ b/be/src/udf/uda-test.cc
@@ -15,8 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <iostream>
-
 #include "common/logging.h"
 #include "udf/uda-test-harness.h"
 #include "testutil/gtest-util.h"
diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt
index c8cd735..f3dc289 100644
--- a/be/src/util/CMakeLists.txt
+++ b/be/src/util/CMakeLists.txt
@@ -45,7 +45,6 @@ add_library(Util
   compress.cc
   cpu-info.cc
   cyclic-barrier.cc
-  decimal-util.cc
   dynamic-util.cc
   debug-util.cc
   decompress.cc
diff --git a/be/src/util/arithmetic-util.h b/be/src/util/arithmetic-util.h
index d6a6446..ca7d3b1 100644
--- a/be/src/util/arithmetic-util.h
+++ b/be/src/util/arithmetic-util.h
@@ -19,6 +19,7 @@
 #define IMPALA_ARITHMETIC_UTIL_H
 
 #include <cstdint>
+#include <limits>
 #include <type_traits>
 
 namespace impala {
@@ -137,6 +138,32 @@ class ArithmeticUtil {
           Ring::INTEGER :
           (std::is_floating_point<T>::value ? Ring::FLOAT : Ring::NEITHER)>
   struct OperateOn;
+
+ public:
+  /// Returns the width of the integer portion of the type, not counting the sign bit.
+  /// Not safe for use with unknown or non-native types, so it is left undefined for them.
+  template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
+      typename std::enable_if<std::is_integral<CVR_REMOVED>{} ||
+                              std::is_same<CVR_REMOVED, unsigned __int128>{} ||
+                              std::is_same<CVR_REMOVED, __int128>{}, int>::type = 0>
+  constexpr static inline int UnsignedWidth() {
+    return std::is_integral<CVR_REMOVED>::value ?
+        std::numeric_limits<CVR_REMOVED>::digits :
+        std::is_same<CVR_REMOVED, unsigned __int128>::value ? 128 :
+        std::is_same<CVR_REMOVED, __int128>::value ? 127 : -1;
+  }
+
+  /// Returns the max value that can be represented in T.
+  template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
+      typename std::enable_if<std::is_integral<CVR_REMOVED> {}||
+                              std::is_same<CVR_REMOVED, __int128> {}, int>::type = 0>
+  constexpr static inline CVR_REMOVED Max() {
+    return std::is_integral<CVR_REMOVED>::value ?
+        std::numeric_limits<CVR_REMOVED>::max() :
+        std::is_same<CVR_REMOVED, __int128>::value ?
+            static_cast<UnsignedType<CVR_REMOVED>>(-1) / 2 : -1;
+  }
+
 };
 
 template <typename T>
diff --git a/be/src/util/auth-util.cc b/be/src/util/auth-util.cc
index 238b9de..c2a4e15 100644
--- a/be/src/util/auth-util.cc
+++ b/be/src/util/auth-util.cc
@@ -23,6 +23,7 @@
 
 #include "common/logging.h"
 #include "gen-cpp/ImpalaInternalService_types.h"
+#include "gutil/strings/substitute.h"
 #include "kudu/security/init.h"
 #include "kudu/util/status.h"
 #include "util/kudu-status-util.h"
diff --git a/be/src/util/auth-util.h b/be/src/util/auth-util.h
index 45c9620..71fb3f3 100644
--- a/be/src/util/auth-util.h
+++ b/be/src/util/auth-util.h
@@ -21,7 +21,6 @@
 #include <gflags/gflags_declare.h>
 
 #include "common/status.h"
-#include "gutil/strings/substitute.h"
 #include "service/impala-server.h"
 
 DECLARE_bool(skip_external_kerberos_auth);
diff --git a/be/src/util/bit-stream-utils-test.cc b/be/src/util/bit-stream-utils-test.cc
index abdb63a..1407f51 100644
--- a/be/src/util/bit-stream-utils-test.cc
+++ b/be/src/util/bit-stream-utils-test.cc
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <boost/utility.hpp>
+
 #include "testutil/gtest-util.h"
 #include "util/bit-packing.inline.h"
 #include "util/bit-stream-utils.inline.h"
diff --git a/be/src/util/bit-util-test.cc b/be/src/util/bit-util-test.cc
index ae3fa62..46419cb 100644
--- a/be/src/util/bit-util-test.cc
+++ b/be/src/util/bit-util-test.cc
@@ -25,40 +25,43 @@
 
 #include <boost/utility.hpp>
 
+#include "runtime/multi-precision.h"
 #include "testutil/gtest-util.h"
+#include "util/arithmetic-util.h"
 #include "util/bit-util.h"
 #include "util/cpu-info.h"
 
 #include "common/names.h"
-#include "runtime/multi-precision.h"
 
 namespace impala {
 
 TEST(BitUtil, UnsignedWidth) {
-  EXPECT_EQ(BitUtil::UnsignedWidth<signed char>(), 7);
-  EXPECT_EQ(BitUtil::UnsignedWidth<unsigned char>(), 8);
-  EXPECT_EQ(BitUtil::UnsignedWidth<volatile long long>(), 63);
-  EXPECT_EQ(BitUtil::UnsignedWidth<unsigned long long&>(), 64);
-  EXPECT_EQ(BitUtil::UnsignedWidth<const int128_t&>(), 127);
-  EXPECT_EQ(BitUtil::UnsignedWidth<const volatile unsigned __int128&>(), 128);
+  // UnsignedWidth was originally in BitUtil. The unit test is kept here for convenience.
+  EXPECT_EQ(ArithmeticUtil::UnsignedWidth<signed char>(), 7);
+  EXPECT_EQ(ArithmeticUtil::UnsignedWidth<unsigned char>(), 8);
+  EXPECT_EQ(ArithmeticUtil::UnsignedWidth<volatile long long>(), 63);
+  EXPECT_EQ(ArithmeticUtil::UnsignedWidth<unsigned long long&>(), 64);
+  EXPECT_EQ(ArithmeticUtil::UnsignedWidth<const int128_t&>(), 127);
+  EXPECT_EQ(ArithmeticUtil::UnsignedWidth<const volatile unsigned __int128&>(), 128);
 }
 
 TEST(BitUtil, Sign) {
-  EXPECT_EQ(BitUtil::Sign<int>(0), 1);
-  EXPECT_EQ(BitUtil::Sign<int>(1), 1);
-  EXPECT_EQ(BitUtil::Sign<int>(-1), -1);
-  EXPECT_EQ(BitUtil::Sign<int>(200), 1);
-  EXPECT_EQ(BitUtil::Sign<int>(-200), -1);
-  EXPECT_EQ(BitUtil::Sign<unsigned int>(0), 1);
-  EXPECT_EQ(BitUtil::Sign<unsigned int>(1), 1);
-  EXPECT_EQ(BitUtil::Sign<unsigned int>(-1U), 1);
-  EXPECT_EQ(BitUtil::Sign<unsigned int>(200), 1);
-  EXPECT_EQ(BitUtil::Sign<unsigned int>(-200), 1);
-  EXPECT_EQ(BitUtil::Sign<int128_t>(0), 1);
-  EXPECT_EQ(BitUtil::Sign<int128_t>(1), 1);
-  EXPECT_EQ(BitUtil::Sign<int128_t>(-1), -1);
-  EXPECT_EQ(BitUtil::Sign<int128_t>(200), 1);
-  EXPECT_EQ(BitUtil::Sign<int128_t>(-200), -1);
+  // Sign was originally in BitUtil. The unit test is kept here for convenience.
+  EXPECT_EQ(Sign<int>(0), 1);
+  EXPECT_EQ(Sign<int>(1), 1);
+  EXPECT_EQ(Sign<int>(-1), -1);
+  EXPECT_EQ(Sign<int>(200), 1);
+  EXPECT_EQ(Sign<int>(-200), -1);
+  EXPECT_EQ(Sign<unsigned int>(0), 1);
+  EXPECT_EQ(Sign<unsigned int>(1), 1);
+  EXPECT_EQ(Sign<unsigned int>(-1U), 1);
+  EXPECT_EQ(Sign<unsigned int>(200), 1);
+  EXPECT_EQ(Sign<unsigned int>(-200), 1);
+  EXPECT_EQ(Sign<int128_t>(0), 1);
+  EXPECT_EQ(Sign<int128_t>(1), 1);
+  EXPECT_EQ(Sign<int128_t>(-1), -1);
+  EXPECT_EQ(Sign<int128_t>(200), 1);
+  EXPECT_EQ(Sign<int128_t>(-200), -1);
 }
 
 TEST(BitUtil, Ceil) {
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index 45f2f19..cf025dd 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -26,77 +26,20 @@
 #include <climits>
 #include <cstdint>
 #include <limits>
-#include <type_traits>
 
 #include "common/compiler-util.h"
 #include "common/logging.h"
 #include "gutil/bits.h"
-#include "runtime/multi-precision.h"
 #include "util/arithmetic-util.h"
 #include "util/cpu-info.h"
 #include "util/sse-util.h"
 
 namespace impala {
 
-// Doubles the width of integer types (e.g. int32_t -> int64_t).
-// Currently only works with a few signed types.
-// Feel free to extend it to other types as well.
-template <typename T>
-struct DoubleWidth {};
-
-template <>
-struct DoubleWidth<int32_t> {
-  using type = int64_t;
-};
-
-template <>
-struct DoubleWidth<int64_t> {
-  using type = int128_t;
-};
-
-template <>
-struct DoubleWidth<int128_t> {
-  using type = int256_t;
-};
-
 /// Utility class to do standard bit tricks
 /// TODO: is this in boost or something else like that?
 class BitUtil {
  public:
-
-  /// Returns the width of the integer portion of the type, not counting the sign bit.
-  /// Not safe for use with unknown or non-native types, so make it undefined
-  template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
-      typename std::enable_if<std::is_integral<CVR_REMOVED>{} ||
-                              std::is_same<CVR_REMOVED, unsigned __int128>{} ||
-                              std::is_same<CVR_REMOVED, __int128>{}, int>::type = 0>
-  constexpr static inline int UnsignedWidth() {
-    return std::is_integral<CVR_REMOVED>::value ?
-        std::numeric_limits<CVR_REMOVED>::digits :
-        std::is_same<CVR_REMOVED, unsigned __int128>::value ? 128 :
-        std::is_same<CVR_REMOVED, __int128>::value ? 127 : -1;
-  }
-
-  /// Returns the max value that can be represented in T.
-  template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
-      typename std::enable_if<std::is_integral<CVR_REMOVED> {}||
-                              std::is_same<CVR_REMOVED, __int128> {}, int>::type = 0>
-  constexpr static inline CVR_REMOVED Max() {
-    return std::is_integral<CVR_REMOVED>::value ?
-        std::numeric_limits<CVR_REMOVED>::max() :
-        std::is_same<CVR_REMOVED, __int128>::value ?
-            static_cast<UnsignedType<CVR_REMOVED>>(-1) / 2 : -1;
-  }
-
-  /// Return an integer signifying the sign of the value, returning +1 for
-  /// positive integers (and zero), -1 for negative integers.
-  /// The extra shift is to silence GCC warnings about full width shift on
-  /// unsigned types. It compiles out in optimized builds into the expected increment.
-  template<typename T>
-  constexpr static inline T Sign(T value) {
-    return 1 | ((value >> (UnsignedWidth<T>() - 1)) >> 1);
-  }
-
   /// Returns the ceil of value/divisor
   constexpr static inline int64_t Ceil(int64_t value, int64_t divisor) {
     return value / divisor + (value % divisor != 0);
@@ -394,11 +337,6 @@ class BitUtil {
   }
 };
 
-template<>
-inline int256_t BitUtil::Sign(int256_t value) {
-  return value < 0 ? -1 : 1;
-}
-
 /// An encapsulation class of SIMD byteswap functions
 class SimdByteSwap {
  public:
diff --git a/be/src/util/bitmap-test.cc b/be/src/util/bitmap-test.cc
index 61efa23..a9e020d 100644
--- a/be/src/util/bitmap-test.cc
+++ b/be/src/util/bitmap-test.cc
@@ -16,10 +16,7 @@
 // under the License.
 
 #include <stdlib.h>
-#include <stdio.h>
-#include <iostream>
 #include <limits.h>
-#include <boost/utility.hpp>
 
 #include "testutil/gtest-util.h"
 #include "util/bitmap.h"
diff --git a/be/src/util/bloom-filter.h b/be/src/util/bloom-filter.h
index d1f6258..c628e12 100644
--- a/be/src/util/bloom-filter.h
+++ b/be/src/util/bloom-filter.h
@@ -29,7 +29,6 @@
 #include "common/logging.h"
 #include "common/status.h"
 #include "gutil/macros.h"
-#include "gutil/strings/substitute.h"
 #include "runtime/bufferpool/buffer-pool.h"
 #include "util/cpu-info.h"
 #include "util/hash-util.h"
diff --git a/be/src/util/cgroup-util.cc b/be/src/util/cgroup-util.cc
index d9bd359..0b09b95 100644
--- a/be/src/util/cgroup-util.cc
+++ b/be/src/util/cgroup-util.cc
@@ -19,7 +19,6 @@
 
 #include <algorithm>
 #include <fstream>
-#include <iostream>
 #include <utility>
 #include <vector>
 
diff --git a/be/src/util/codec.cc b/be/src/util/codec.cc
index 3e91643..f5a8bf1 100644
--- a/be/src/util/codec.cc
+++ b/be/src/util/codec.cc
@@ -25,6 +25,7 @@
 #include "common/compiler-util.h"
 #include "common/logging.h"
 #include "gutil/strings/substitute.h"
+#include "runtime/mem-pool.h"
 #include "util/bit-util.h"
 #include "util/compress.h"
 #include "util/decompress.h"
@@ -76,6 +77,8 @@ Status Codec::GetHadoopCodecClassName(THdfsCompression::type type, string* out_n
       _THdfsCompression_VALUES_TO_NAMES.find(type)->second));
 }
 
+Codec::~Codec() {}
+
 Status Codec::CreateCompressor(MemPool* mem_pool, bool reuse, const string& codec,
     scoped_ptr<Codec>* compressor) {
   CodecMap::const_iterator type = CODEC_MAP.find(codec);
diff --git a/be/src/util/codec.h b/be/src/util/codec.h
index 2b4afe7..5b40e26 100644
--- a/be/src/util/codec.h
+++ b/be/src/util/codec.h
@@ -25,11 +25,11 @@
 
 #include "common/status.h"
 #include "gen-cpp/CatalogObjects_types.h"
-#include "gutil/strings/substitute.h"
-#include "runtime/mem-pool.h"
 
 namespace impala {
 
+class MemPool;
+
 /// Create a compression object.  This is the base class for all compression algorithms. A
 /// compression algorithm is either a compressor or a decompressor.  To add a new
 /// algorithm, generally, both a compressor and a decompressor will be added.  Each of
@@ -107,7 +107,7 @@ class Codec {
   static Status GetHadoopCodecClassName(
       THdfsCompression::type, std::string* out_name) WARN_UNUSED_RESULT;
 
-  virtual ~Codec() {}
+  virtual ~Codec();
 
   /// Initialize the codec. This should only be called once.
   virtual Status Init() WARN_UNUSED_RESULT { return Status::OK(); }
diff --git a/be/src/util/debug-util.h b/be/src/util/debug-util.h
index e53a105..ab0e743 100644
--- a/be/src/util/debug-util.h
+++ b/be/src/util/debug-util.h
@@ -36,16 +36,18 @@
 #include "gen-cpp/beeswax_types.h"
 #include "gen-cpp/parquet_types.h"
 #include "gutil/macros.h"
-#include "runtime/descriptors.h" // for SchemaPath
 
 namespace impala {
 
+class RowBatch;
 class RowDescriptor;
 class TableDescriptor;
 class TupleDescriptor;
 class Tuple;
 class TupleRow;
-class RowBatch;
+
+// Forward declaration to avoid including descriptors.h.
+typedef std::vector<int> SchemaPath;
 
 // TODO: remove these functions and use operator << after upgrading to Thrift 0.11.0 or
 // higher.
diff --git a/be/src/util/decimal-util.cc b/be/src/util/decimal-constants.h
similarity index 56%
rename from be/src/util/decimal-util.cc
rename to be/src/util/decimal-constants.h
index 031c490..5b7e20f 100644
--- a/be/src/util/decimal-util.cc
+++ b/be/src/util/decimal-constants.h
@@ -15,12 +15,24 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "util/decimal-util.h"
+// Common constants shared between decimal-related modules.
+
+#pragma once
+
+#include <cstdint>
 
 namespace impala {
 
-const int32_t DecimalUtil::MAX_UNSCALED_DECIMAL4;
-const int64_t DecimalUtil::MAX_UNSCALED_DECIMAL8;
-const int128_t DecimalUtil::MAX_UNSCALED_DECIMAL16;
+/// Maximum absolute value of a valid Decimal4Value. This is 9 digits of 9's.
+constexpr int32_t MAX_UNSCALED_DECIMAL4 = 999999999;
+
+/// Maximum absolute value of a valid Decimal8Value. This is 18 digits of 9's.
+constexpr int64_t MAX_UNSCALED_DECIMAL8 = 999999999999999999;
+
+/// Maximum absolute value of a valid Decimal16Value. This is 38 digits of 9's.
+constexpr __int128_t MAX_UNSCALED_DECIMAL16 = 99 + 100 *
+    (MAX_UNSCALED_DECIMAL8 + (1 + MAX_UNSCALED_DECIMAL8) *
+     static_cast<__int128_t>(MAX_UNSCALED_DECIMAL8));
+
+} // namespace impala
 
-}
diff --git a/be/src/util/decimal-util.h b/be/src/util/decimal-util.h
index d35d1f5..f505ecc 100644
--- a/be/src/util/decimal-util.h
+++ b/be/src/util/decimal-util.h
@@ -28,22 +28,12 @@
 #include "runtime/types.h"
 #include "util/arithmetic-util.h"
 #include "util/bit-util.h"
+#include "util/decimal-constants.h"
 
 namespace impala {
 
 class DecimalUtil {
  public:
-  /// Maximum absolute value of a valid Decimal4Value. This is 9 digits of 9's.
-  static const int32_t MAX_UNSCALED_DECIMAL4 = 999999999;
-
-  /// Maximum absolute value of a valid Decimal8Value. This is 18 digits of 9's.
-  static const int64_t MAX_UNSCALED_DECIMAL8 = 999999999999999999;
-
-  /// Maximum absolute value a valid Decimal16Value. This is 38 digits of 9's.
-  static const int128_t MAX_UNSCALED_DECIMAL16 = 99 + 100 *
-      (MAX_UNSCALED_DECIMAL8 + (1 + MAX_UNSCALED_DECIMAL8) *
-       static_cast<int128_t>(MAX_UNSCALED_DECIMAL8));
-
   // Helper function that checks for multiplication overflow. We only check for overflow
   // if may_overflow is false.
   template <typename T>
@@ -96,7 +86,7 @@ class DecimalUtil {
         // here that it is a multiple of two.
         if (abs(remainder) >= (divisor >> 1)) {
           // Bias at zero must be corrected by sign of dividend.
-          result += BitUtil::Sign(value);
+          result += Sign(value);
         }
       }
       return result;
diff --git a/be/src/util/decompress.cc b/be/src/util/decompress.cc
index 8d93409..8430328 100644
--- a/be/src/util/decompress.cc
+++ b/be/src/util/decompress.cc
@@ -32,6 +32,7 @@
 
 #include "common/logging.h"
 #include "exec/read-write-util.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
 
diff --git a/be/src/util/decompress.h b/be/src/util/decompress.h
index 8227293..1ec3f4f 100644
--- a/be/src/util/decompress.h
+++ b/be/src/util/decompress.h
@@ -26,7 +26,6 @@
 #include <zstd.h>
 
 #include "common/status.h"
-#include "gutil/strings/substitute.h"
 #include "util/codec.h"
 
 namespace impala {
diff --git a/be/src/util/dict-test.cc b/be/src/util/dict-test.cc
index 73c64a5..dbc57b7 100644
--- a/be/src/util/dict-test.cc
+++ b/be/src/util/dict-test.cc
@@ -15,10 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <stdlib.h>
-#include <stdio.h>
-
-#include <iostream>
 #include <utility>
 
 #include "runtime/mem-tracker.h"
@@ -26,7 +22,6 @@
 #include "runtime/timestamp-value.h"
 #include "testutil/gtest-util.h"
 #include "testutil/rand-util.h"
-#include "util/bit-packing.inline.h"
 #include "util/dict-encoding.h"
 #include "util/encoding-test-util.h"
 
diff --git a/be/src/util/event-metrics.h b/be/src/util/event-metrics.h
index e1a13c6..e6fa1e1 100644
--- a/be/src/util/event-metrics.h
+++ b/be/src/util/event-metrics.h
@@ -19,7 +19,6 @@
 
 #include <string>
 
-#include "gutil/strings/substitute.h"
 #include "util/metrics-fwd.h"
 
 namespace impala {
diff --git a/be/src/util/logging-support-test.cc b/be/src/util/logging-support-test.cc
index a4d0edc..c3e65fa 100644
--- a/be/src/util/logging-support-test.cc
+++ b/be/src/util/logging-support-test.cc
@@ -15,7 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <iostream>
 #include <boost/filesystem.hpp>
 #include <ctime>
 
diff --git a/be/src/util/mem-info.cc b/be/src/util/mem-info.cc
index 10ce09f..a3f9d9d 100644
--- a/be/src/util/mem-info.cc
+++ b/be/src/util/mem-info.cc
@@ -24,8 +24,6 @@
 #include <unistd.h>
 #include <cctype>
 #include <fstream>
-#include <iostream>
-#include <sstream>
 
 #include <boost/algorithm/string.hpp>
 #include <boost/lexical_cast.hpp>
diff --git a/be/src/util/memory-metrics.h b/be/src/util/memory-metrics.h
index b1efdcc..e3b9f51 100644
--- a/be/src/util/memory-metrics.h
+++ b/be/src/util/memory-metrics.h
@@ -20,14 +20,12 @@
 #include "util/metrics.h"
 
 #include <mutex>
-#include <boost/bind.hpp>
 #include <gperftools/malloc_extension.h>
 #if defined(ADDRESS_SANITIZER) || defined(THREAD_SANITIZER)
 #include <sanitizer/allocator_interface.h>
 #endif
 
 #include "gen-cpp/Frontend_types.h"
-#include "util/debug-util.h"
 
 namespace impala {
 
diff --git a/be/src/util/metrics.h b/be/src/util/metrics.h
index 65af37f..e6971e8 100644
--- a/be/src/util/metrics.h
+++ b/be/src/util/metrics.h
@@ -35,12 +35,13 @@
 #include "util/debug-util.h"
 #include "util/metrics-fwd.h"
 #include "util/spinlock.h"
-#include "util/webserver.h"
 
 using kudu::HttpStatusCode;
 
 namespace impala {
 
+class Webserver;
+
 /// Singleton that provides metric definitions. Metrics are defined in metrics.json
 /// and generate_metrics.py produces MetricDefs.thrift. This singleton wraps an instance
 /// of the thrift definitions.
@@ -463,25 +464,26 @@ class MetricGroup {
   typedef std::unordered_map<std::string, MetricGroup*> ChildGroupMap;
   ChildGroupMap children_;
 
+  // Alias for the type behind Webserver::WebRequest, to avoid including webserver.h.
+  using WebRequest = kudu::WebCallbackRegistry::WebRequest;
+
   /// Webserver callback for /metrics. Produces a tree of JSON values, each representing a
   /// metric group, and each including a list of metrics, and a list of immediate
   /// children.  If args contains a paramater 'metric', only the json for that metric is
   /// returned.
-  void TemplateCallback(const Webserver::WebRequest& req,
-      rapidjson::Document* document);
+  void TemplateCallback(const WebRequest& req, rapidjson::Document* document);
 
   /// Webserver callback for /metricsPrometheus. Produces string in prometheus format,
   /// each representing metric group, and each including a list of metrics, and a list
   /// of immediate children.  If args contains a paramater 'metric', only the json for
   /// that metric is returned.
-  void PrometheusCallback(const Webserver::WebRequest& req, std::stringstream* data,
+  void PrometheusCallback(const WebRequest& req, std::stringstream* data,
       HttpStatusCode* response);
 
   /// Legacy webpage callback for CM 5.0 and earlier. Produces a flattened map of (key,
   /// value) pairs for all metrics in this hierarchy.
   /// If args contains a paramater 'metric', only the json for that metric is returned.
-  void CMCompatibleCallback(const Webserver::WebRequest& req,
-      rapidjson::Document* document);
+  void CMCompatibleCallback(const WebRequest& req, rapidjson::Document* document);
 
   /// Non-templated implementation for FindMetricForTesting() that does not cast.
   Metric* FindMetricForTestingInternal(const std::string& key);
diff --git a/be/src/util/os-info.cc b/be/src/util/os-info.cc
index 0103e76..0811248 100644
--- a/be/src/util/os-info.cc
+++ b/be/src/util/os-info.cc
@@ -20,13 +20,16 @@
 #include <stdlib.h>
 #include <string.h>
 #include <fstream>
-#include <iostream>
 #include <sstream>
 
 #include <unistd.h>
-#include <boost/algorithm/string.hpp>
 #include <sys/stat.h>
 
+#include <boost/algorithm/string/classification.hpp>
+#include <boost/algorithm/string/constants.hpp>
+#include <boost/algorithm/string/detail/classification.hpp>
+#include <boost/algorithm/string/split.hpp>
+
 #include "common/names.h"
 
 using boost::algorithm::is_any_of;
diff --git a/be/src/util/pretty-printer.h b/be/src/util/pretty-printer.h
index e9c3f18..a211803 100644
--- a/be/src/util/pretty-printer.h
+++ b/be/src/util/pretty-printer.h
@@ -18,12 +18,13 @@
 #ifndef IMPALA_UTIL_PRETTY_PRINTER_H
 #define IMPALA_UTIL_PRETTY_PRINTER_H
 
-#include <boost/algorithm/string.hpp>
 #include <cmath>
 #include <iomanip>
 #include <limits>
 #include <sstream>
 
+#include <boost/algorithm/string/join.hpp>
+
 #include "gen-cpp/RuntimeProfile_types.h"
 #include "util/cpu-info.h"
 #include "util/template-util.h"
diff --git a/be/src/util/proc-info-test.cc b/be/src/util/proc-info-test.cc
index d9793e3..87ea572 100644
--- a/be/src/util/proc-info-test.cc
+++ b/be/src/util/proc-info-test.cc
@@ -15,20 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <iostream>
-#include <sstream>
-#include <stdio.h>
-#include <stdlib.h>
-
 #include <gtest/gtest.h>
 
-#include "common/init.h"
-#include "service/fe-support.h"
 #include "testutil/gtest-util.h"
 #include "util/cgroup-util.h"
 #include "util/mem-info.h"
 #include "util/process-state-info.h"
-#include "util/test-info.h"
 
 #include "common/names.h"
 
diff --git a/be/src/util/process-state-info.cc b/be/src/util/process-state-info.cc
index e72138a..bc2b97e 100644
--- a/be/src/util/process-state-info.cc
+++ b/be/src/util/process-state-info.cc
@@ -20,7 +20,6 @@
 #include <stdlib.h>
 #include <unistd.h>
 
-#include <iostream>
 #include <fstream>
 #include <sstream>
 #include <boost/algorithm/string.hpp>
diff --git a/be/src/util/redactor-test-utils.h b/be/src/util/redactor-test-utils.h
index 5f156be..7d1aa96 100644
--- a/be/src/util/redactor-test-utils.h
+++ b/be/src/util/redactor-test-utils.h
@@ -21,8 +21,6 @@
 
 #include <gtest/gtest.h>
 
-#include "gutil/strings/substitute.h"
-
 namespace impala {
 
 /// Utility class for creating a redaction config file that will be automatically deleted
diff --git a/be/src/util/runtime-profile.cc b/be/src/util/runtime-profile.cc
index 005b70a..923a919 100644
--- a/be/src/util/runtime-profile.cc
+++ b/be/src/util/runtime-profile.cc
@@ -29,6 +29,7 @@
 #include "gutil/strings/strip.h"
 #include "kudu/util/logging.h"
 #include "rpc/thrift-util.h"
+#include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
 #include "util/coding-util.h"
 #include "util/compress.h"
diff --git a/be/src/util/string-parser-test.cc b/be/src/util/string-parser-test.cc
index 0eaef06..2e53a37 100644
--- a/be/src/util/string-parser-test.cc
+++ b/be/src/util/string-parser-test.cc
@@ -19,7 +19,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <cstdint>
-#include <iostream>
 #include <limits>
 #include <boost/lexical_cast.hpp>
 #include "testutil/gtest-util.h"
diff --git a/be/src/util/string-parser.h b/be/src/util/string-parser.h
index a688d24..6dc8dda 100644
--- a/be/src/util/string-parser.h
+++ b/be/src/util/string-parser.h
@@ -52,6 +52,9 @@ namespace impala {
 ///  - Validate input using _sidd_compare_ranges
 ///  - Since we know the length, we can parallelize this: i.e. result = 100*s[0] +
 ///    10*s[1] + s[2]
+///
+/// TODO: people went crazy with huge inline functions in this file - most should be
+/// moved out-of-line.
 class StringParser {
  public:
   enum ParseResult {
diff --git a/be/src/util/symbols-util-test.cc b/be/src/util/symbols-util-test.cc
index 63bd3ee..c6ced23 100644
--- a/be/src/util/symbols-util-test.cc
+++ b/be/src/util/symbols-util-test.cc
@@ -15,9 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <stdlib.h>
 #include <stdio.h>
-#include <iostream>
 
 #include "testutil/gtest-util.h"
 #include "util/symbols-util.h"
diff --git a/be/src/util/system-state-info.cc b/be/src/util/system-state-info.cc
index b5495c5..b0344cc 100644
--- a/be/src/util/system-state-info.cc
+++ b/be/src/util/system-state-info.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "gutil/strings/numbers.h"
 #include "gutil/strings/split.h"
 #include "gutil/strings/strip.h"
 #include "gutil/strings/util.h"
@@ -22,14 +23,11 @@
 #include "kudu/util/faststring.h"
 #include "kudu/util/logging.h"
 #include "util/disk-info.h"
-#include "util/error-util.h"
-#include "util/string-parser.h"
 #include "util/system-state-info.h"
 #include "util/time.h"
 
 #include <numeric>
 #include <fstream>
-#include <iostream>
 
 #include "common/names.h"
 
diff --git a/be/src/util/tuple-row-compare.h b/be/src/util/tuple-row-compare.h
index c417bc2..59eb5f8 100644
--- a/be/src/util/tuple-row-compare.h
+++ b/be/src/util/tuple-row-compare.h
@@ -21,7 +21,6 @@
 
 #include "common/compiler-util.h"
 #include "exprs/scalar-expr.h"
-#include "exprs/scalar-expr-evaluator.h"
 #include "runtime/descriptors.h"
 #include "runtime/raw-value.h"
 #include "runtime/raw-value.inline.h"
diff --git a/be/src/util/uid-util-test.cc b/be/src/util/uid-util-test.cc
index 1f4c9ee..60b985b 100644
--- a/be/src/util/uid-util-test.cc
+++ b/be/src/util/uid-util-test.cc
@@ -15,10 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <iostream>
-
 #include <boost/uuid/uuid_generators.hpp>
 
 #include "testutil/gtest-util.h"
diff --git a/be/src/util/uid-util.cc b/be/src/util/uid-util.cc
index fac5be7..9910cc4 100644
--- a/be/src/util/uid-util.cc
+++ b/be/src/util/uid-util.cc
@@ -19,6 +19,8 @@
 
 #include <boost/uuid/uuid_generators.hpp>
 
+#include "common/names.h"
+
 namespace impala {
 
 string GenerateUUIDString() {
diff --git a/be/src/util/uid-util.h b/be/src/util/uid-util.h
index aaa6f09..0a23f6a 100644
--- a/be/src/util/uid-util.h
+++ b/be/src/util/uid-util.h
@@ -17,11 +17,15 @@
 
 #pragma once
 
+#include <string.h>
+#include <cstdint>
+#include <string>
+
 #include <boost/uuid/uuid.hpp>
 
-#include "gen-cpp/Types_types.h"  // for TUniqueId
-#include "gen-cpp/control_service.pb.h"
-#include "util/debug-util.h"
+#include "common/logging.h"
+#include "gen-cpp/Types_types.h"
+#include "gen-cpp/common.pb.h"
 
 namespace impala {
 
@@ -97,7 +101,7 @@ std::string GenerateUUIDString();
 
 /// generates a 16 byte UUID
 inline TUniqueId GenerateUUID() {
-  const string& u = GenerateUUIDString();
+  const std::string& u = GenerateUUIDString();
   TUniqueId uid;
   memcpy(&uid.hi, u.data(), sizeof(int64_t));
   memcpy(&uid.lo, u.data() + sizeof(int64_t), sizeof(int64_t));