Posted to commits@impala.apache.org by ta...@apache.org on 2017/11/18 00:31:44 UTC

[01/16] incubator-impala git commit: IMPALA-4252: Min-max runtime filters for Kudu

Repository: incubator-impala
Updated Branches:
  refs/heads/master 3ddafcd29 -> b840137c9


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-views.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-views.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-views.test
index 1345cab..5bb8828 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-views.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-views.test
@@ -93,7 +93,7 @@ PLAN-ROOT SINK
 |
 |--16:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.nation.n_regionkey = tpch.region.r_regionkey
-|  |  runtime filters: RF005 <- tpch.region.r_regionkey
+|  |  runtime filters: RF010 <- tpch.region.r_regionkey
 |  |
 |  |--04:SCAN HDFS [tpch.region]
 |  |     partitions=1/1 files=1 size=384B
@@ -101,19 +101,19 @@ PLAN-ROOT SINK
 |  |
 |  15:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.supplier.s_nationkey = tpch.nation.n_nationkey
-|  |  runtime filters: RF006 <- tpch.nation.n_nationkey
+|  |  runtime filters: RF012 <- tpch.nation.n_nationkey
 |  |
 |  |--03:SCAN HDFS [tpch.nation]
 |  |     partitions=1/1 files=1 size=2.15KB
-|  |     runtime filters: RF005 -> tpch.nation.n_regionkey
+|  |     runtime filters: RF010 -> tpch.nation.n_regionkey
 |  |
 |  14:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.supplier.s_suppkey = tpch.partsupp.ps_suppkey
-|  |  runtime filters: RF007 <- tpch.partsupp.ps_suppkey
+|  |  runtime filters: RF014 <- tpch.partsupp.ps_suppkey
 |  |
 |  |--13:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: tpch.partsupp.ps_partkey = tpch.part.p_partkey
-|  |  |  runtime filters: RF008 <- tpch.part.p_partkey
+|  |  |  runtime filters: RF016 <- tpch.part.p_partkey
 |  |  |
 |  |  |--00:SCAN HDFS [tpch.part]
 |  |  |     partitions=1/1 files=1 size=22.83MB
@@ -121,11 +121,11 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpch.partsupp]
 |  |     partitions=1/1 files=1 size=112.71MB
-|  |     runtime filters: RF008 -> tpch.partsupp.ps_partkey
+|  |     runtime filters: RF016 -> tpch.partsupp.ps_partkey
 |  |
 |  01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF006 -> tpch.supplier.s_nationkey, RF007 -> tpch.supplier.s_suppkey
+|     runtime filters: RF012 -> tpch.supplier.s_nationkey, RF014 -> tpch.supplier.s_suppkey
 |
 12:AGGREGATE [FINALIZE]
 |  output: min(ps_supplycost)
@@ -133,7 +133,7 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [INNER JOIN]
 |  hash predicates: n_regionkey = r_regionkey
-|  runtime filters: RF002 <- r_regionkey
+|  runtime filters: RF004 <- r_regionkey
 |
 |--08:SCAN HDFS [tpch.region]
 |     partitions=1/1 files=1 size=384B
@@ -141,23 +141,23 @@ PLAN-ROOT SINK
 |
 10:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF003 <- n_nationkey
+|  runtime filters: RF006 <- n_nationkey
 |
 |--07:SCAN HDFS [tpch.nation]
 |     partitions=1/1 files=1 size=2.15KB
-|     runtime filters: RF002 -> n_regionkey
+|     runtime filters: RF004 -> n_regionkey
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: ps_suppkey = s_suppkey
-|  runtime filters: RF004 <- s_suppkey
+|  runtime filters: RF008 <- s_suppkey
 |
 |--06:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF003 -> s_nationkey
+|     runtime filters: RF006 -> s_nationkey
 |
 05:SCAN HDFS [tpch.partsupp]
    partitions=1/1 files=1 size=112.71MB
-   runtime filters: RF000 -> tpch.partsupp.ps_partkey, RF004 -> ps_suppkey
+   runtime filters: RF000 -> tpch.partsupp.ps_partkey, RF008 -> ps_suppkey
 ====
 # TPCH-Q3
 # Q3 - Shipping Priority Query
@@ -204,7 +204,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.lineitem.l_orderkey = tpch.orders.o_orderkey
-|  runtime filters: RF001 <- tpch.orders.o_orderkey
+|  runtime filters: RF002 <- tpch.orders.o_orderkey
 |
 |--01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
@@ -214,7 +214,7 @@ PLAN-ROOT SINK
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: tpch.lineitem.l_shipdate > '1995-03-15'
-   runtime filters: RF001 -> tpch.lineitem.l_orderkey
+   runtime filters: RF002 -> tpch.lineitem.l_orderkey
 ====
 # TPCH-Q4
 # Q4 - Order Priority Checking Query
@@ -308,7 +308,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.supplier.s_nationkey = tpch.nation.n_nationkey
-|  runtime filters: RF001 <- tpch.nation.n_nationkey
+|  runtime filters: RF002 <- tpch.nation.n_nationkey
 |
 |--04:SCAN HDFS [tpch.nation]
 |     partitions=1/1 files=1 size=2.15KB
@@ -316,32 +316,32 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.customer.c_nationkey = tpch.supplier.s_nationkey, tpch.lineitem.l_suppkey = tpch.supplier.s_suppkey
-|  runtime filters: RF002 <- tpch.supplier.s_nationkey, RF003 <- tpch.supplier.s_suppkey
+|  runtime filters: RF004 <- tpch.supplier.s_nationkey, RF005 <- tpch.supplier.s_suppkey
 |
 |--03:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> tpch.supplier.s_nationkey
+|     runtime filters: RF002 -> tpch.supplier.s_nationkey
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.orders.o_custkey = tpch.customer.c_custkey
-|  runtime filters: RF004 <- tpch.customer.c_custkey
+|  runtime filters: RF008 <- tpch.customer.c_custkey
 |
 |--00:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
-|     runtime filters: RF001 -> tpch.customer.c_nationkey, RF002 -> tpch.customer.c_nationkey
+|     runtime filters: RF002 -> tpch.customer.c_nationkey, RF004 -> tpch.customer.c_nationkey
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.lineitem.l_orderkey = tpch.orders.o_orderkey
-|  runtime filters: RF005 <- tpch.orders.o_orderkey
+|  runtime filters: RF010 <- tpch.orders.o_orderkey
 |
 |--01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: tpch.orders.o_orderdate < '1995-01-01', tpch.orders.o_orderdate >= '1994-01-01'
-|     runtime filters: RF004 -> tpch.orders.o_custkey
+|     runtime filters: RF008 -> tpch.orders.o_custkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF003 -> tpch.lineitem.l_suppkey, RF005 -> tpch.lineitem.l_orderkey
+   runtime filters: RF005 -> tpch.lineitem.l_suppkey, RF010 -> tpch.lineitem.l_orderkey
 ====
 # TPCH-Q6
 # Q6 - Forecasting Revenue Change Query
@@ -424,14 +424,14 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.supplier.s_nationkey = tpch.nation.n_nationkey
-|  runtime filters: RF001 <- tpch.nation.n_nationkey
+|  runtime filters: RF002 <- tpch.nation.n_nationkey
 |
 |--04:SCAN HDFS [tpch.nation]
 |     partitions=1/1 files=1 size=2.15KB
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.orders.o_custkey = tpch.customer.c_custkey
-|  runtime filters: RF002 <- tpch.customer.c_custkey
+|  runtime filters: RF004 <- tpch.customer.c_custkey
 |
 |--03:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
@@ -439,24 +439,24 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.lineitem.l_suppkey = tpch.supplier.s_suppkey
-|  runtime filters: RF003 <- tpch.supplier.s_suppkey
+|  runtime filters: RF006 <- tpch.supplier.s_suppkey
 |
 |--00:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> tpch.supplier.s_nationkey
+|     runtime filters: RF002 -> tpch.supplier.s_nationkey
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.lineitem.l_orderkey = tpch.orders.o_orderkey
-|  runtime filters: RF004 <- tpch.orders.o_orderkey
+|  runtime filters: RF008 <- tpch.orders.o_orderkey
 |
 |--02:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF002 -> tpch.orders.o_custkey
+|     runtime filters: RF004 -> tpch.orders.o_custkey
 |
 01:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: tpch.lineitem.l_shipdate <= '1996-12-31', tpch.lineitem.l_shipdate >= '1995-01-01'
-   runtime filters: RF003 -> tpch.lineitem.l_suppkey, RF004 -> tpch.lineitem.l_orderkey
+   runtime filters: RF006 -> tpch.lineitem.l_suppkey, RF008 -> tpch.lineitem.l_orderkey
 ====
 # TPCH-Q8
 # Q8 - National Market Share Query
@@ -516,7 +516,7 @@ PLAN-ROOT SINK
 |
 13:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.nation.n_regionkey = tpch.region.r_regionkey
-|  runtime filters: RF001 <- tpch.region.r_regionkey
+|  runtime filters: RF002 <- tpch.region.r_regionkey
 |
 |--07:SCAN HDFS [tpch.region]
 |     partitions=1/1 files=1 size=384B
@@ -524,19 +524,19 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.customer.c_nationkey = tpch.nation.n_nationkey
-|  runtime filters: RF002 <- tpch.nation.n_nationkey
+|  runtime filters: RF004 <- tpch.nation.n_nationkey
 |
 |--05:SCAN HDFS [tpch.nation]
 |     partitions=1/1 files=1 size=2.15KB
-|     runtime filters: RF001 -> tpch.nation.n_regionkey
+|     runtime filters: RF002 -> tpch.nation.n_regionkey
 |
 11:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.customer.c_custkey = tpch.orders.o_custkey
-|  runtime filters: RF003 <- tpch.orders.o_custkey
+|  runtime filters: RF006 <- tpch.orders.o_custkey
 |
 |--10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.lineitem.l_suppkey = tpch.supplier.s_suppkey
-|  |  runtime filters: RF004 <- tpch.supplier.s_suppkey
+|  |  runtime filters: RF008 <- tpch.supplier.s_suppkey
 |  |
 |  |--01:SCAN HDFS [tpch.supplier]
 |  |     partitions=1/1 files=1 size=1.33MB
@@ -544,11 +544,11 @@ PLAN-ROOT SINK
 |  |
 |  09:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.orders.o_orderkey = tpch.lineitem.l_orderkey
-|  |  runtime filters: RF005 <- tpch.lineitem.l_orderkey
+|  |  runtime filters: RF010 <- tpch.lineitem.l_orderkey
 |  |
 |  |--08:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: tpch.lineitem.l_partkey = tpch.part.p_partkey
-|  |  |  runtime filters: RF006 <- tpch.part.p_partkey
+|  |  |  runtime filters: RF012 <- tpch.part.p_partkey
 |  |  |
 |  |  |--00:SCAN HDFS [tpch.part]
 |  |  |     partitions=1/1 files=1 size=22.83MB
@@ -556,16 +556,16 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpch.lineitem]
 |  |     partitions=1/1 files=1 size=718.94MB
-|  |     runtime filters: RF004 -> tpch.lineitem.l_suppkey, RF006 -> tpch.lineitem.l_partkey
+|  |     runtime filters: RF008 -> tpch.lineitem.l_suppkey, RF012 -> tpch.lineitem.l_partkey
 |  |
 |  03:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: tpch.orders.o_orderdate <= '1996-12-31', tpch.orders.o_orderdate >= '1995-01-01'
-|     runtime filters: RF005 -> tpch.orders.o_orderkey
+|     runtime filters: RF010 -> tpch.orders.o_orderkey
 |
 04:SCAN HDFS [tpch.customer]
    partitions=1/1 files=1 size=23.08MB
-   runtime filters: RF002 -> tpch.customer.c_nationkey, RF003 -> tpch.customer.c_custkey
+   runtime filters: RF004 -> tpch.customer.c_nationkey, RF006 -> tpch.customer.c_custkey
 ====
 # TPCH-Q9
 # Q9 - Product Type Measure Query
@@ -619,38 +619,38 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.lineitem.l_partkey = tpch.partsupp.ps_partkey, tpch.lineitem.l_suppkey = tpch.partsupp.ps_suppkey
-|  runtime filters: RF001 <- tpch.partsupp.ps_partkey, RF002 <- tpch.partsupp.ps_suppkey
+|  runtime filters: RF002 <- tpch.partsupp.ps_partkey, RF003 <- tpch.partsupp.ps_suppkey
 |
 |--03:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.lineitem.l_suppkey = tpch.supplier.s_suppkey
-|  runtime filters: RF003 <- tpch.supplier.s_suppkey
+|  runtime filters: RF006 <- tpch.supplier.s_suppkey
 |
 |--01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF000 -> tpch.supplier.s_nationkey, RF002 -> tpch.supplier.s_suppkey
+|     runtime filters: RF000 -> tpch.supplier.s_nationkey, RF003 -> tpch.supplier.s_suppkey
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.lineitem.l_orderkey = tpch.orders.o_orderkey
-|  runtime filters: RF004 <- tpch.orders.o_orderkey
+|  runtime filters: RF008 <- tpch.orders.o_orderkey
 |
 |--04:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.lineitem.l_partkey = tpch.part.p_partkey
-|  runtime filters: RF005 <- tpch.part.p_partkey
+|  runtime filters: RF010 <- tpch.part.p_partkey
 |
 |--00:SCAN HDFS [tpch.part]
 |     partitions=1/1 files=1 size=22.83MB
 |     predicates: tpch.part.p_name LIKE '%green%'
-|     runtime filters: RF001 -> tpch.part.p_partkey
+|     runtime filters: RF002 -> tpch.part.p_partkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF001 -> tpch.lineitem.l_partkey, RF002 -> tpch.lineitem.l_suppkey, RF003 -> tpch.lineitem.l_suppkey, RF004 -> tpch.lineitem.l_orderkey, RF005 -> tpch.lineitem.l_partkey
+   runtime filters: RF002 -> tpch.lineitem.l_partkey, RF003 -> tpch.lineitem.l_suppkey, RF006 -> tpch.lineitem.l_suppkey, RF008 -> tpch.lineitem.l_orderkey, RF010 -> tpch.lineitem.l_partkey
 ====
 # TPCH-Q10
 # Q10 - Returned Item Reporting Query
@@ -706,11 +706,11 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.customer.c_custkey = tpch.orders.o_custkey
-|  runtime filters: RF001 <- tpch.orders.o_custkey
+|  runtime filters: RF002 <- tpch.orders.o_custkey
 |
 |--04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.lineitem.l_orderkey = tpch.orders.o_orderkey
-|  |  runtime filters: RF002 <- tpch.orders.o_orderkey
+|  |  runtime filters: RF004 <- tpch.orders.o_orderkey
 |  |
 |  |--01:SCAN HDFS [tpch.orders]
 |  |     partitions=1/1 files=1 size=162.56MB
@@ -719,11 +719,11 @@ PLAN-ROOT SINK
 |  02:SCAN HDFS [tpch.lineitem]
 |     partitions=1/1 files=1 size=718.94MB
 |     predicates: tpch.lineitem.l_returnflag = 'R'
-|     runtime filters: RF002 -> tpch.lineitem.l_orderkey
+|     runtime filters: RF004 -> tpch.lineitem.l_orderkey
 |
 00:SCAN HDFS [tpch.customer]
    partitions=1/1 files=1 size=23.08MB
-   runtime filters: RF000 -> tpch.customer.c_nationkey, RF001 -> tpch.customer.c_custkey
+   runtime filters: RF000 -> tpch.customer.c_nationkey, RF002 -> tpch.customer.c_custkey
 ====
 # TPCH-Q11
 # Q11 - Important Stock Identification
@@ -774,7 +774,7 @@ PLAN-ROOT SINK
 |  |
 |  10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.supplier.s_nationkey = tpch.nation.n_nationkey
-|  |  runtime filters: RF002 <- tpch.nation.n_nationkey
+|  |  runtime filters: RF004 <- tpch.nation.n_nationkey
 |  |
 |  |--08:SCAN HDFS [tpch.nation]
 |  |     partitions=1/1 files=1 size=2.15KB
@@ -782,15 +782,15 @@ PLAN-ROOT SINK
 |  |
 |  09:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.partsupp.ps_suppkey = tpch.supplier.s_suppkey
-|  |  runtime filters: RF003 <- tpch.supplier.s_suppkey
+|  |  runtime filters: RF006 <- tpch.supplier.s_suppkey
 |  |
 |  |--07:SCAN HDFS [tpch.supplier]
 |  |     partitions=1/1 files=1 size=1.33MB
-|  |     runtime filters: RF002 -> tpch.supplier.s_nationkey
+|  |     runtime filters: RF004 -> tpch.supplier.s_nationkey
 |  |
 |  06:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF003 -> tpch.partsupp.ps_suppkey
+|     runtime filters: RF006 -> tpch.partsupp.ps_suppkey
 |
 05:AGGREGATE [FINALIZE]
 |  output: sum(tpch.partsupp.ps_supplycost * tpch.partsupp.ps_availqty)
@@ -806,7 +806,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.partsupp.ps_suppkey = tpch.supplier.s_suppkey
-|  runtime filters: RF001 <- tpch.supplier.s_suppkey
+|  runtime filters: RF002 <- tpch.supplier.s_suppkey
 |
 |--01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
@@ -814,7 +814,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch.partsupp]
    partitions=1/1 files=1 size=112.71MB
-   runtime filters: RF001 -> tpch.partsupp.ps_suppkey
+   runtime filters: RF002 -> tpch.partsupp.ps_suppkey
 ====
 # TPCH-Q12
 # Q12 - Shipping Mode and Order Priority Query
@@ -1113,7 +1113,7 @@ PLAN-ROOT SINK
 |
 |--04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.lineitem.l_partkey = tpch.part.p_partkey
-|  |  runtime filters: RF001 <- tpch.part.p_partkey
+|  |  runtime filters: RF002 <- tpch.part.p_partkey
 |  |
 |  |--01:SCAN HDFS [tpch.part]
 |  |     partitions=1/1 files=1 size=22.83MB
@@ -1121,7 +1121,7 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.lineitem]
 |     partitions=1/1 files=1 size=718.94MB
-|     runtime filters: RF001 -> tpch.lineitem.l_partkey
+|     runtime filters: RF002 -> tpch.lineitem.l_partkey
 |
 03:AGGREGATE [FINALIZE]
 |  output: avg(tpch.lineitem.l_quantity)
@@ -1191,22 +1191,22 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.orders.o_custkey = tpch.customer.c_custkey
-|  runtime filters: RF001 <- tpch.customer.c_custkey
+|  runtime filters: RF002 <- tpch.customer.c_custkey
 |
 |--00:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: tpch.lineitem.l_orderkey = tpch.orders.o_orderkey
-|  runtime filters: RF002 <- tpch.orders.o_orderkey
+|  runtime filters: RF004 <- tpch.orders.o_orderkey
 |
 |--01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF000 -> tpch.orders.o_orderkey, RF001 -> tpch.orders.o_custkey
+|     runtime filters: RF000 -> tpch.orders.o_orderkey, RF002 -> tpch.orders.o_custkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF002 -> tpch.lineitem.l_orderkey
+   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF004 -> tpch.lineitem.l_orderkey
 ====
 # TPCH-Q19
 # Q19 - Discounted Revenue Query
@@ -1316,7 +1316,7 @@ PLAN-ROOT SINK
 |
 |--08:HASH JOIN [INNER JOIN]
 |  |  hash predicates: tpch.supplier.s_nationkey = tpch.nation.n_nationkey
-|  |  runtime filters: RF004 <- tpch.nation.n_nationkey
+|  |  runtime filters: RF008 <- tpch.nation.n_nationkey
 |  |
 |  |--01:SCAN HDFS [tpch.nation]
 |  |     partitions=1/1 files=1 size=2.15KB
@@ -1324,16 +1324,16 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF004 -> tpch.supplier.s_nationkey
+|     runtime filters: RF008 -> tpch.supplier.s_nationkey
 |
 07:HASH JOIN [RIGHT SEMI JOIN]
 |  hash predicates: l_partkey = tpch.partsupp.ps_partkey, l_suppkey = tpch.partsupp.ps_suppkey
 |  other join predicates: tpch.partsupp.ps_availqty > 0.5 * sum(l_quantity)
-|  runtime filters: RF001 <- tpch.partsupp.ps_partkey, RF002 <- tpch.partsupp.ps_suppkey
+|  runtime filters: RF002 <- tpch.partsupp.ps_partkey, RF003 <- tpch.partsupp.ps_suppkey
 |
 |--06:HASH JOIN [LEFT SEMI JOIN]
 |  |  hash predicates: tpch.partsupp.ps_partkey = tpch.part.p_partkey
-|  |  runtime filters: RF003 <- tpch.part.p_partkey
+|  |  runtime filters: RF006 <- tpch.part.p_partkey
 |  |
 |  |--03:SCAN HDFS [tpch.part]
 |  |     partitions=1/1 files=1 size=22.83MB
@@ -1341,7 +1341,7 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF000 -> tpch.partsupp.ps_suppkey, RF003 -> tpch.partsupp.ps_partkey
+|     runtime filters: RF000 -> tpch.partsupp.ps_suppkey, RF006 -> tpch.partsupp.ps_partkey
 |
 05:AGGREGATE [FINALIZE]
 |  output: sum(tpch.lineitem.l_quantity)
@@ -1350,7 +1350,7 @@ PLAN-ROOT SINK
 04:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: tpch.lineitem.l_shipdate < '1995-01-01', tpch.lineitem.l_shipdate >= '1994-01-01'
-   runtime filters: RF000 -> tpch.lineitem.l_suppkey, RF001 -> tpch.lineitem.l_partkey, RF002 -> tpch.lineitem.l_suppkey
+   runtime filters: RF000 -> tpch.lineitem.l_suppkey, RF002 -> tpch.lineitem.l_partkey, RF003 -> tpch.lineitem.l_suppkey
 ====
 # TPCH-Q21
 # Q21 - Suppliers Who Kept Orders Waiting Query
@@ -1415,7 +1415,7 @@ PLAN-ROOT SINK
 |  |
 |  |--08:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: tpch.supplier.s_nationkey = n_nationkey
-|  |  |  runtime filters: RF001 <- n_nationkey
+|  |  |  runtime filters: RF002 <- n_nationkey
 |  |  |
 |  |  |--03:SCAN HDFS [tpch.nation]
 |  |  |     partitions=1/1 files=1 size=2.15KB
@@ -1423,15 +1423,15 @@ PLAN-ROOT SINK
 |  |  |
 |  |  07:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: tpch.lineitem.l_suppkey = tpch.supplier.s_suppkey
-|  |  |  runtime filters: RF002 <- tpch.supplier.s_suppkey
+|  |  |  runtime filters: RF004 <- tpch.supplier.s_suppkey
 |  |  |
 |  |  |--00:SCAN HDFS [tpch.supplier]
 |  |  |     partitions=1/1 files=1 size=1.33MB
-|  |  |     runtime filters: RF001 -> tpch.supplier.s_nationkey
+|  |  |     runtime filters: RF002 -> tpch.supplier.s_nationkey
 |  |  |
 |  |  06:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: tpch.lineitem.l_orderkey = tpch.orders.o_orderkey
-|  |  |  runtime filters: RF003 <- tpch.orders.o_orderkey
+|  |  |  runtime filters: RF006 <- tpch.orders.o_orderkey
 |  |  |
 |  |  |--02:SCAN HDFS [tpch.orders]
 |  |  |     partitions=1/1 files=1 size=162.56MB
@@ -1440,7 +1440,7 @@ PLAN-ROOT SINK
 |  |  01:SCAN HDFS [tpch.lineitem]
 |  |     partitions=1/1 files=1 size=718.94MB
 |  |     predicates: tpch.lineitem.l_receiptdate > tpch.lineitem.l_commitdate
-|  |     runtime filters: RF002 -> tpch.lineitem.l_suppkey, RF003 -> tpch.lineitem.l_orderkey
+|  |     runtime filters: RF004 -> tpch.lineitem.l_suppkey, RF006 -> tpch.lineitem.l_orderkey
 |  |
 |  04:SCAN HDFS [tpch.lineitem]
 |     partitions=1/1 files=1 size=718.94MB

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/union.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/union.test b/testdata/workloads/functional-planner/queries/PlannerTest/union.test
index 03a2df5..4ebfb6b 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/union.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/union.test
@@ -3145,25 +3145,25 @@ PLAN-ROOT SINK
 |
 |--10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t2.bigint_col = t1.bigint_col
-|  |  runtime filters: RF002 <- t1.bigint_col
+|  |  runtime filters: RF004 <- t1.bigint_col
 |  |
 |  |--08:SCAN HDFS [functional.alltypestiny t1]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  09:SCAN HDFS [functional.alltypes t2]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF002 -> t2.bigint_col
+|     runtime filters: RF004 -> t2.bigint_col
 |
 |--07:HASH JOIN [RIGHT OUTER JOIN]
 |  |  hash predicates: t2.bigint_col = t1.bigint_col
-|  |  runtime filters: RF001 <- t1.bigint_col
+|  |  runtime filters: RF002 <- t1.bigint_col
 |  |
 |  |--05:SCAN HDFS [functional.alltypestiny t1]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  06:SCAN HDFS [functional.alltypes t2]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF001 -> t2.bigint_col
+|     runtime filters: RF002 -> t2.bigint_col
 |
 |--04:HASH JOIN [RIGHT SEMI JOIN]
 |  |  hash predicates: t2.bigint_col = t1.bigint_col

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/views.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/views.test b/testdata/workloads/functional-planner/queries/PlannerTest/views.test
index a6322e6..5caeab5 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/views.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/views.test
@@ -153,7 +153,7 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.id = b.id
-|  |  runtime filters: RF002 <- b.id
+|  |  runtime filters: RF004 <- b.id
 |  |
 |  |--03:SCAN HDFS [functional.alltypestiny b]
 |  |     partitions=4/4 files=4 size=460B
@@ -161,11 +161,11 @@ PLAN-ROOT SINK
 |  02:SCAN HDFS [functional.alltypesagg a]
 |     partitions=11/11 files=11 size=814.73KB
 |     predicates: a.bigint_col < 50
-|     runtime filters: RF002 -> a.id
+|     runtime filters: RF004 -> a.id
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: functional.alltypes.id = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--01:SCAN HDFS [functional.alltypes]
 |     partitions=24/24 files=24 size=478.45KB
@@ -175,7 +175,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
    predicates: functional.alltypes.id > 1
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.id
+   runtime filters: RF000 -> functional.alltypes.id, RF002 -> functional.alltypes.id
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -207,7 +207,7 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: a.id = b.id
-|  |  runtime filters: RF002 <- b.id
+|  |  runtime filters: RF004 <- b.id
 |  |
 |  |--11:EXCHANGE [BROADCAST]
 |  |  |
@@ -217,11 +217,11 @@ PLAN-ROOT SINK
 |  02:SCAN HDFS [functional.alltypesagg a]
 |     partitions=11/11 files=11 size=814.73KB
 |     predicates: a.bigint_col < 50
-|     runtime filters: RF002 -> a.id
+|     runtime filters: RF004 -> a.id
 |
 07:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: functional.alltypes.id = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--10:EXCHANGE [HASH(int_col)]
 |  |
@@ -235,7 +235,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
    predicates: functional.alltypes.id > 1
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.id
+   runtime filters: RF000 -> functional.alltypes.id, RF002 -> functional.alltypes.id
 ====
 # Self-join of view to make sure the on clause is properly set
 # in the cloned view instances.
@@ -254,7 +254,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: functional.alltypes.id = functional.alltypes.id
-|  runtime filters: RF001 <- functional.alltypes.id
+|  runtime filters: RF002 <- functional.alltypes.id
 |
 |--01:SCAN HDFS [functional.alltypes]
 |     partitions=24/24 files=24 size=478.45KB
@@ -262,7 +262,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.id
+   runtime filters: RF000 -> functional.alltypes.id, RF002 -> functional.alltypes.id
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -279,7 +279,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: functional.alltypes.id = functional.alltypes.id
-|  runtime filters: RF001 <- functional.alltypes.id
+|  runtime filters: RF002 <- functional.alltypes.id
 |
 |--06:EXCHANGE [HASH(functional.alltypes.id)]
 |  |
@@ -291,7 +291,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.id
+   runtime filters: RF000 -> functional.alltypes.id, RF002 -> functional.alltypes.id
 ====
 # Self-join views to make sure the using clause is properly set
 # in the cloned view instances.
@@ -310,7 +310,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: functional.alltypes.id = functional.alltypes.id
-|  runtime filters: RF001 <- functional.alltypes.id
+|  runtime filters: RF002 <- functional.alltypes.id
 |
 |--01:SCAN HDFS [functional.alltypes]
 |     partitions=24/24 files=24 size=478.45KB
@@ -318,7 +318,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.id
+   runtime filters: RF000 -> functional.alltypes.id, RF002 -> functional.alltypes.id
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -335,7 +335,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: functional.alltypes.id = functional.alltypes.id
-|  runtime filters: RF001 <- functional.alltypes.id
+|  runtime filters: RF002 <- functional.alltypes.id
 |
 |--06:EXCHANGE [HASH(functional.alltypes.id)]
 |  |
@@ -347,7 +347,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.id
+   runtime filters: RF000 -> functional.alltypes.id, RF002 -> functional.alltypes.id
 ====
 # Self-join of view to make sure the join op is properly set
 # in the cloned view instances.
@@ -416,7 +416,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: functional.alltypes.id = functional.alltypes.id
-|  runtime filters: RF001 <- functional.alltypes.id
+|  runtime filters: RF002 <- functional.alltypes.id
 |
 |--01:SCAN HDFS [functional.alltypes]
 |     partitions=24/24 files=24 size=478.45KB
@@ -424,7 +424,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.id
+   runtime filters: RF000 -> functional.alltypes.id, RF002 -> functional.alltypes.id
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -443,7 +443,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: functional.alltypes.id = functional.alltypes.id
-|  runtime filters: RF001 <- functional.alltypes.id
+|  runtime filters: RF002 <- functional.alltypes.id
 |
 |--05:EXCHANGE [BROADCAST]
 |  |
@@ -453,7 +453,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.id
+   runtime filters: RF000 -> functional.alltypes.id, RF002 -> functional.alltypes.id
 ====
 # Tests that parentheses are preserved when creating a view
 # enabling proper partition pruning for this particular view.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test b/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test
index 69105c7..0a966c4 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test
@@ -98,7 +98,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: int_col = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--01:SCAN HDFS [functional.alltypestiny]
 |     partitions=4/4 files=4 size=460B
@@ -106,7 +106,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> functional.alltypes.int_col, RF001 -> int_col
+   runtime filters: RF000 -> functional.alltypes.int_col, RF002 -> int_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -123,7 +123,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: int_col = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--05:EXCHANGE [BROADCAST]
 |  |
@@ -133,7 +133,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> functional.alltypes.int_col, RF001 -> int_col
+   runtime filters: RF000 -> functional.alltypes.int_col, RF002 -> int_col
 ====
 # Multiple dependent views in with-clause
 with t1 as (
@@ -226,7 +226,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: int_col = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--01:SCAN HDFS [functional.alltypestiny]
 |     partitions=4/4 files=4 size=460B
@@ -234,7 +234,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> functional.alltypestiny.int_col, RF001 -> int_col
+   runtime filters: RF000 -> functional.alltypestiny.int_col, RF002 -> int_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -251,7 +251,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: int_col = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--06:EXCHANGE [HASH(int_col)]
 |  |
@@ -263,7 +263,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> functional.alltypestiny.int_col, RF001 -> int_col
+   runtime filters: RF000 -> functional.alltypestiny.int_col, RF002 -> int_col
 ====
 # Self-join of with-clause table to make sure the using clause is properly set
 # in the cloned inline-view instances.
@@ -281,7 +281,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: int_col = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--01:SCAN HDFS [functional.alltypestiny]
 |     partitions=4/4 files=4 size=460B
@@ -289,7 +289,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> functional.alltypestiny.int_col, RF001 -> int_col
+   runtime filters: RF000 -> functional.alltypestiny.int_col, RF002 -> int_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -306,7 +306,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: int_col = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--06:EXCHANGE [HASH(int_col)]
 |  |
@@ -318,7 +318,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> functional.alltypestiny.int_col, RF001 -> int_col
+   runtime filters: RF000 -> functional.alltypestiny.int_col, RF002 -> int_col
 ====
 # Self-join of with-clause table to make sure the join op is properly set
 # in the cloned inline-view instances.
@@ -385,7 +385,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: int_col = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--01:SCAN HDFS [functional.alltypestiny]
 |     partitions=4/4 files=4 size=460B
@@ -393,7 +393,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> functional.alltypestiny.int_col, RF001 -> int_col
+   runtime filters: RF000 -> functional.alltypestiny.int_col, RF002 -> int_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -412,7 +412,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: int_col = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--05:EXCHANGE [BROADCAST]
 |  |
@@ -422,7 +422,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> functional.alltypestiny.int_col, RF001 -> int_col
+   runtime filters: RF000 -> functional.alltypestiny.int_col, RF002 -> int_col
 ====
 # Multiple with clauses. One for the UnionStmt and one for each union operand.
 with t1 as (values('a', 'b'))

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-query/queries/QueryTest/bloom_filters.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/bloom_filters.test b/testdata/workloads/functional-query/queries/QueryTest/bloom_filters.test
new file mode 100644
index 0000000..ace4596
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/bloom_filters.test
@@ -0,0 +1,126 @@
+====
+---- QUERY
+####################################################
+# Test case 1: bloom filters with high expected FP rate get disabled.
+# To trigger this path, we have to trick the planner into estimating a too-small
+# build-side cardinality, which will cause the BF size to be estimated low (and therefore
+# the FP rate to be high). We do this by using predicates that are completely unselective,
+# but which the planner thinks have relatively high selectivity.
+# Kudu doesn't support bloom filters, so it just doesn't filter anything.
+####################################################
+
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+SET RUNTIME_FILTER_MODE=GLOBAL;
+SET RUNTIME_FILTER_MAX_SIZE=4K;
+select STRAIGHT_JOIN count(*) from alltypes a
+    join [BROADCAST]
+    # Build-side needs to be sufficiently large to trigger FP check.
+    (select id, int_col from alltypes UNION ALL select id, int_col from alltypes) b
+        on a.id = b.id
+        # Predicates that are always true (but planner thinks are selective)
+        where (b.id - b.id) < 1 AND (b.int_col - b.int_col) < 1;
+---- RESULTS
+14600
+---- RUNTIME_PROFILE
+row_regex: .*0 of 1 Runtime Filter Published, 1 Disabled.*
+row_regex: .*Rows rejected: 0 .*
+====
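
The disabling decision above hinges on the expected false-positive rate: with RUNTIME_FILTER_MAX_SIZE=4K the filter has only 32,768 bits for roughly 14,600 build keys, so it would reject almost nothing. A minimal sketch of that estimate using the textbook Bloom-filter formula; the hash count and the disable threshold are assumptions for illustration, and Impala's real filters are split-block Bloom filters:

    #include <cmath>
    #include <iostream>

    // Textbook false-positive probability for a Bloom filter with m bits,
    // n inserted keys, and k hash functions: (1 - e^(-k*n/m))^k.
    double BloomFpRate(double m_bits, double n_keys, int k) {
      return std::pow(1.0 - std::exp(-k * n_keys / m_bits), k);
    }

    int main() {
      const double m_bits = 4 * 1024 * 8;  // RUNTIME_FILTER_MAX_SIZE=4K
      const double n_keys = 14600;         // alltypes UNION ALL alltypes
      const double fp = BloomFpRate(m_bits, n_keys, /*k=*/8);
      const double kMaxAllowedFpRate = 0.75;  // assumed cutoff, for illustration
      // A filter this dense costs CPU to probe but rejects almost nothing,
      // so the coordinator disables it rather than publishing it.
      std::cout << "fp=" << fp
                << (fp > kMaxAllowedFpRate ? " -> disable" : " -> keep") << "\n";
    }
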
+
+
+---- QUERY
+####################################################
+# Test case 2: Filter sizes change according to their NDV
+####################################################
+SET RUNTIME_FILTER_MODE=GLOBAL;
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+SET RUNTIME_FILTER_MIN_SIZE=4KB;
+with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
+select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
+    join (select * from l LIMIT 1) b on a.l_orderkey = -b.l_orderkey;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+row_regex: .*1 of 1 Runtime Filter Published.*
+row_regex: .*Filter 0 \(4.00 KB\).*
+====
+---- QUERY
+SET RUNTIME_FILTER_MODE=GLOBAL;
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+SET RUNTIME_FILTER_MIN_SIZE=4KB;
+with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
+select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
+    join (select * from l LIMIT 500000) b on a.l_orderkey = -b.l_orderkey;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+row_regex: .*1 of 1 Runtime Filter Published.*
+row_regex: .*Filter 0 \(256.00 KB\).*
+====
+---- QUERY
+SET RUNTIME_FILTER_MODE=GLOBAL;
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+SET RUNTIME_FILTER_MIN_SIZE=4KB;
+with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
+select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
+    join (select * from l LIMIT 1000000) b on a.l_orderkey = -b.l_orderkey;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+row_regex: .*1 of 1 Runtime Filter Published.*
+row_regex: .*Filter 0 \(512.00 KB\).*
+====
+---- QUERY
+SET RUNTIME_FILTER_MODE=GLOBAL;
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+SET RUNTIME_FILTER_MIN_SIZE=4KB;
+with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
+select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
+    join (select * from l LIMIT 2000000) b on a.l_orderkey = -b.l_orderkey;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+row_regex: .*1 of 1 Runtime Filter Published.*
+row_regex: .*Filter 0 \(1.00 MB\).*
+====
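
The 4 KB -> 256 KB -> 512 KB -> 1 MB progression above comes from sizing the filter to the build side's NDV estimate, rounding up to a power of two, and clamping to the min/max query options. A sketch of that calculation; the 0.25 target FP rate is an assumption picked to reproduce the sizes above, not Impala's actual constant:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Bytes needed for a Bloom filter over 'ndv' keys at 'target_fpp', per
    // the textbook m = n * ln(1/p) / (ln 2)^2, rounded up to a power of two
    // and clamped to the query-option bounds.
    int64_t FilterSizeBytes(int64_t ndv, double target_fpp,
                            int64_t min_bytes, int64_t max_bytes) {
      const double ln2 = std::log(2.0);
      const double bits = ndv * std::log(1.0 / target_fpp) / (ln2 * ln2);
      const int64_t bytes = static_cast<int64_t>(std::ceil(bits / 8.0));
      int64_t rounded = 1;
      while (rounded < bytes) rounded <<= 1;  // round up to a power of two
      return std::min(max_bytes, std::max(min_bytes, rounded));
    }

With target_fpp = 0.25, 500,000 keys need about 176 KB and round to 256 KB; 1M and 2M keys land on 512 KB and 1 MB; and the LIMIT 1 case clamps to the 4 KB minimum.
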
+
+
+
+---- QUERY
+####################################################
+# Test case 3: Filter sizes respect query options
+####################################################
+SET RUNTIME_FILTER_MODE=GLOBAL;
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+SET RUNTIME_FILTER_MIN_SIZE=8KB;
+SET RUNTIME_FILTER_MAX_SIZE=8KB;
+# This query would produce a 4KB filter without setting the minimum size.
+select STRAIGHT_JOIN count(*) from alltypes a join [SHUFFLE] alltypes b on a.id = b.id;
+---- RESULTS
+7300
+---- RUNTIME_PROFILE
+row_regex: .*1 of 1 Runtime Filter Published.*
+row_regex: .*Filter 0 \(8.00 KB\).*
+====
+---- QUERY
+# Check that filter sizes are rounded up to power-of-two
+SET RUNTIME_FILTER_MIN_SIZE=6000B;
+SET RUNTIME_FILTER_MAX_SIZE=6000B;
+select STRAIGHT_JOIN count(*) from alltypes a join [SHUFFLE] alltypes b on a.id = b.id;
+---- RESULTS
+7300
+---- RUNTIME_PROFILE
+row_regex: .*1 of 1 Runtime Filter Published.*
+row_regex: .*Filter 0 \(8.00 KB\).*
+====
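
The 6000 B case above isolates the rounding rule; the usual bit-smearing trick does it without a loop:

    #include <cstdint>

    // Round up to the next power of two, e.g. 6000 -> 8192 (the 8.00 KB above).
    uint64_t RoundUpToPowerOfTwo(uint64_t v) {
      v--;
      v |= v >> 1;  v |= v >> 2;  v |= v >> 4;
      v |= v >> 8;  v |= v >> 16; v |= v >> 32;
      return v + 1;
    }
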
+---- QUERY
+SET RUNTIME_FILTER_MODE=GLOBAL;
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+SET RUNTIME_FILTER_MAX_SIZE=8192;
+# Query would produce a 512KB filter without setting the max
+with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
+select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
+    join (select * from l LIMIT 1000000) b on a.l_orderkey = -b.l_orderkey;
+---- RUNTIME_PROFILE
+row_regex: .*0 of 1 Runtime Filter Published.*
+row_regex: .*Filter 0 \(8.00 KB\).*
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-query/queries/QueryTest/bloom_filters_wait.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/bloom_filters_wait.test b/testdata/workloads/functional-query/queries/QueryTest/bloom_filters_wait.test
new file mode 100644
index 0000000..1ed6668
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/bloom_filters_wait.test
@@ -0,0 +1,22 @@
+====
+---- QUERY
+####################################################
+# Regression test for IMPALA-3141: Disabled filters should send dummy filters
+# to unblock waiters.
+####################################################
+
+SET RUNTIME_FILTER_WAIT_TIME_MS=600000;
+SET RUNTIME_FILTER_MODE=GLOBAL;
+SET RUNTIME_FILTER_MAX_SIZE=4096;
+select STRAIGHT_JOIN count(*) from alltypes a
+    join [BROADCAST]
+    # Build-side needs to be sufficiently large to trigger FP check.
+    (select id, int_col from alltypes UNION ALL select id, int_col from alltypes) b
+        on a.id = b.id
+        # Predicates that are always true (but planner thinks are selective)
+        where (b.id - b.id) < 1 AND (b.int_col - b.int_col) < 1;
+---- RESULTS
+14600
+---- RUNTIME_PROFILE
+row_regex: .*0 of 1 Runtime Filter Published, 1 Disabled.*
+====
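
The test pins RUNTIME_FILTER_WAIT_TIME_MS at ten minutes so a regression would show up as a hang: scanners block until their filters arrive, so a disabled filter must still send something. A minimal sketch of the unblocking mechanism, with illustrative names rather than Impala's actual classes:

    #include <chrono>
    #include <condition_variable>
    #include <mutex>

    // When a filter is disabled, publish an always-true dummy instead of
    // staying silent, so waiting scanners wake immediately instead of
    // sleeping out the full wait time.
    struct FilterSlot {
      std::mutex mu;
      std::condition_variable cv;
      bool arrived = false;
      bool always_true = false;  // a dummy filter that rejects nothing

      void PublishDisabled() {  // coordinator side
        std::lock_guard<std::mutex> l(mu);
        always_true = true;
        arrived = true;
        cv.notify_all();  // unblock every waiting scanner
      }

      bool WaitForFilter(std::chrono::milliseconds timeout) {  // scanner side
        std::unique_lock<std::mutex> l(mu);
        return cv.wait_for(l, timeout, [this] { return arrived; });
      }
    };
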

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
index 1ade744..4c48ba4 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
@@ -22,7 +22,7 @@ from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
 '02:HASH JOIN [INNER JOIN, BROADCAST]'
 '|  hash predicates: l_orderkey = o_orderkey'
 '|  fk/pk conjuncts: l_orderkey = o_orderkey'
-'|  runtime filters: RF000 <- o_orderkey'
+'|  runtime filters: RF000[bloom] <- o_orderkey'
 '|  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB'
 '|  tuple-ids=0,1 row-size=454B cardinality=5757710'
 '|'
@@ -42,7 +42,7 @@ row_regex:.*partitions=1/1 files=1 size=.*
 '|'
 '00:SCAN HDFS [tpch.lineitem, RANDOM]'
 row_regex:.*partitions=1/1 files=1 size=.*
-'   runtime filters: RF000 -> l_orderkey'
+'   runtime filters: RF000[bloom] -> l_orderkey'
 '   stats-rows=6001215 extrapolated-rows=disabled'
 '   table stats: rows=6001215 size=718.94MB'
 '   column stats: all'

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
index 80c0d8f..b0b6595 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
@@ -24,7 +24,7 @@ from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
 '  02:HASH JOIN [INNER JOIN, BROADCAST]'
 '  |  hash predicates: l_orderkey = o_orderkey'
 '  |  fk/pk conjuncts: l_orderkey = o_orderkey'
-'  |  runtime filters: RF000 <- o_orderkey'
+'  |  runtime filters: RF000[bloom] <- o_orderkey'
 '  |  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB'
 '  |  tuple-ids=0,1 row-size=454B cardinality=5757710'
 '  |'
@@ -34,7 +34,7 @@ from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
 '  |'
 '  00:SCAN HDFS [tpch.lineitem, RANDOM]'
 row_regex:.*partitions=1/1 files=1 size=.*
-'     runtime filters: RF000 -> l_orderkey'
+'     runtime filters: RF000[bloom] -> l_orderkey'
 '     stats-rows=6001215 extrapolated-rows=disabled'
 '     table stats: rows=6001215 size=718.94MB'
 '     column stats: all'

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-query/queries/QueryTest/min_max_filters.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/min_max_filters.test b/testdata/workloads/functional-query/queries/QueryTest/min_max_filters.test
new file mode 100644
index 0000000..c87cc74
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/min_max_filters.test
@@ -0,0 +1,121 @@
+====
+---- QUERY
+####################################################
+# Test case 1: Min-max filters of all possible types.
+####################################################
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*) from alltypes a join [BROADCAST] alltypestiny b
+where a.bool_col = (b.bool_col && !b.bool_col)
+---- RESULTS
+29200
+====
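
This type sweep exercises the basic min-max mechanism: the build side tracks the least and greatest join key, and the scan skips values outside that range. In the first query the build key (b.bool_col && !b.bool_col) is always false, so the filter narrows to [false, false] and only probe rows with bool_col = false survive it, giving the 29,200 join results above. A minimal numeric sketch; illustrative, not Impala's actual MinMaxFilter API, and the string/timestamp variants covered below need their own comparators:

    #include <algorithm>
    #include <limits>

    template <typename T>
    class MinMaxFilter {
     public:
      // Build side: widen the range to cover each join key seen.
      void Insert(T v) {
        min_ = std::min(min_, v);
        max_ = std::max(max_, v);
      }
      // Probe side: a value outside [min, max] cannot match any build row.
      bool MayMatch(T v) const { return v >= min_ && v <= max_; }

     private:
      T min_ = std::numeric_limits<T>::max();
      T max_ = std::numeric_limits<T>::lowest();
    };
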
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*) from alltypes a join [BROADCAST] alltypestiny b
+where a.tinyint_col = b.tinyint_col
+---- RESULTS
+5840
+====
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*) from alltypes a join [BROADCAST] alltypestiny b
+where a.smallint_col = b.smallint_col
+---- RESULTS
+5840
+====
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*) from alltypes a join [BROADCAST] alltypestiny b
+where a.int_col = b.int_col
+---- RESULTS
+5840
+====
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*) from alltypes a join [BROADCAST] alltypestiny b
+where a.bigint_col = b.bigint_col
+---- RESULTS
+5840
+====
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*) from alltypes a join [BROADCAST] alltypestiny b
+where a.float_col = b.float_col
+---- RESULTS
+5840
+====
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*) from alltypes a join [BROADCAST] alltypestiny b
+where a.double_col = b.double_col
+---- RESULTS
+5840
+====
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*) from alltypes a join [BROADCAST] alltypestiny b
+where a.string_col = b.string_col
+---- RESULTS
+5840
+====
+---- QUERY
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*) from alltypes a join [BROADCAST] alltypestiny b
+where a.timestamp_col = b.timestamp_col
+---- RESULTS
+8
+====
+
+---- QUERY
+####################################################
+# Test case 2: Min-max filters on a primary key/partition column
+####################################################
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN a.id, a.tinyint_col, b.id, b.tinyint_col
+from alltypes a join [BROADCAST] alltypestiny b
+where a.id = b.tinyint_col * 2;
+---- RESULTS: VERIFY_IS_EQUAL_SORTED
+0,0,4,0
+0,0,2,0
+0,0,0,0
+0,0,6,0
+2,2,3,1
+2,2,7,1
+2,2,5,1
+2,2,1,1
+----TYPES
+INT,TINYINT,INT,TINYINT
+====
+
+
+---- QUERY
+####################################################
+# Test case 3: Target expr has an implicit integer cast
+####################################################
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*)
+from alltypes a join [BROADCAST] alltypes b
+where a.tinyint_col = b.int_col and b.int_col in (0, 1)
+---- RESULTS
+1065800
+====
+---- QUERY
+# The min/max values in the filter are both above the range of the target col so all rows
+# are filtered.
+SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
+select STRAIGHT_JOIN count(*)
+from alltypes a join [BROADCAST] alltypes b
+where a.tinyint_col = b.int_col + 10000
+---- RESULTS
+0
+====
+---- QUERY
+# The min/max values in the filter are below/above the range for the target col,
+# respectively, so no rows are filtered.
+select STRAIGHT_JOIN count(*)
+from alltypes a join [BROADCAST]
+  (values (min_int() x), (max_int()), (0)) v
+where a.tinyint_col = v.x
+---- RESULTS
+730
+====
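
The last three cases probe what happens when the filter's bounds are computed in a wider type than the target column. If both bounds fall outside the target type's range, every row can be skipped (the +10000 case returns 0); if the bounds cover the whole range, the filter rejects nothing (the min_int()/max_int() case still returns all 730 join matches). A sketch of that overlap check, illustrative rather than Impala's actual code:

    #include <cstdint>
    #include <limits>

    enum class Overlap { kNone, kPartial, kFull };

    // Compare an INT-valued filter range against a TINYINT target column.
    Overlap CheckIntFilterOnTinyint(int64_t filter_min, int64_t filter_max) {
      const int64_t lo = std::numeric_limits<int8_t>::min();  // -128
      const int64_t hi = std::numeric_limits<int8_t>::max();  // 127
      if (filter_max < lo || filter_min > hi) return Overlap::kNone;   // skip all rows
      if (filter_min <= lo && filter_max >= hi) return Overlap::kFull; // skip nothing
      return Overlap::kPartial;  // apply the bounds row by row
    }
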

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-query/queries/QueryTest/runtime_filters.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/runtime_filters.test b/testdata/workloads/functional-query/queries/QueryTest/runtime_filters.test
index 1bc9b12..e883cb4 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/runtime_filters.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/runtime_filters.test
@@ -18,6 +18,7 @@ row_regex: .*RowsRead: 2.43K .*
 ====
 ---- QUERY
 # Now turn on local filtering: we expect to see a reduction in scan volume.
+# TODO: improve the Kudu profile output once KUDU-2162 is fixed.
 SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
 SET RUNTIME_FILTER_MODE=LOCAL;
 select STRAIGHT_JOIN count(*) from alltypes p join [BROADCAST] alltypestiny b
@@ -26,6 +27,8 @@ on p.month = b.int_col and b.month = 1 and b.string_col = "1"
 620
 ---- RUNTIME_PROFILE
 row_regex: .*Files rejected: 7 \(7\).*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*RowsRead: 206 .*
 ====
 
 
@@ -57,6 +60,8 @@ on p.month = b.int_col and b.month = 1 and b.string_col = "1"
 620
 ---- RUNTIME_PROFILE
 row_regex: .*Files rejected: 7 \(7\).*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*RowsRead: 206 .*
 ====
 
 
@@ -81,6 +86,8 @@ select STRAIGHT_JOIN count(*) from alltypes a
 0
 ---- RUNTIME_PROFILE
 row_regex: .*Files rejected: 0 .*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*RowsRead: 2.43K .*
 ====
 ---- QUERY
 # Global mode. Scan of 'b' will receive highly effective filter, and will propagate that
@@ -95,6 +102,9 @@ select STRAIGHT_JOIN count(*) from alltypes a
 0
 ---- RUNTIME_PROFILE
 row_regex: .*Files rejected: 8 .*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*RowsRead: 0 .*
+row_regex: .*ScanRangesComplete: 0 .*
 ====
 
 
@@ -103,6 +113,7 @@ row_regex: .*Files rejected: 8 .*
 # Test case 4: complex filter expressions. The join predicate matches nothing, but
 # isn't simplified by the planner before execution.
 # With local filtering, expect 0 rows, as all are rejected by partition pruning.
+# For Kudu, the target is not a single column, so no filters will be created.
 ####################################################
 
 SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
@@ -115,6 +126,8 @@ select STRAIGHT_JOIN count(*) from alltypes a
 ---- RUNTIME_PROFILE
 row_regex: .*RowsRead: 0 .*
 row_regex: .*Files rejected: 8 .*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*RowsRead: 2.43K .*
 ====
 
 
@@ -146,6 +159,9 @@ select STRAIGHT_JOIN count(*) from alltypes a
 ---- RUNTIME_PROFILE
 row_regex: .*FiltersReceived: 0 .*
 row_regex: .*Files rejected: 8 .*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*FiltersReceived: 0 .*
+row_regex: .*RowsRead: 0 .*
 ====
 
 
@@ -183,6 +199,7 @@ row_regex: .*FiltersReceived: 3 .*
 ####################################################
 # Test case 7: filters with target exprs bound by > 1 slotref.
 # Expect all but one partition to be filtered out by join expr.
+# For Kudu, the target is not a single column, so no filters will be created.
 ####################################################
 
 SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
@@ -194,6 +211,8 @@ select STRAIGHT_JOIN count(*) from alltypes a
 2480
 ---- RUNTIME_PROFILE
 row_regex: .*Files rejected: 7 .*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*RowsRead: 2.43K .*
 ====
 
 
@@ -229,6 +248,8 @@ select STRAIGHT_JOIN count(*) from alltypes a
 8
 ---- RUNTIME_PROFILE
 row_regex: .*Files rejected: 8 .*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*RowsRead: 0 .*
 ====
 
 ---- QUERY
@@ -248,40 +269,16 @@ row_regex: .*RowsReturned: 2.43K .*
 ====
 
 
----- QUERY
-####################################################
-# Test case 11: filters with high expected FP rate get disabled.
-# To trigger this path, we have to trick the planner into estimating a too-small
-# build-side cardinality, which will cause the BF size to be estimated low (and therefore
-# the FP rate to be high). We do this by using predicates that are completely unselective,
-# but which the planner thinks have relatively high selectivity.
-####################################################
-
-SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
-SET RUNTIME_FILTER_MODE=GLOBAL;
-SET RUNTIME_FILTER_MAX_SIZE=4K;
-select STRAIGHT_JOIN count(*) from alltypes a
-    join [BROADCAST]
-    # Build-side needs to be sufficiently large to trigger FP check.
-    (select id, int_col from alltypes UNION ALL select id, int_col from alltypes) b
-        on a.id = b.id
-        # Predicates that are always true (but planner thinks are selective)
-        where (b.id - b.id) < 1 AND (b.int_col - b.int_col) < 1;
----- RESULTS
-14600
----- RUNTIME_PROFILE
-row_regex: .*0 of 1 Runtime Filter Published, 1 Disabled.*
-row_regex: .*Rows rejected: 0 .*
-====
 
 
 ---- QUERY
 ####################################################
-# Test case 12: join predicates with NULL values.
+# Test case 11: join predicates with NULL values.
 # Build-side selects one row from alltypes agg where day IS NULL, and joins with all rows
 # in probe side with day IS NULL.
 # Expect with filtering that 1K rows are returned, with an average of 333 per scan node
 # per fragment instance, and three files rejected per scan.
+# For Kudu, 'IS NOT DISTINCT FROM' predicates are not supported for runtime filters.
 ####################################################
 
 SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
@@ -294,12 +291,14 @@ select STRAIGHT_JOIN count(*) from alltypesagg a
 1000
 ---- RUNTIME_PROFILE
 row_regex: .*Files rejected: 3 .*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*RowsRead: 3.67K .*
 ====
 
 
 ---- QUERY
 ####################################################
-# Test case 13: coordinator fragment produces filters
+# Test case 12: coordinator fragment produces filters
 # In this esoteric query plan, the coordinator fragment has a hash
 # join in its root, which produces filters for the scan of t1.
 ####################################################
@@ -307,18 +306,20 @@ row_regex: .*Files rejected: 3 .*
 set RUNTIME_FILTER_WAIT_TIME_MS=30000;
 set RUNTIME_FILTER_MODE=GLOBAL;
 with t1 as (select month x, bigint_col y from alltypes limit 7300),
-     t2 as (select int_col x, bigint_col y from alltypestiny limit 2)
+     t2 as (select distinct int_col x, bigint_col y from alltypestiny limit 2)
      select count(*) from t1, t2 where t1.x = t2.x
 ---- RESULTS
 620
 ---- RUNTIME_PROFILE
 row_regex: .*Files rejected: 7 .*
+---- RUNTIME_PROFILE: table_format=kudu
+row_regex: .*RowsRead: 206 .*
 ====
 
 
 ---- QUERY
 ####################################################
-# Test case 14: When NUM_NODES=1, all filters should be local.
+# Test case 13: When NUM_NODES=1, all filters should be local.
 # Regression test for IMPALA-3245.
 ####################################################
 set NUM_NODES=1;
@@ -332,105 +333,7 @@ select STRAIGHT_JOIN count(a.id) from alltypes a
 
 ---- QUERY
 ####################################################
-# Test case 15: Filter sizes change according to their NDV
-####################################################
-SET RUNTIME_FILTER_MODE=GLOBAL;
-SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
-SET RUNTIME_FILTER_MIN_SIZE=4KB;
-with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
-select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
-    join (select * from l LIMIT 1) b on a.l_orderkey = -b.l_orderkey;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*1 of 1 Runtime Filter Published.*
-row_regex: .*Filter 0 \(4.00 KB\).*
-====
----- QUERY
-SET RUNTIME_FILTER_MODE=GLOBAL;
-SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
-SET RUNTIME_FILTER_MIN_SIZE=4KB;
-with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
-select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
-    join (select * from l LIMIT 500000) b on a.l_orderkey = -b.l_orderkey;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*1 of 1 Runtime Filter Published.*
-row_regex: .*Filter 0 \(256.00 KB\).*
-====
----- QUERY
-SET RUNTIME_FILTER_MODE=GLOBAL;
-SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
-SET RUNTIME_FILTER_MIN_SIZE=4KB;
-with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
-select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
-    join (select * from l LIMIT 1000000) b on a.l_orderkey = -b.l_orderkey;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*1 of 1 Runtime Filter Published.*
-row_regex: .*Filter 0 \(512.00 KB\).*
-====
----- QUERY
-SET RUNTIME_FILTER_MODE=GLOBAL;
-SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
-SET RUNTIME_FILTER_MIN_SIZE=4KB;
-with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
-select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
-    join (select * from l LIMIT 2000000) b on a.l_orderkey = -b.l_orderkey;
----- RESULTS
-0
----- RUNTIME_PROFILE
-row_regex: .*1 of 1 Runtime Filter Published.*
-row_regex: .*Filter 0 \(1.00 MB\).*
-====
-
-
----- QUERY
-####################################################
-# Test case 16: Filter sizes respect query options
-####################################################
-SET RUNTIME_FILTER_MODE=GLOBAL;
-SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
-SET RUNTIME_FILTER_MIN_SIZE=8KB;
-SET RUNTIME_FILTER_MAX_SIZE=8KB;
-# This query would produce a 4KB filter without setting the minimum size.
-select STRAIGHT_JOIN count(*) from alltypes a join [SHUFFLE] alltypes b on a.id = b.id;
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*1 of 1 Runtime Filter Published.*
-row_regex: .*Filter 0 \(8.00 KB\).*
-====
----- QUERY
-# Check that filter sizes are rounded up to power-of-two
-SET RUNTIME_FILTER_MIN_SIZE=6000B;
-SET RUNTIME_FILTER_MAX_SIZE=6000B;
-select STRAIGHT_JOIN count(*) from alltypes a join [SHUFFLE] alltypes b on a.id = b.id;
----- RESULTS
-7300
----- RUNTIME_PROFILE
-row_regex: .*1 of 1 Runtime Filter Published.*
-row_regex: .*Filter 0 \(8.00 KB\).*
-====
----- QUERY
-SET RUNTIME_FILTER_MODE=GLOBAL;
-SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
-SET RUNTIME_FILTER_MAX_SIZE=8192;
-# Query would produce a 512KB filter without setting the max
-with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
-select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
-    join (select * from l LIMIT 1000000) b on a.l_orderkey = -b.l_orderkey;
----- RUNTIME_PROFILE
-row_regex: .*0 of 1 Runtime Filter Published.*
-row_regex: .*Filter 0 \(8.00 KB\).*
-====
-
-
----- QUERY
-####################################################
-# Test case 17: Filter with two targers (one local, one remote)
+# Test case 14: Filter with two targets (one local, one remote)
 # In this three-way join the filter produced by the top-level
 # join has both a local and a remote target.
 ####################################################
@@ -447,7 +350,7 @@ from alltypes a join [BROADCAST] alltypessmall c
 
 ---- QUERY
 ####################################################
-# Test case 18: Runtime filter pushed to all union operands
+# Test case 15: Runtime filter pushed to all union operands
 ####################################################
 set RUNTIME_FILTER_WAIT_TIME_MS=30000;
 set RUNTIME_FILTER_MODE=GLOBAL;
@@ -460,3 +363,19 @@ where b.int_col = 1;
 ---- RESULTS
 14400
 ====
+
+
+---- QUERY
+####################################################
+# Test case 16: Both HDFS and Kudu targets results in both bloom and min-max filters
+# being generated
+####################################################
+set RUNTIME_FILTER_WAIT_TIME_MS=30000;
+set RUNTIME_FILTER_MODE=GLOBAL;
+select straight_join count(*)
+from functional_parquet.alltypes a join [BROADCAST] functional_kudu.alltypes b
+    join [BROADCAST] functional_parquet.alltypes c
+where a.int_col = b.int_col and a.int_col = c.smallint_col * 2 and c.id < 100
+---- RESULTS
+26645000
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-query/queries/QueryTest/runtime_filters_wait.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/runtime_filters_wait.test b/testdata/workloads/functional-query/queries/QueryTest/runtime_filters_wait.test
index 4743f3e..73ea1ae 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/runtime_filters_wait.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/runtime_filters_wait.test
@@ -14,26 +14,3 @@ on p.month = b.int_col and b.month = 1 and b.string_col = "1"
 ---- RESULTS
 620
 ====
-
-
----- QUERY
-####################################################
-# Regression test for IMPALA-3141: Disabled filters should send dummy filters
-# to unblock waiters.
-####################################################
-
-SET RUNTIME_FILTER_WAIT_TIME_MS=600000;
-SET RUNTIME_FILTER_MODE=GLOBAL;
-SET RUNTIME_FILTER_MAX_SIZE=4096;
-select STRAIGHT_JOIN count(*) from alltypes a
-    join [BROADCAST]
-    # Build-side needs to be sufficiently large to trigger FP check.
-    (select id, int_col from alltypes UNION ALL select id, int_col from alltypes) b
-        on a.id = b.id
-        # Predicates that are always true (but planner thinks are selective)
-        where (b.id - b.id) < 1 AND (b.int_col - b.int_col) < 1;
----- RESULTS
-14600
----- RUNTIME_PROFILE
-row_regex: .*0 of 1 Runtime Filter Published, 1 Disabled.*
-====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/targeted-perf/queries/primitive_min_max_runtime_filter.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/targeted-perf/queries/primitive_min_max_runtime_filter.test b/testdata/workloads/targeted-perf/queries/primitive_min_max_runtime_filter.test
new file mode 100644
index 0000000..ddfbc3d
--- /dev/null
+++ b/testdata/workloads/targeted-perf/queries/primitive_min_max_runtime_filter.test
@@ -0,0 +1,9 @@
+====
+---- QUERY: primitive_min_max_runtime_filter
+-- Description: a query that results in a highly selective min-max runtime filter. This
+-- will only show a perf improvement on Kudu, as min-max filters are not implemented
+-- for other scanners yet.
+select count(*)
+from lineitem a, lineitem b
+where a.l_orderkey = b.l_orderkey * 2 and b.l_orderkey = 5
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/tests/common/impala_test_suite.py
----------------------------------------------------------------------
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index 1581e9b..b35e054 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -431,7 +431,13 @@ class ImpalaTestSuite(BaseTestSuite):
         test_section['RESULTS'] = test_section['RESULTS'] \
             .replace(NAMENODE, '$NAMENODE') \
             .replace('$IMPALA_HOME', IMPALA_HOME)
-      if 'RUNTIME_PROFILE' in test_section:
+      if 'RUNTIME_PROFILE_%s' % table_format_info.file_format in test_section:
+        # If this table format has a RUNTIME_PROFILE section specifically for it, evaluate
+        # that section and ignore any general RUNTIME_PROFILE sections.
+        verify_runtime_profile(
+            test_section['RUNTIME_PROFILE_%s' % table_format_info.file_format],
+            result.runtime_profile)
+      elif 'RUNTIME_PROFILE' in test_section:
         verify_runtime_profile(test_section['RUNTIME_PROFILE'], result.runtime_profile)
 
       if 'DML_RESULTS' in test_section:

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/tests/query_test/test_kudu.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_kudu.py b/tests/query_test/test_kudu.py
index 27ee757..f8a761a 100644
--- a/tests/query_test/test_kudu.py
+++ b/tests/query_test/test_kudu.py
@@ -663,6 +663,12 @@ class TestCreateExternalTable(KuduTestSuite):
       assert cursor.fetchall() == [(4, )]
       cursor.execute("select * from %s order by kEY" % (table_name))
       assert cursor.fetchall() == [(1, ), (4, ), (5, )]
+
+      # Do a join with a runtime filter targeting the column.
+      cursor.execute("select count(*) from %s a, %s b where a.key = b.key" %
+          (table_name, table_name))
+      assert cursor.fetchall() == [(3, )]
+
       cursor.execute("alter table %s add range partition 11 < values < 20" % table_name)
 
       new_key = "KEY2"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/tests/query_test/test_runtime_filters.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_runtime_filters.py b/tests/query_test/test_runtime_filters.py
index 049560f..4d64d36 100644
--- a/tests/query_test/test_runtime_filters.py
+++ b/tests/query_test/test_runtime_filters.py
@@ -35,9 +35,9 @@ class TestRuntimeFilters(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     super(TestRuntimeFilters, cls).add_test_dimensions()
-    # Runtime filters are disabled on HBase, Kudu
+    # Runtime filters are disabled on HBase
     cls.ImpalaTestMatrix.add_constraint(
-      lambda v: v.get_value('table_format').file_format not in ['hbase', 'kudu'])
+        lambda v: v.get_value('table_format').file_format not in ['hbase'])
 
   def test_basic_filters(self, vector):
     self.run_test_case('QueryTest/runtime_filters', vector)
@@ -49,9 +49,11 @@ class TestRuntimeFilters(ImpalaTestSuite):
     self.run_test_case('QueryTest/runtime_filters_wait', vector)
     duration = time.time() - now
     assert duration < WAIT_TIME_MS, \
-      "Query took too long (%ss, possibly waiting for missing filters?)" % str(duration)
+        "Query took too long (%ss, possibly waiting for missing filters?)" % str(duration)
 
   def test_file_filtering(self, vector):
+    if 'kudu' in str(vector.get_value('table_format')):
+      return
     self.change_database(self.client, vector.get_value('table_format'))
     self.execute_query("SET RUNTIME_FILTER_MODE=GLOBAL")
     self.execute_query("SET RUNTIME_FILTER_WAIT_TIME_MS=10000")
@@ -62,6 +64,93 @@ class TestRuntimeFilters(ImpalaTestSuite):
     assert re.search("Splits rejected: [^0] \([^0]\)", result.runtime_profile) is None
 
 @SkipIfLocal.multiple_impalad
+class TestBloomFilters(ImpalaTestSuite):
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestBloomFilters, cls).add_test_dimensions()
+    # Bloom filters are disabled on HBase, Kudu
+    cls.ImpalaTestMatrix.add_constraint(
+        lambda v: v.get_value('table_format').file_format not in ['hbase', 'kudu'])
+
+  def test_bloom_filters(self, vector):
+    self.run_test_case('QueryTest/bloom_filters', vector)
+
+  def test_bloom_wait_time(self, vector):
+    """Test that a query that has global filters does not wait for them if run in LOCAL
+    mode"""
+    now = time.time()
+    self.run_test_case('QueryTest/bloom_filters_wait', vector)
+    duration = time.time() - now
+    assert duration < 60, \
+        "Query took too long (%ss, possibly waiting for missing filters?)" % str(duration)
+
+
+@SkipIfLocal.multiple_impalad
+class TestMinMaxFilters(ImpalaTestSuite):
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestMinMaxFilters, cls).add_test_dimensions()
+    # Min-max filters are only implemented for Kudu.
+    cls.ImpalaTestMatrix.add_constraint(
+        lambda v: v.get_value('table_format').file_format in ['kudu'])
+
+  def test_min_max_filters(self, vector):
+    self.run_test_case('QueryTest/min_max_filters', vector)
+
+  def test_large_strings(self, cursor, unique_database):
+    """Tests that truncation of large strings by min-max filters still gives correct
+    results"""
+    table1 = "%s.min_max_filter_large_strings1" % unique_database
+    cursor.execute(
+        "create table %s (string_col string primary key) stored as kudu" % table1)
+    # Min-max bounds are truncated at 1024 characters, so construct some strings that are
+    # longer than that, as well as some that are very close to the min/max bounds.
+    matching_vals = \
+        ('b' * 1100, 'b' * 1099 + 'c', 'd' * 1100, 'f' * 1099 + 'e', 'f' * 1100)
+    cursor.execute("insert into %s values ('%s'), ('%s'), ('%s'), ('%s'), ('%s')"
+        % ((table1,) + matching_vals))
+    non_matching_vals = ('b' * 1099 + 'a', 'c', 'f' * 1099 + 'g')
+    cursor.execute("insert into %s values ('%s'), ('%s'), ('%s')"
+        % ((table1,) + non_matching_vals))
+
+    table2 = "%s.min_max_filter_large_strings2" % unique_database
+    cursor.execute(
+        "create table %s (string_col string primary key) stored as kudu" % table2)
+    cursor.execute("insert into %s values ('%s'), ('%s'), ('%s'), ('%s'), ('%s')"
+        % ((table2,) + matching_vals))
+
+    cursor.execute("select count(*) from %s a, %s b where a.string_col = b.string_col"
+        % (table1, table2))
+    assert cursor.fetchall() == [(len(matching_vals),)]
+
+    # Insert a string that will have the max char (255) trailing after truncation, to
+    # test the path where adding 1 to the max bound after truncation overflows.
+    max_trail_str = "concat(repeat('h', 1000), repeat(chr(255), 50))"
+    cursor.execute("insert into %s values (%s)" % (table1, max_trail_str))
+    cursor.execute("insert into %s values (%s)" % (table2, max_trail_str))
+    cursor.execute("select count(*) from %s a, %s b where a.string_col = b.string_col"
+        % (table1, table2))
+    assert cursor.fetchall() == [(len(matching_vals) + 1,)]
+
+    # Insert a string that is entirely the max char to test the path where the max can't
+    # have 1 added to it after truncation and the filter is disabled.
+    all_max_str = "repeat(chr(255), 1030)"
+    cursor.execute("insert into %s values (%s)" % (table1, all_max_str))
+    cursor.execute("insert into %s values (%s)" % (table2, all_max_str))
+    cursor.execute("select count(*) from %s a, %s b where a.string_col = b.string_col"
+        % (table1, table2))
+    assert cursor.fetchall() == [(len(matching_vals) + 2,)]
+
+
+@SkipIfLocal.multiple_impalad
 class TestRuntimeRowFilters(ImpalaTestSuite):
   @classmethod
   def get_workload(cls):

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/tests/util/test_file_parser.py
----------------------------------------------------------------------
diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py
index 89149ab..9fa8d70 100644
--- a/tests/util/test_file_parser.py
+++ b/tests/util/test_file_parser.py
@@ -242,6 +242,23 @@ def parse_test_file_text(text, valid_section_names, skip_unknown_sections=True):
         parsed_sections['DML_RESULTS_TABLE'] = subsection_comment
         parsed_sections['VERIFIER'] = 'VERIFY_IS_EQUAL_SORTED'
 
+      # The RUNTIME_PROFILE section is used to specify lines of text that should be
+      # present in the query runtime profile. It takes an optional comment containing a
+      # table format. RUNTIME_PROFILE sections with a comment are only evaluated for the
+      # specified format. If there is a RUNTIME_PROFILE section without a comment, it is
+      # evaluated for all formats that don't have a commented section for this query.
+      if subsection_name == 'RUNTIME_PROFILE':
+        if subsection_comment is not None and subsection_comment != "":
+          allowed_formats = ['kudu']
+          if not subsection_comment.startswith("table_format="):
+            raise RuntimeError, 'RUNTIME_PROFILE comment (%s) must be of the form ' \
+              '"table_format=FORMAT"' % subsection_comment
+          table_format = subsection_comment[13:]
+          if table_format not in allowed_formats:
+            raise RuntimeError, 'RUNTIME_PROFILE table format (%s) must be in: %s' % \
+                (table_format, allowed_formats)
+          subsection_name = 'RUNTIME_PROFILE_%s' % table_format
+
       parsed_sections[subsection_name] = subsection_str
 
     if parsed_sections:


[16/16] incubator-impala git commit: IMPALA-4835 (prep only): create io subfolder and namespace

Posted by ta...@apache.org.
IMPALA-4835 (prep only): create io subfolder and namespace

Instead of using the DiskIoMgr class as a namespace, which prevents
forward declaration of its inner classes, create an impala::io namespace
and un-nest the inner classes.

This is done in anticipation of DiskIoMgr depending on BufferPool. It
avoids a circular dependency between the DiskIoMgr, TmpFileMgr and
BufferPool headers that could not otherwise be broken with forward
declarations.
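
As a minimal sketch of the problem this solves (illustrative only, not
part of the patch): C++ has no syntax to forward-declare a class nested
inside another class, so every header that mentioned DiskIoMgr::ScanRange
had to include the full disk-io-mgr.h. With ScanRange at namespace scope,
a forward declaration suffices wherever only pointers or references are
used:

  // class DiskIoMgr::ScanRange;   // before: does not compile
  #include <vector>

  namespace impala {
  namespace io {
  class ScanRange;                 // after: forward declaration works
  }

  struct HdfsFileDesc {
    // Pointer members need only the declaration above, not the io headers.
    std::vector<io::ScanRange*> splits;
  };
  }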

Testing:
Ran core tests.

Change-Id: If807f93a47d8027a43e56dd80b1b535d0bb74e1b
Reviewed-on: http://gerrit.cloudera.org:8080/8424
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b840137c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b840137c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b840137c

Branch: refs/heads/master
Commit: b840137c940d71af5cec2daf482b523a38b6a9f1
Parents: 2510fe0
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Mon Oct 30 16:34:47 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Nov 17 22:47:34 2017 +0000

----------------------------------------------------------------------
 be/CMakeLists.txt                               |    2 +
 be/src/exec/base-sequence-scanner.cc            |    9 +-
 be/src/exec/hdfs-parquet-scanner.cc             |   36 +-
 be/src/exec/hdfs-parquet-scanner.h              |    4 +-
 be/src/exec/hdfs-scan-node-base.cc              |   25 +-
 be/src/exec/hdfs-scan-node-base.h               |   26 +-
 be/src/exec/hdfs-scan-node-mt.h                 |    2 +-
 be/src/exec/hdfs-scan-node.cc                   |    7 +-
 be/src/exec/hdfs-scan-node.h                    |    6 +-
 be/src/exec/hdfs-text-scanner.cc                |    9 +-
 be/src/exec/kudu-scan-node.cc                   |    2 +-
 be/src/exec/scanner-context.cc                  |   12 +-
 be/src/exec/scanner-context.h                   |   14 +-
 be/src/runtime/CMakeLists.txt                   |   10 +-
 be/src/runtime/disk-io-mgr-handle-cache.h       |  196 ---
 .../runtime/disk-io-mgr-handle-cache.inline.h   |  231 ----
 be/src/runtime/disk-io-mgr-internal.h           |   76 --
 be/src/runtime/disk-io-mgr-reader-context.cc    |  292 -----
 be/src/runtime/disk-io-mgr-reader-context.h     |  406 ------
 be/src/runtime/disk-io-mgr-scan-range.cc        |  591 ---------
 be/src/runtime/disk-io-mgr-stress-test.cc       |   60 -
 be/src/runtime/disk-io-mgr-stress.cc            |  246 ----
 be/src/runtime/disk-io-mgr-stress.h             |   94 --
 be/src/runtime/disk-io-mgr-test.cc              | 1127 -----------------
 be/src/runtime/disk-io-mgr.cc                   | 1190 -----------------
 be/src/runtime/disk-io-mgr.h                    |  972 --------------
 be/src/runtime/exec-env.cc                      |    4 +-
 be/src/runtime/exec-env.h                       |    9 +-
 be/src/runtime/io/CMakeLists.txt                |   36 +
 be/src/runtime/io/disk-io-mgr-internal.h        |   78 ++
 be/src/runtime/io/disk-io-mgr-stress-test.cc    |   61 +
 be/src/runtime/io/disk-io-mgr-stress.cc         |  247 ++++
 be/src/runtime/io/disk-io-mgr-stress.h          |   95 ++
 be/src/runtime/io/disk-io-mgr-test.cc           | 1129 +++++++++++++++++
 be/src/runtime/io/disk-io-mgr.cc                | 1191 ++++++++++++++++++
 be/src/runtime/io/disk-io-mgr.h                 |  550 ++++++++
 be/src/runtime/io/handle-cache.h                |  197 +++
 be/src/runtime/io/handle-cache.inline.h         |  232 ++++
 be/src/runtime/io/request-context.cc            |  293 +++++
 be/src/runtime/io/request-context.h             |  403 ++++++
 be/src/runtime/io/request-ranges.h              |  471 +++++++
 be/src/runtime/io/scan-range.cc                 |  593 +++++++++
 be/src/runtime/row-batch.h                      |    2 +-
 be/src/runtime/runtime-state.cc                 |    2 +-
 be/src/runtime/runtime-state.h                  |    7 +-
 be/src/runtime/test-env.h                       |    2 +-
 be/src/runtime/tmp-file-mgr-test.cc             |   10 +-
 be/src/runtime/tmp-file-mgr.cc                  |   20 +-
 be/src/runtime/tmp-file-mgr.h                   |   20 +-
 49 files changed, 5702 insertions(+), 5595 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index bf7aa26..163567a 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -361,6 +361,7 @@ set (IMPALA_LINK_LIBS
   GlobalFlags
   histogram_proto
   ImpalaThrift
+  Io
   kudu_util
   krpc
   Rpc
@@ -386,6 +387,7 @@ set (IMPALA_LINK_LIBS
 if (BUILD_SHARED_LIBS)
   set (IMPALA_LINK_LIBS ${IMPALA_LINK_LIBS}
     BufferPool
+    Io
     Runtime
     Exec
     CodeGen

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/base-sequence-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/base-sequence-scanner.cc b/be/src/exec/base-sequence-scanner.cc
index fcf58c6..7f20e31 100644
--- a/be/src/exec/base-sequence-scanner.cc
+++ b/be/src/exec/base-sequence-scanner.cc
@@ -32,6 +32,7 @@
 #include "common/names.h"
 
 using namespace impala;
+using namespace impala::io;
 
 const int BaseSequenceScanner::HEADER_SIZE = 1024;
 const int BaseSequenceScanner::SYNC_MARKER = -1;
@@ -48,7 +49,7 @@ Status BaseSequenceScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
   // Issue just the header range for each file.  When the header is complete,
   // we'll issue the splits for that file.  Splits cannot be processed until the
   // header is parsed (the header object is then shared across splits for that file).
-  vector<DiskIoMgr::ScanRange*> header_ranges;
+  vector<ScanRange*> header_ranges;
   for (int i = 0; i < files.size(); ++i) {
     ScanRangeMetadata* metadata =
         static_cast<ScanRangeMetadata*>(files[i]->splits[0]->meta_data());
@@ -57,9 +58,9 @@ Status BaseSequenceScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
     // it is not cached.
     // TODO: add remote disk id and plumb that through to the io mgr.  It should have
     // 1 queue for each NIC as well?
-    DiskIoMgr::ScanRange* header_range = scan_node->AllocateScanRange(files[i]->fs,
+    ScanRange* header_range = scan_node->AllocateScanRange(files[i]->fs,
         files[i]->filename.c_str(), header_size, 0, metadata->partition_id, -1, false,
-        DiskIoMgr::BufferOpts::Uncached());
+        BufferOpts::Uncached());
     header_ranges.push_back(header_range);
   }
   // Issue the header ranges only. GetNextInternal() will issue the files' scan ranges
@@ -310,7 +311,7 @@ void BaseSequenceScanner::CloseFileRanges(const char* filename) {
   DCHECK(only_parsing_header_);
   HdfsFileDesc* desc = scan_node_->GetFileDesc(
       context_->partition_descriptor()->id(), filename);
-  const vector<DiskIoMgr::ScanRange*>& splits = desc->splits;
+  const vector<ScanRange*>& splits = desc->splits;
   for (int i = 0; i < splits.size(); ++i) {
     COUNTER_ADD(bytes_skipped_counter_, splits[i]->len());
     scan_node_->RangeComplete(file_format(), THdfsCompression::NONE);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 7fae959..f407877 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -27,6 +27,7 @@
 #include "exec/parquet-column-stats.h"
 #include "exec/scanner-context.inline.h"
 #include "runtime/collection-value-builder.h"
+#include "runtime/io/disk-io-mgr.h"
 #include "runtime/runtime-state.h"
 #include "runtime/runtime-filter.inline.h"
 #include "rpc/thrift-util.h"
@@ -35,6 +36,7 @@
 
 using std::move;
 using namespace impala;
+using namespace impala::io;
 
 DEFINE_double(parquet_min_filter_reject_ratio, 0.1, "(Advanced) If the percentage of "
     "rows rejected by a runtime filter drops below this value, the filter is disabled.");
@@ -67,7 +69,7 @@ const string PARQUET_MEM_LIMIT_EXCEEDED =
 
 Status HdfsParquetScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
     const std::vector<HdfsFileDesc*>& files) {
-  vector<DiskIoMgr::ScanRange*> footer_ranges;
+  vector<ScanRange*> footer_ranges;
   for (int i = 0; i < files.size(); ++i) {
     // If the file size is less than 12 bytes, it is an invalid Parquet file.
     if (files[i]->file_length < 12) {
@@ -80,10 +82,10 @@ Status HdfsParquetScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
     DCHECK_GE(footer_start, 0);
 
     // Try to find the split with the footer.
-    DiskIoMgr::ScanRange* footer_split = FindFooterSplit(files[i]);
+    ScanRange* footer_split = FindFooterSplit(files[i]);
 
     for (int j = 0; j < files[i]->splits.size(); ++j) {
-      DiskIoMgr::ScanRange* split = files[i]->splits[j];
+      ScanRange* split = files[i]->splits[j];
 
       DCHECK_LE(split->offset() + split->len(), files[i]->file_length);
       // If there are no materialized slots (such as count(*) over the table), we can
@@ -98,19 +100,19 @@ Status HdfsParquetScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
         // is done here, followed by scan ranges for the columns of each row group within
         // the actual split (in InitColumns()). The original split is stored in the
         // metadata associated with the footer range.
-        DiskIoMgr::ScanRange* footer_range;
+        ScanRange* footer_range;
         if (footer_split != NULL) {
           footer_range = scan_node->AllocateScanRange(files[i]->fs,
               files[i]->filename.c_str(), footer_size, footer_start,
               split_metadata->partition_id, footer_split->disk_id(),
               footer_split->expected_local(),
-              DiskIoMgr::BufferOpts(footer_split->try_cache(), files[i]->mtime), split);
+              BufferOpts(footer_split->try_cache(), files[i]->mtime), split);
         } else {
           // If we did not find the last split, we know it is going to be a remote read.
           footer_range =
               scan_node->AllocateScanRange(files[i]->fs, files[i]->filename.c_str(),
                   footer_size, footer_start, split_metadata->partition_id, -1, false,
-                  DiskIoMgr::BufferOpts::Uncached(), split);
+                  BufferOpts::Uncached(), split);
         }
 
         footer_ranges.push_back(footer_range);
@@ -125,10 +127,10 @@ Status HdfsParquetScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
   return Status::OK();
 }
 
-DiskIoMgr::ScanRange* HdfsParquetScanner::FindFooterSplit(HdfsFileDesc* file) {
+ScanRange* HdfsParquetScanner::FindFooterSplit(HdfsFileDesc* file) {
   DCHECK(file != NULL);
   for (int i = 0; i < file->splits.size(); ++i) {
-    DiskIoMgr::ScanRange* split = file->splits[i];
+    ScanRange* split = file->splits[i];
     if (split->offset() + split->len() == file->file_length) return split;
   }
   return NULL;
@@ -341,7 +343,7 @@ static int64_t GetRowGroupMidOffset(const parquet::RowGroup& row_group) {
 
 // Returns true if 'row_group' overlaps with 'split_range'.
 static bool CheckRowGroupOverlapsSplit(const parquet::RowGroup& row_group,
-    const DiskIoMgr::ScanRange* split_range) {
+    const ScanRange* split_range) {
   int64_t row_group_start = GetColumnStartOffset(row_group.columns[0].meta_data);
 
   const parquet::ColumnMetaData& last_column =
@@ -598,7 +600,7 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
 }
 
 Status HdfsParquetScanner::NextRowGroup() {
-  const DiskIoMgr::ScanRange* split_range = static_cast<ScanRangeMetadata*>(
+  const ScanRange* split_range = static_cast<ScanRangeMetadata*>(
       metadata_range_->meta_data())->original_split;
   int64_t split_offset = split_range->offset();
   int64_t split_length = split_range->len();
@@ -1377,12 +1379,12 @@ Status HdfsParquetScanner::ProcessFooter() {
     DiskIoMgr* io_mgr = scan_node_->runtime_state()->io_mgr();
 
     // Read the header into the metadata buffer.
-    DiskIoMgr::ScanRange* metadata_range = scan_node_->AllocateScanRange(
+    ScanRange* metadata_range = scan_node_->AllocateScanRange(
         metadata_range_->fs(), filename(), metadata_size, metadata_start, partition_id,
         metadata_range_->disk_id(), metadata_range_->expected_local(),
-        DiskIoMgr::BufferOpts::ReadInto(metadata_buffer.buffer(), metadata_size));
+        BufferOpts::ReadInto(metadata_buffer.buffer(), metadata_size));
 
-    unique_ptr<DiskIoMgr::BufferDescriptor> io_buffer;
+    unique_ptr<BufferDescriptor> io_buffer;
     RETURN_IF_ERROR(
         io_mgr->Read(scan_node_->reader_context(), metadata_range, &io_buffer));
     DCHECK_EQ(io_buffer->buffer(), metadata_buffer.buffer());
@@ -1589,7 +1591,7 @@ Status HdfsParquetScanner::InitColumns(
   parquet::RowGroup& row_group = file_metadata_.row_groups[row_group_idx];
 
   // All the scan ranges (one for each column).
-  vector<DiskIoMgr::ScanRange*> col_ranges;
+  vector<ScanRange*> col_ranges;
   // Used to validate that the number of values in each reader in column_readers_ is the
   // same.
   int num_values = -1;
@@ -1656,17 +1658,17 @@ Status HdfsParquetScanner::InitColumns(
           "filename '$1'", col_chunk.file_path, filename()));
     }
 
-    const DiskIoMgr::ScanRange* split_range =
+    const ScanRange* split_range =
         static_cast<ScanRangeMetadata*>(metadata_range_->meta_data())->original_split;
 
     // Determine if the column is completely contained within a local split.
     bool col_range_local = split_range->expected_local()
         && col_start >= split_range->offset()
         && col_end <= split_range->offset() + split_range->len();
-    DiskIoMgr::ScanRange* col_range = scan_node_->AllocateScanRange(metadata_range_->fs(),
+    ScanRange* col_range = scan_node_->AllocateScanRange(metadata_range_->fs(),
         filename(), col_len, col_start, partition_id, split_range->disk_id(),
         col_range_local,
-        DiskIoMgr::BufferOpts(split_range->try_cache(), file_desc->mtime));
+        BufferOpts(split_range->try_cache(), file_desc->mtime));
     col_ranges.push_back(col_range);
 
     // Get the stream that will be used for this column

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/hdfs-parquet-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.h b/be/src/exec/hdfs-parquet-scanner.h
index e4b6ae7..0eea458 100644
--- a/be/src/exec/hdfs-parquet-scanner.h
+++ b/be/src/exec/hdfs-parquet-scanner.h
@@ -442,7 +442,7 @@ class HdfsParquetScanner : public HdfsScanner {
   ParquetFileVersion file_version_;
 
   /// Scan range for the metadata.
-  const DiskIoMgr::ScanRange* metadata_range_;
+  const io::ScanRange* metadata_range_;
 
   /// Pool to copy dictionary page buffer into. This pool is shared across all the
   /// pages in a column chunk.
@@ -585,7 +585,7 @@ class HdfsParquetScanner : public HdfsScanner {
 
   /// Find and return the last split in the file if it is assigned to this scan node.
   /// Returns NULL otherwise.
-  static DiskIoMgr::ScanRange* FindFooterSplit(HdfsFileDesc* file);
+  static io::ScanRange* FindFooterSplit(HdfsFileDesc* file);
 
   /// Process the file footer and parse file_metadata_.  This should be called with the
   /// last FOOTER_SIZE bytes in context_.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/hdfs-scan-node-base.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node-base.cc b/be/src/exec/hdfs-scan-node-base.cc
index 9149097..62dbd6a 100644
--- a/be/src/exec/hdfs-scan-node-base.cc
+++ b/be/src/exec/hdfs-scan-node-base.cc
@@ -32,11 +32,12 @@
 #include "codegen/llvm-codegen.h"
 #include "common/logging.h"
 #include "common/object-pool.h"
-#include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
+#include "exprs/scalar-expr.h"
 #include "runtime/descriptors.h"
-#include "runtime/disk-io-mgr-reader-context.h"
 #include "runtime/hdfs-fs-cache.h"
+#include "runtime/io/disk-io-mgr.h"
+#include "runtime/io/request-context.h"
 #include "runtime/runtime-filter.inline.h"
 #include "runtime/runtime-state.h"
 #include "util/disk-info.h"
@@ -54,6 +55,7 @@ DECLARE_bool(skip_file_runtime_filtering);
 
 namespace filesystem = boost::filesystem;
 using namespace impala;
+using namespace impala::io;
 using namespace strings;
 
 const string HdfsScanNodeBase::HDFS_SPLIT_STATS_DESC =
@@ -236,7 +238,7 @@ Status HdfsScanNodeBase::Prepare(RuntimeState* state) {
     file_desc->splits.push_back(
         AllocateScanRange(file_desc->fs, file_desc->filename.c_str(), split.length,
             split.offset, split.partition_id, params.volume_id, expected_local,
-            DiskIoMgr::BufferOpts(try_cache, file_desc->mtime)));
+            BufferOpts(try_cache, file_desc->mtime)));
   }
 
   // Update server wide metrics for number of scan ranges and ranges that have
@@ -485,10 +487,10 @@ bool HdfsScanNodeBase::FilePassesFilterPredicates(
   return true;
 }
 
-DiskIoMgr::ScanRange* HdfsScanNodeBase::AllocateScanRange(hdfsFS fs, const char* file,
+ScanRange* HdfsScanNodeBase::AllocateScanRange(hdfsFS fs, const char* file,
     int64_t len, int64_t offset, int64_t partition_id, int disk_id, bool expected_local,
-    const DiskIoMgr::BufferOpts& buffer_opts,
-    const DiskIoMgr::ScanRange* original_split) {
+    const BufferOpts& buffer_opts,
+    const ScanRange* original_split) {
   DCHECK_GE(disk_id, -1);
   // Require that the scan range is within [0, file_length). While this cannot be used
   // to guarantee safety (file_length metadata may be stale), it avoids different
@@ -502,21 +504,20 @@ DiskIoMgr::ScanRange* HdfsScanNodeBase::AllocateScanRange(hdfsFS fs, const char*
 
   ScanRangeMetadata* metadata = runtime_state_->obj_pool()->Add(
         new ScanRangeMetadata(partition_id, original_split));
-  DiskIoMgr::ScanRange* range =
-      runtime_state_->obj_pool()->Add(new DiskIoMgr::ScanRange());
+  ScanRange* range = runtime_state_->obj_pool()->Add(new ScanRange);
   range->Reset(fs, file, len, offset, disk_id, expected_local, buffer_opts, metadata);
   return range;
 }
 
-DiskIoMgr::ScanRange* HdfsScanNodeBase::AllocateScanRange(hdfsFS fs, const char* file,
+ScanRange* HdfsScanNodeBase::AllocateScanRange(hdfsFS fs, const char* file,
     int64_t len, int64_t offset, int64_t partition_id, int disk_id, bool try_cache,
-    bool expected_local, int mtime, const DiskIoMgr::ScanRange* original_split) {
+    bool expected_local, int mtime, const ScanRange* original_split) {
   return AllocateScanRange(fs, file, len, offset, partition_id, disk_id, expected_local,
-      DiskIoMgr::BufferOpts(try_cache, mtime), original_split);
+      BufferOpts(try_cache, mtime), original_split);
 }
 
 Status HdfsScanNodeBase::AddDiskIoRanges(
-    const vector<DiskIoMgr::ScanRange*>& ranges, int num_files_queued) {
+    const vector<ScanRange*>& ranges, int num_files_queued) {
   RETURN_IF_ERROR(runtime_state_->io_mgr()->AddScanRanges(reader_context_.get(), ranges));
   num_unqueued_files_.Add(-num_files_queued);
   DCHECK_GE(num_unqueued_files_.Load(), 0);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/hdfs-scan-node-base.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node-base.h b/be/src/exec/hdfs-scan-node-base.h
index e6b2154..923b50a 100644
--- a/be/src/exec/hdfs-scan-node-base.h
+++ b/be/src/exec/hdfs-scan-node-base.h
@@ -31,11 +31,11 @@
 #include "exec/filter-context.h"
 #include "exec/scan-node.h"
 #include "runtime/descriptors.h"
-#include "runtime/disk-io-mgr.h"
+#include "runtime/io/request-ranges.h"
 #include "util/avro-util.h"
+#include "util/container-util.h"
 #include "util/progress-updater.h"
 #include "util/spinlock.h"
-#include "util/container-util.h"
 
 namespace impala {
 
@@ -72,7 +72,7 @@ struct HdfsFileDesc {
   THdfsCompression::type file_compression;
 
   /// Splits (i.e. raw byte ranges) for this file, assigned to this scan node.
-  std::vector<DiskIoMgr::ScanRange*> splits;
+  std::vector<io::ScanRange*> splits;
 };
 
 /// Struct for additional metadata for scan ranges. This contains the partition id
@@ -84,9 +84,9 @@ struct ScanRangeMetadata {
   /// For parquet scan ranges we initially create a request for the file footer for each
   /// split; we store a pointer to the actual split so that we can recover its information
   /// for the scanner to process.
-  const DiskIoMgr::ScanRange* original_split;
+  const io::ScanRange* original_split;
 
-  ScanRangeMetadata(int64_t partition_id, const DiskIoMgr::ScanRange* original_split)
+  ScanRangeMetadata(int64_t partition_id, const io::ScanRange* original_split)
       : partition_id(partition_id), original_split(original_split) { }
 };
 
@@ -154,7 +154,7 @@ class HdfsScanNodeBase : public ScanNode {
   const HdfsTableDescriptor* hdfs_table() const { return hdfs_table_; }
   const AvroSchemaElement& avro_schema() const { return *avro_schema_.get(); }
   int skip_header_line_count() const { return skip_header_line_count_; }
-  DiskIoRequestContext* reader_context() const { return reader_context_.get(); }
+  io::RequestContext* reader_context() const { return reader_context_.get(); }
   bool optimize_parquet_count_star() const { return optimize_parquet_count_star_; }
   int parquet_count_star_slot_offset() const { return parquet_count_star_slot_offset_; }
 
@@ -204,22 +204,22 @@ class HdfsScanNodeBase : public ScanNode {
   /// If not NULL, the 'original_split' pointer is stored for reference in the scan range
   /// metadata of the scan range that is to be allocated.
   /// This is thread safe.
-  DiskIoMgr::ScanRange* AllocateScanRange(hdfsFS fs, const char* file, int64_t len,
+  io::ScanRange* AllocateScanRange(hdfsFS fs, const char* file, int64_t len,
       int64_t offset, int64_t partition_id, int disk_id, bool expected_local,
-      const DiskIoMgr::BufferOpts& buffer_opts,
-      const DiskIoMgr::ScanRange* original_split = NULL);
+      const io::BufferOpts& buffer_opts,
+      const io::ScanRange* original_split = NULL);
 
   /// Old API for compatibility with text scanners (e.g. LZO text scanner).
-  DiskIoMgr::ScanRange* AllocateScanRange(hdfsFS fs, const char* file, int64_t len,
+  io::ScanRange* AllocateScanRange(hdfsFS fs, const char* file, int64_t len,
       int64_t offset, int64_t partition_id, int disk_id, bool try_cache,
-      bool expected_local, int mtime, const DiskIoMgr::ScanRange* original_split = NULL);
+      bool expected_local, int mtime, const io::ScanRange* original_split = NULL);
 
   /// Adds ranges to the io mgr queue. 'num_files_queued' indicates how many file's scan
   /// ranges have been added completely.  A file's scan ranges are added completely if no
   /// new scanner threads will be needed to process that file besides the additional
   /// threads needed to process those in 'ranges'.
   /// Can be overridden to add scan-node specific actions like starting scanner threads.
-  virtual Status AddDiskIoRanges(const std::vector<DiskIoMgr::ScanRange*>& ranges,
+  virtual Status AddDiskIoRanges(const std::vector<io::ScanRange*>& ranges,
       int num_files_queued) WARN_UNUSED_RESULT;
 
   /// Adds all splits for file_desc to the io mgr queue and indicates one file has
@@ -336,7 +336,7 @@ class HdfsScanNodeBase : public ScanNode {
   const int parquet_count_star_slot_offset_;
 
   /// RequestContext object to use with the disk-io-mgr for reads.
-  std::unique_ptr<DiskIoRequestContext> reader_context_;
+  std::unique_ptr<io::RequestContext> reader_context_;
 
   /// Descriptor for tuples this scan node constructs
   const TupleDescriptor* tuple_desc_ = nullptr;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/hdfs-scan-node-mt.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node-mt.h b/be/src/exec/hdfs-scan-node-mt.h
index 4ce12fe..3502b18 100644
--- a/be/src/exec/hdfs-scan-node-mt.h
+++ b/be/src/exec/hdfs-scan-node-mt.h
@@ -50,7 +50,7 @@ class HdfsScanNodeMt : public HdfsScanNodeBase {
 
  private:
   /// Current scan range and corresponding scanner.
-  DiskIoMgr::ScanRange* scan_range_;
+  io::ScanRange* scan_range_;
   boost::scoped_ptr<ScannerContext> scanner_ctx_;
   boost::scoped_ptr<HdfsScanner> scanner_;
 };

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/hdfs-scan-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node.cc b/be/src/exec/hdfs-scan-node.cc
index 78f2ffa..2d58c05 100644
--- a/be/src/exec/hdfs-scan-node.cc
+++ b/be/src/exec/hdfs-scan-node.cc
@@ -43,6 +43,7 @@ DECLARE_bool(skip_file_runtime_filtering);
 #endif
 
 using namespace impala;
+using namespace impala::io;
 
 // Amount of memory that we approximate a scanner thread will use not including IoBuffers.
 // The memory used does not vary considerably between file formats (just a couple of MBs).
@@ -251,7 +252,7 @@ void HdfsScanNode::AddMaterializedRowBatch(unique_ptr<RowBatch> row_batch) {
   materialized_row_batches_->AddBatch(move(row_batch));
 }
 
-Status HdfsScanNode::AddDiskIoRanges(const vector<DiskIoMgr::ScanRange*>& ranges,
+Status HdfsScanNode::AddDiskIoRanges(const vector<ScanRange*>& ranges,
     int num_files_queued) {
   RETURN_IF_ERROR(
       runtime_state_->io_mgr()->AddScanRanges(reader_context_.get(), ranges));
@@ -420,7 +421,7 @@ void HdfsScanNode::ScannerThread() {
     // to return if there's an error.
     ranges_issued_barrier_.Wait(SCANNER_THREAD_WAIT_TIME_MS, &unused);
 
-    DiskIoMgr::ScanRange* scan_range;
+    ScanRange* scan_range;
     // Take a snapshot of num_unqueued_files_ before calling GetNextRange().
     // We don't want num_unqueued_files_ to go to zero between the return from
     // GetNextRange() and the check for when all ranges are complete.
@@ -480,7 +481,7 @@ exit:
 }
 
 Status HdfsScanNode::ProcessSplit(const vector<FilterContext>& filter_ctxs,
-    MemPool* expr_results_pool, DiskIoMgr::ScanRange* scan_range) {
+    MemPool* expr_results_pool, ScanRange* scan_range) {
   DCHECK(scan_range != NULL);
 
   ScanRangeMetadata* metadata = static_cast<ScanRangeMetadata*>(scan_range->meta_data());

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/hdfs-scan-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node.h b/be/src/exec/hdfs-scan-node.h
index 30435c2..a1c97cf 100644
--- a/be/src/exec/hdfs-scan-node.h
+++ b/be/src/exec/hdfs-scan-node.h
@@ -29,7 +29,7 @@
 
 #include "exec/filter-context.h"
 #include "exec/hdfs-scan-node-base.h"
-#include "runtime/disk-io-mgr.h"
+#include "runtime/io/disk-io-mgr.h"
 #include "util/counting-barrier.h"
 #include "util/thread.h"
 
@@ -79,7 +79,7 @@ class HdfsScanNode : public HdfsScanNodeBase {
   bool done() const { return done_; }
 
   /// Adds ranges to the io mgr queue and starts up new scanner threads if possible.
-  virtual Status AddDiskIoRanges(const std::vector<DiskIoMgr::ScanRange*>& ranges,
+  virtual Status AddDiskIoRanges(const std::vector<io::ScanRange*>& ranges,
       int num_files_queued) WARN_UNUSED_RESULT;
 
   /// Adds a materialized row batch for the scan node.  This is called from scanner
@@ -166,7 +166,7 @@ class HdfsScanNode : public HdfsScanNodeBase {
   /// thread. 'filter_ctxs' is a clone of the class-wide filter_ctxs_, used to filter rows
   /// in this split.
   Status ProcessSplit(const std::vector<FilterContext>& filter_ctxs,
-      MemPool* expr_results_pool, DiskIoMgr::ScanRange* scan_range) WARN_UNUSED_RESULT;
+      MemPool* expr_results_pool, io::ScanRange* scan_range) WARN_UNUSED_RESULT;
 
   /// Returns true if there is enough memory (against the mem tracker limits) to
   /// have a scanner thread.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/hdfs-text-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc
index d633734..487c6fc 100644
--- a/be/src/exec/hdfs-text-scanner.cc
+++ b/be/src/exec/hdfs-text-scanner.cc
@@ -40,6 +40,7 @@
 using boost::algorithm::ends_with;
 using boost::algorithm::to_lower;
 using namespace impala;
+using namespace impala::io;
 using namespace strings;
 
 const char* HdfsTextScanner::LLVM_CLASS_NAME = "class.impala::HdfsTextScanner";
@@ -74,7 +75,7 @@ HdfsTextScanner::~HdfsTextScanner() {
 
 Status HdfsTextScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
     const vector<HdfsFileDesc*>& files) {
-  vector<DiskIoMgr::ScanRange*> compressed_text_scan_ranges;
+  vector<ScanRange*> compressed_text_scan_ranges;
   int compressed_text_files = 0;
   vector<HdfsFileDesc*> lzo_text_files;
   for (int i = 0; i < files.size(); ++i) {
@@ -95,7 +96,7 @@ Status HdfsTextScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
           // In order to decompress gzip-, snappy- and bzip2-compressed text files, we
           // need to read entire files. Only read a file if we're assigned the first split
           // to avoid reading multi-block files with multiple scanners.
-          DiskIoMgr::ScanRange* split = files[i]->splits[j];
+          ScanRange* split = files[i]->splits[j];
 
           // We only process the split that starts at offset 0.
           if (split->offset() != 0) {
@@ -114,10 +115,10 @@ Status HdfsTextScanner::IssueInitialRanges(HdfsScanNodeBase* scan_node,
           DCHECK_GT(files[i]->file_length, 0);
           ScanRangeMetadata* metadata =
               static_cast<ScanRangeMetadata*>(split->meta_data());
-          DiskIoMgr::ScanRange* file_range = scan_node->AllocateScanRange(files[i]->fs,
+          ScanRange* file_range = scan_node->AllocateScanRange(files[i]->fs,
               files[i]->filename.c_str(), files[i]->file_length, 0,
               metadata->partition_id, split->disk_id(), split->expected_local(),
-              DiskIoMgr::BufferOpts(split->try_cache(), files[i]->mtime));
+              BufferOpts(split->try_cache(), files[i]->mtime));
           compressed_text_scan_ranges.push_back(file_range);
           scan_node->max_compressed_text_file_length()->Set(files[i]->file_length);
         }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/kudu-scan-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scan-node.cc b/be/src/exec/kudu-scan-node.cc
index 77fac89..6d5e085 100644
--- a/be/src/exec/kudu-scan-node.cc
+++ b/be/src/exec/kudu-scan-node.cc
@@ -52,7 +52,7 @@ KuduScanNode::KuduScanNode(ObjectPool* pool, const TPlanNode& tnode,
     // This value is built the same way as it assumes that the scan node runs co-located
     // with a Kudu tablet server and that the tablet server is using disks similarly as
     // a datanode would.
-    max_row_batches = 10 * (DiskInfo::num_disks() + DiskIoMgr::REMOTE_NUM_DISKS);
+    max_row_batches = 10 * (DiskInfo::num_disks() + io::DiskIoMgr::REMOTE_NUM_DISKS);
   }
   materialized_row_batches_.reset(new RowBatchQueue(max_row_batches));
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/scanner-context.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/scanner-context.cc b/be/src/exec/scanner-context.cc
index 8cb195d..d9de769 100644
--- a/be/src/exec/scanner-context.cc
+++ b/be/src/exec/scanner-context.cc
@@ -21,6 +21,7 @@
 
 #include "exec/hdfs-scan-node-base.h"
 #include "exec/hdfs-scan-node.h"
+#include "runtime/io/disk-io-mgr.h"
 #include "runtime/exec-env.h"
 #include "runtime/mem-pool.h"
 #include "runtime/row-batch.h"
@@ -32,6 +33,7 @@
 #include "common/names.h"
 
 using namespace impala;
+using namespace impala::io;
 using namespace strings;
 
 static const int64_t INIT_READ_PAST_SIZE_BYTES = 64 * 1024;
@@ -43,7 +45,7 @@ static const int64_t INIT_READ_PAST_SIZE_BYTES = 64 * 1024;
 static const int64_t OUTPUT_BUFFER_BYTES_LEFT_INIT = 0;
 
 ScannerContext::ScannerContext(RuntimeState* state, HdfsScanNodeBase* scan_node,
-    HdfsPartitionDescriptor* partition_desc, DiskIoMgr::ScanRange* scan_range,
+    HdfsPartitionDescriptor* partition_desc, ScanRange* scan_range,
     const vector<FilterContext>& filter_ctxs, MemPool* expr_results_pool)
   : state_(state),
     scan_node_(scan_node),
@@ -75,7 +77,7 @@ ScannerContext::Stream::Stream(ScannerContext* parent)
     boundary_buffer_(new StringBuffer(boundary_pool_.get())) {
 }
 
-ScannerContext::Stream* ScannerContext::AddStream(DiskIoMgr::ScanRange* range) {
+ScannerContext::Stream* ScannerContext::AddStream(ScanRange* range) {
   std::unique_ptr<Stream> stream(new Stream(this));
   stream->scan_range_ = range;
   stream->file_desc_ = scan_node_->GetFileDesc(partition_desc_->id(), stream->filename());
@@ -105,7 +107,7 @@ void ScannerContext::Stream::ReleaseCompletedResources(bool done) {
     scan_range_->Cancel(Status::CANCELLED);
   }
 
-  for (unique_ptr<DiskIoMgr::BufferDescriptor>& buffer : completed_io_buffers_) {
+  for (unique_ptr<BufferDescriptor>& buffer : completed_io_buffers_) {
     ExecEnv::GetInstance()->disk_io_mgr()->ReturnBuffer(move(buffer));
   }
   parent_->num_completed_io_buffers_ -= completed_io_buffers_.size();
@@ -164,9 +166,9 @@ Status ScannerContext::Stream::GetNextBuffer(int64_t read_past_size) {
       return Status::OK();
     }
     int64_t partition_id = parent_->partition_descriptor()->id();
-    DiskIoMgr::ScanRange* range = parent_->scan_node_->AllocateScanRange(
+    ScanRange* range = parent_->scan_node_->AllocateScanRange(
         scan_range_->fs(), filename(), read_past_buffer_size, offset, partition_id,
-        scan_range_->disk_id(), false, DiskIoMgr::BufferOpts::Uncached());
+        scan_range_->disk_id(), false, BufferOpts::Uncached());
     RETURN_IF_ERROR(parent_->state_->io_mgr()->Read(
         parent_->scan_node_->reader_context(), range, &io_buffer_));
   }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/exec/scanner-context.h
----------------------------------------------------------------------
diff --git a/be/src/exec/scanner-context.h b/be/src/exec/scanner-context.h
index 216209f..3ad6753 100644
--- a/be/src/exec/scanner-context.h
+++ b/be/src/exec/scanner-context.h
@@ -27,7 +27,7 @@
 #include "common/compiler-util.h"
 #include "common/status.h"
 #include "exec/filter-context.h"
-#include "runtime/disk-io-mgr.h"
+#include "runtime/io/request-ranges.h"
 
 namespace impala {
 
@@ -65,7 +65,7 @@ class ScannerContext {
   /// get pushed to) and the scan range to process.
   /// This context starts with 1 stream.
   ScannerContext(RuntimeState*, HdfsScanNodeBase*, HdfsPartitionDescriptor*,
-      DiskIoMgr::ScanRange* scan_range, const std::vector<FilterContext>& filter_ctxs,
+      io::ScanRange* scan_range, const std::vector<FilterContext>& filter_ctxs,
       MemPool* expr_results_pool);
 
   /// Destructor verifies that all stream objects have been released.
@@ -125,7 +125,7 @@ class ScannerContext {
     bool eof() const { return file_offset() == file_len_; }
 
     const char* filename() { return scan_range_->file(); }
-    const DiskIoMgr::ScanRange* scan_range() { return scan_range_; }
+    const io::ScanRange* scan_range() { return scan_range_; }
     const HdfsFileDesc* file_desc() { return file_desc_; }
 
     /// Returns the buffer's current offset in the file.
@@ -176,7 +176,7 @@ class ScannerContext {
    private:
     friend class ScannerContext;
     ScannerContext* parent_;
-    DiskIoMgr::ScanRange* scan_range_;
+    io::ScanRange* scan_range_;
     const HdfsFileDesc* file_desc_;
 
     /// Total number of bytes returned from GetBytes()
@@ -195,7 +195,7 @@ class ScannerContext {
     int64_t next_read_past_size_bytes_;
 
     /// The current io buffer. This starts as NULL before we've read any bytes.
-    std::unique_ptr<DiskIoMgr::BufferDescriptor> io_buffer_;
+    std::unique_ptr<io::BufferDescriptor> io_buffer_;
 
     /// Next byte to read in io_buffer_
     uint8_t* io_buffer_pos_;
@@ -227,7 +227,7 @@ class ScannerContext {
     /// On the next GetBytes() call, these buffers are released (the caller by calling
     /// GetBytes() signals it is done with its previous bytes).  At this point the
     /// buffers are returned to the I/O manager.
-    std::deque<std::unique_ptr<DiskIoMgr::BufferDescriptor>> completed_io_buffers_;
+    std::deque<std::unique_ptr<io::BufferDescriptor>> completed_io_buffers_;
 
     Stream(ScannerContext* parent);
 
@@ -290,7 +290,7 @@ class ScannerContext {
 
   /// Add a stream to this ScannerContext for 'range'. Returns the added stream.
   /// The stream is created in the runtime state's object pool
-  Stream* AddStream(DiskIoMgr::ScanRange* range);
+  Stream* AddStream(io::ScanRange* range);
 
   /// Returns false if scan_node_ is multi-threaded and has been cancelled.
   /// Always returns false if the scan_node_ is not multi-threaded.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index 41805af..0d4b61c 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -16,6 +16,7 @@
 # under the License.
 
 add_subdirectory(bufferpool)
+add_subdirectory(io)
 
 # where to put generated libraries
 set(LIBRARY_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime")
@@ -36,10 +37,6 @@ add_library(Runtime
   data-stream-sender.cc
   debug-options.cc
   descriptors.cc
-  disk-io-mgr.cc
-  disk-io-mgr-reader-context.cc
-  disk-io-mgr-scan-range.cc
-  disk-io-mgr-stress.cc
   exec-env.cc
   fragment-instance-state.cc
   hbase-table.cc
@@ -78,16 +75,11 @@ add_library(Runtime
 )
 add_dependencies(Runtime gen-deps)
 
-# This test runs forever so should not be part of 'make test'
-add_executable(disk-io-mgr-stress-test disk-io-mgr-stress-test.cc)
-target_link_libraries(disk-io-mgr-stress-test ${IMPALA_TEST_LINK_LIBS})
-
 ADD_BE_TEST(mem-pool-test)
 ADD_BE_TEST(free-pool-test)
 ADD_BE_TEST(string-buffer-test)
 ADD_BE_TEST(data-stream-test)
 ADD_BE_TEST(timestamp-test)
-ADD_BE_TEST(disk-io-mgr-test)
 ADD_BE_TEST(raw-value-test)
 ADD_BE_TEST(string-compare-test)
 ADD_BE_TEST(string-search-test)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-handle-cache.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-handle-cache.h b/be/src/runtime/disk-io-mgr-handle-cache.h
deleted file mode 100644
index 4ba2342..0000000
--- a/be/src/runtime/disk-io-mgr-handle-cache.h
+++ /dev/null
@@ -1,196 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_DISK_IO_MGR_HANDLE_CACHE_H
-#define IMPALA_RUNTIME_DISK_IO_MGR_HANDLE_CACHE_H
-
-#include <array>
-#include <list>
-#include <map>
-#include <memory>
-
-#include <boost/thread/mutex.hpp>
-
-#include "common/hdfs.h"
-#include "common/status.h"
-#include "util/aligned-new.h"
-#include "util/impalad-metrics.h"
-#include "util/spinlock.h"
-#include "util/thread.h"
-
-namespace impala {
-
-/// This class is a small wrapper around the hdfsFile handle and the file system
-/// instance, which is needed to close the file handle. The handle incorporates
-/// the last modified time of the file when it was opened. This is used to distinguish
-/// between file handles for files that can be updated or overwritten.
-class HdfsFileHandle {
- public:
-
-  /// Constructor will open the file
-  HdfsFileHandle(const hdfsFS& fs, const char* fname, int64_t mtime);
-
-  /// Destructor will close the file handle
-  ~HdfsFileHandle();
-
-  hdfsFile file() const { return hdfs_file_;  }
-  int64_t mtime() const { return mtime_; }
-  bool ok() const { return hdfs_file_ != nullptr; }
-
- private:
-  hdfsFS fs_;
-  hdfsFile hdfs_file_;
-  int64_t mtime_;
-};
-
-/// The FileHandleCache is a data structure that owns HdfsFileHandles to share between
-/// threads. The HdfsFileHandles are hash partitioned across NUM_PARTITIONS partitions.
-/// Each partition operates independently with its own locks, reducing contention
-/// between concurrent threads. The `capacity` is split between the partitions and is
-/// enforced independently.
-///
-/// Threads check out a file handle for exclusive access and return it when finished.
-/// If the file handle is not already present in the cache or all file handles for this
-/// file are checked out, the file handle is constructed and added to the cache.
-/// The cache can contain multiple file handles for the same file. If a file handle
-/// is checked out, it cannot be evicted from the cache. In this case, a cache can
-/// exceed the specified capacity.
-///
-/// The file handle cache is currently not suitable for remote files that maintain a
-/// connection as part of the handle. Most remote systems have a limit on the number
-/// of concurrent connections, and file handles in the cache would be counted towards
-/// that limit.
-///
-/// If there is a file handle in the cache and the underlying file is deleted,
-/// the file handle might keep the file from being deleted at the OS level. This can
-/// take up disk space and impact correctness. To avoid this, the cache will evict any
-/// file handle that has been unused for longer than the threshold specified by
-/// `unused_handle_timeout_secs`. Eviction is disabled when the threshold is 0.
-///
-/// TODO: The cache should also evict file handles more aggressively if the file handle's
-/// mtime is older than the file's current mtime.
-template <size_t NUM_PARTITIONS>
-class FileHandleCache {
- public:
-  /// Instantiates the cache with `capacity` split evenly across NUM_PARTITIONS
-  /// partitions. If the capacity does not split evenly, then the capacity is rounded
-  /// up. The cache will age out any file handle that is unused for
-  /// `unused_handle_timeout_secs` seconds. Age out is disabled if this is set to zero.
-  FileHandleCache(size_t capacity, uint64_t unused_handle_timeout_secs);
-
-  /// Destructor is only called for backend tests
-  ~FileHandleCache();
-
-  /// Starts up a thread that monitors the age of file handles and evicts any that
-  /// exceed the limit.
-  Status Init() WARN_UNUSED_RESULT;
-
-  /// Get a file handle from the cache for the specified filename (fname) and
-  /// last modification time (mtime). This will hash the filename to determine
-  /// which partition to use for this file handle.
-  ///
-  /// If 'require_new_handle' is false and the partition contains an available handle,
-  /// the handle is returned and cache_hit is set to true. Otherwise, the partition will
-  /// try to construct a file handle and add it to the partition. On success, the new
-  /// file handle will be returned with cache_hit set to false. On failure, nullptr will
-  /// be returned. In either case, the partition may evict a file handle to make room
-  /// for the new file handle.
-  ///
-  /// This obtains exclusive control over the returned file handle. It must be paired
-  /// with a call to ReleaseFileHandle to release exclusive control.
-  HdfsFileHandle* GetFileHandle(const hdfsFS& fs, std::string* fname, int64_t mtime,
-      bool require_new_handle, bool* cache_hit);
-
-  /// Release the exclusive hold on the specified file handle (which was obtained
-  /// by calling GetFileHandle). The cache may evict a file handle if the cache is
-  /// above capacity. If 'destroy_handle' is true, immediately remove this handle
-  /// from the cache.
-  void ReleaseFileHandle(std::string* fname, HdfsFileHandle* fh, bool destroy_handle);
-
- private:
-  struct FileHandleEntry;
-  typedef std::multimap<std::string, FileHandleEntry> MapType;
-
-  struct LruListEntry {
-    LruListEntry(typename MapType::iterator map_entry_in);
-    typename MapType::iterator map_entry;
-    uint64_t timestamp_seconds;
-  };
-  typedef std::list<LruListEntry> LruListType;
-
-  struct FileHandleEntry {
-    FileHandleEntry(HdfsFileHandle* fh_in, LruListType& lru_list)
-    : fh(fh_in), lru_entry(lru_list.end()) {}
-    std::unique_ptr<HdfsFileHandle> fh;
-
-    /// in_use is true for a file handle checked out via GetFileHandle() that has not
-    /// been returned via ReleaseFileHandle().
-    bool in_use = false;
-
-    /// Iterator to this element's location in the LRU list. This only points to a
-    /// valid location when in_use is true. For error-checking, this is set to
-    /// lru_list.end() when in_use is false.
-    typename LruListType::iterator lru_entry;
-  };
-
-  /// Each partition operates independently, and thus has its own cache, LRU list,
-  /// and corresponding lock. To avoid contention on the lock_ due to false sharing
-  /// the partitions are aligned to cache line boundaries.
-  struct FileHandleCachePartition : public CacheLineAligned {
-    /// Protects access to cache and lru_list.
-    SpinLock lock;
-
-    /// Multimap from the file name to the file handles for that file. The cache
-    /// can contain multiple file handles for the same file and some may have
-    /// different mtimes if the file is being modified. All file handles are always
-    /// owned by the cache.
-    MapType cache;
-
-    /// The LRU list only contains file handles that are not in use.
-    LruListType lru_list;
-
-    /// Maximum number of file handles in cache without evicting unused file handles.
-    /// It is not a strict limit, and can be exceeded if all file handles are in use.
-    size_t capacity;
-
-    /// Current number of file handles in the cache
-    size_t size;
-  };
-
-  /// Periodic check to evict unused file handles. Only executed by eviction_thread_.
-  void EvictHandlesLoop();
-  static const int64_t EVICT_HANDLES_PERIOD_MS = 1000;
-
-  /// If the partition is above its capacity, evict the oldest unused file handles to
-  /// enforce the capacity.
-  void EvictHandles(FileHandleCachePartition& p);
-
-  std::array<FileHandleCachePartition, NUM_PARTITIONS> cache_partitions_;
-
-  /// Maximum time before an unused file handle is aged out of the cache.
-  /// Aging out is disabled if this is set to 0.
-  uint64_t unused_handle_timeout_secs_;
-
-  /// Thread to check for unused file handles to evict. This thread will exit when
-  /// the shut_down_promise_ is set.
-  std::unique_ptr<Thread> eviction_thread_;
-  Promise<bool> shut_down_promise_;
-};
-
-}
-
-#endif
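
The header above documents a strict checkout/return protocol for cached handles. A sketch of a typical caller, assuming a connected hdfsFS and the declarations above; the 16-partition count and the function name are illustrative, and error handling is elided:

#include <string>

// Sketch only: assumes the FileHandleCache declarations above.
void ReadViaHandle(const hdfsFS& fs, std::string fname, int64_t mtime,
    FileHandleCache<16>* cache) {
  bool cache_hit;
  // Check out an exclusive handle; on a miss (or if require_new_handle were
  // true) the cache opens a fresh one and may evict an unused handle.
  HdfsFileHandle* fh =
      cache->GetFileHandle(fs, &fname, mtime, /*require_new_handle=*/false, &cache_hit);
  if (fh == nullptr) return;  // open failed
  // ... read via hdfsPread(fs, fh->file(), ...) ...
  // Every GetFileHandle() must be paired with a ReleaseFileHandle();
  // destroy_handle=true would drop the handle instead of recycling it.
  cache->ReleaseFileHandle(&fname, fh, /*destroy_handle=*/false);
}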

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-handle-cache.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-handle-cache.inline.h b/be/src/runtime/disk-io-mgr-handle-cache.inline.h
deleted file mode 100644
index 3068971..0000000
--- a/be/src/runtime/disk-io-mgr-handle-cache.inline.h
+++ /dev/null
@@ -1,231 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <tuple>
-
-#include "runtime/disk-io-mgr-handle-cache.h"
-#include "util/hash-util.h"
-#include "util/time.h"
-
-#ifndef IMPALA_RUNTIME_DISK_IO_MGR_HANDLE_CACHE_INLINE_H
-#define IMPALA_RUNTIME_DISK_IO_MGR_HANDLE_CACHE_INLINE_H
-
-namespace impala {
-
-HdfsFileHandle::HdfsFileHandle(const hdfsFS& fs, const char* fname,
-    int64_t mtime)
-    : fs_(fs), hdfs_file_(hdfsOpenFile(fs, fname, O_RDONLY, 0, 0, 0)), mtime_(mtime) {
-  ImpaladMetrics::IO_MGR_NUM_CACHED_FILE_HANDLES->Increment(1L);
-  VLOG_FILE << "hdfsOpenFile() file=" << fname << " fid=" << hdfs_file_;
-}
-
-HdfsFileHandle::~HdfsFileHandle() {
-  if (hdfs_file_ != nullptr && fs_ != nullptr) {
-    ImpaladMetrics::IO_MGR_NUM_CACHED_FILE_HANDLES->Increment(-1L);
-    VLOG_FILE << "hdfsCloseFile() fid=" << hdfs_file_;
-    hdfsCloseFile(fs_, hdfs_file_);
-  }
-  fs_ = nullptr;
-  hdfs_file_ = nullptr;
-}
-
-template <size_t NUM_PARTITIONS>
-FileHandleCache<NUM_PARTITIONS>::FileHandleCache(size_t capacity,
-    uint64_t unused_handle_timeout_secs)
-  : unused_handle_timeout_secs_(unused_handle_timeout_secs) {
-  DCHECK_GT(NUM_PARTITIONS, 0);
-  size_t remainder = capacity % NUM_PARTITIONS;
-  size_t base_capacity = capacity / NUM_PARTITIONS;
-  size_t partition_capacity = (remainder > 0 ? base_capacity + 1 : base_capacity);
-  for (FileHandleCachePartition& p : cache_partitions_) {
-    p.size = 0;
-    p.capacity = partition_capacity;
-  }
-}
-
-template <size_t NUM_PARTITIONS>
-FileHandleCache<NUM_PARTITIONS>::LruListEntry::LruListEntry(
-    typename MapType::iterator map_entry_in)
-  : map_entry(map_entry_in), timestamp_seconds(MonotonicSeconds()) {}
-
-template <size_t NUM_PARTITIONS>
-FileHandleCache<NUM_PARTITIONS>::~FileHandleCache() {
-  shut_down_promise_.Set(true);
-  if (eviction_thread_ != nullptr) eviction_thread_->Join();
-}
-
-template <size_t NUM_PARTITIONS>
-Status FileHandleCache<NUM_PARTITIONS>::Init() {
-  return Thread::Create("disk-io-mgr-handle-cache", "File Handle Timeout",
-      &FileHandleCache<NUM_PARTITIONS>::EvictHandlesLoop, this, &eviction_thread_);
-}
-
-template <size_t NUM_PARTITIONS>
-HdfsFileHandle* FileHandleCache<NUM_PARTITIONS>::GetFileHandle(
-    const hdfsFS& fs, std::string* fname, int64_t mtime, bool require_new_handle,
-    bool* cache_hit) {
-  // Hash the key and get appropriate partition
-  int index = HashUtil::Hash(fname->data(), fname->size(), 0) % NUM_PARTITIONS;
-  FileHandleCachePartition& p = cache_partitions_[index];
-  boost::lock_guard<SpinLock> g(p.lock);
-  pair<typename MapType::iterator, typename MapType::iterator> range =
-    p.cache.equal_range(*fname);
-
-  // If this requires a new handle, skip to the creation codepath. Otherwise,
-  // find an unused entry with the same mtime
-  FileHandleEntry* ret_elem = nullptr;
-  if (!require_new_handle) {
-    while (range.first != range.second) {
-      FileHandleEntry* elem = &range.first->second;
-      if (!elem->in_use && elem->fh->mtime() == mtime) {
-        // This element is currently in the lru_list, which means that lru_entry must
-        // be an iterator pointing into the lru_list.
-        DCHECK(elem->lru_entry != p.lru_list.end());
-        // Remove the element from the lru_list and designate that it is not on
-        // the lru_list by resetting its iterator to point to the end of the list.
-        p.lru_list.erase(elem->lru_entry);
-        elem->lru_entry = p.lru_list.end();
-        ret_elem = elem;
-        *cache_hit = true;
-        break;
-      }
-      ++range.first;
-    }
-  }
-
-  // Either there was no free entry, or the caller asked for a new handle
-  if (!ret_elem) {
-    *cache_hit = false;
-    // Create a new entry and move it into the map
-    HdfsFileHandle* new_fh = new HdfsFileHandle(fs, fname->data(), mtime);
-    if (!new_fh->ok()) {
-      delete new_fh;
-      return nullptr;
-    }
-    FileHandleEntry entry(new_fh, p.lru_list);
-    typename MapType::iterator new_it = p.cache.emplace_hint(range.second,
-        *fname, std::move(entry));
-    ret_elem = &new_it->second;
-    ++p.size;
-    if (p.size > p.capacity) EvictHandles(p);
-  }
-
-  DCHECK(ret_elem->fh.get() != nullptr);
-  DCHECK(!ret_elem->in_use);
-  ret_elem->in_use = true;
-  ImpaladMetrics::IO_MGR_NUM_FILE_HANDLES_OUTSTANDING->Increment(1L);
-  return ret_elem->fh.get();
-}
-
-template <size_t NUM_PARTITIONS>
-void FileHandleCache<NUM_PARTITIONS>::ReleaseFileHandle(std::string* fname,
-    HdfsFileHandle* fh, bool destroy_handle) {
-  DCHECK(fh != nullptr);
-  // Hash the key and get appropriate partition
-  int index = HashUtil::Hash(fname->data(), fname->size(), 0) % NUM_PARTITIONS;
-  FileHandleCachePartition& p = cache_partitions_[index];
-  boost::lock_guard<SpinLock> g(p.lock);
-  pair<typename MapType::iterator, typename MapType::iterator> range =
-    p.cache.equal_range(*fname);
-
-  // TODO: This can be optimized by maintaining some state in the file handle about
-  // its location in the map.
-  typename MapType::iterator release_it = range.first;
-  while (release_it != range.second) {
-    FileHandleEntry* elem = &release_it->second;
-    if (elem->fh.get() == fh) break;
-    ++release_it;
-  }
-  DCHECK(release_it != range.second);
-
-  // This file handle is no longer referenced
-  FileHandleEntry* release_elem = &release_it->second;
-  DCHECK(release_elem->in_use);
-  release_elem->in_use = false;
-  ImpaladMetrics::IO_MGR_NUM_FILE_HANDLES_OUTSTANDING->Increment(-1L);
-  if (destroy_handle) {
-    --p.size;
-    p.cache.erase(release_it);
-    return;
-  }
-  // Hdfs can use some memory for readahead buffering. Calling unbuffer reduces
-  // this buffering so that the file handle takes up less memory when in the cache.
-  // If unbuffering is not supported, then hdfsUnbufferFile() will return a non-zero
-  // return code, and we close the file handle and remove it from the cache.
-  if (hdfsUnbufferFile(release_elem->fh->file()) == 0) {
-    // This FileHandleEntry must not be in the lru list already, because it was
-    // in use. Verify this by checking that the lru_entry is pointing to the end,
-    // which cannot be true for any element in the lru list.
-    DCHECK(release_elem->lru_entry == p.lru_list.end());
-    // Add this to the lru list, establishing links in both directions.
-    // The FileHandleEntry has an iterator to the LruListEntry and the
-    // LruListEntry has an iterator to the location of the FileHandleEntry in
-    // the cache.
-    release_elem->lru_entry = p.lru_list.emplace(p.lru_list.end(), release_it);
-    if (p.size > p.capacity) EvictHandles(p);
-  } else {
-    VLOG_FILE << "FS does not support file handle unbuffering, closing file="
-              << *fname;
-    --p.size;
-    p.cache.erase(release_it);
-  }
-}
-
-template <size_t NUM_PARTITIONS>
-void FileHandleCache<NUM_PARTITIONS>::EvictHandlesLoop() {
-  while (true) {
-    for (FileHandleCachePartition& p : cache_partitions_) {
-      boost::lock_guard<SpinLock> g(p.lock);
-      EvictHandles(p);
-    }
-    // This Get() keeps timing out until shutdown, when the promise is set.
-    bool timed_out;
-    shut_down_promise_.Get(EVICT_HANDLES_PERIOD_MS, &timed_out);
-    if (!timed_out) break;
-  }
-  // The promise must be set to true.
-  DCHECK(shut_down_promise_.IsSet());
-  DCHECK(shut_down_promise_.Get());
-}
-
-template <size_t NUM_PARTITIONS>
-void FileHandleCache<NUM_PARTITIONS>::EvictHandles(
-    FileHandleCache<NUM_PARTITIONS>::FileHandleCachePartition& p) {
-  uint64_t now = MonotonicSeconds();
-  uint64_t oldest_allowed_timestamp =
-      now > unused_handle_timeout_secs_ ? now - unused_handle_timeout_secs_ : 0;
-  while (p.lru_list.size() > 0) {
-    // Peek at the oldest element
-    LruListEntry oldest_entry = p.lru_list.front();
-    typename MapType::iterator oldest_entry_map_it = oldest_entry.map_entry;
-    uint64_t oldest_entry_timestamp = oldest_entry.timestamp_seconds;
-    // If the oldest element does not need to be aged out and the cache is not over
-    // capacity, then we are done and there is nothing to evict.
-    if (p.size <= p.capacity && (unused_handle_timeout_secs_ == 0 ||
-        oldest_entry_timestamp >= oldest_allowed_timestamp)) {
-      return;
-    }
-    // Evict the oldest element
-    DCHECK(!oldest_entry_map_it->second.in_use);
-    p.cache.erase(oldest_entry_map_it);
-    p.lru_list.pop_front();
-    --p.size;
-  }
-}
-
-}
-#endif
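
Two details of the implementation above are easy to miss: the per-partition capacity is the ceiling of capacity / NUM_PARTITIONS, and the partition is chosen by hashing the file name. A self-contained illustration of both calculations, with std::hash standing in for HashUtil::Hash:

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>

int main() {
  constexpr size_t kNumPartitions = 16;
  const size_t capacity = 100;
  // Same rounding as the constructor above: 100 / 16 = 6 remainder 4, so
  // every partition gets capacity 7, for an effective total of 112.
  size_t per_partition =
      capacity / kNumPartitions + (capacity % kNumPartitions > 0 ? 1 : 0);
  // Partition selection: hash the file name, then mod by the partition count.
  const std::string fname = "/warehouse/tpch.db/lineitem/000000_0";
  size_t partition = std::hash<std::string>{}(fname) % kNumPartitions;
  std::cout << "per-partition capacity=" << per_partition
            << ", partition=" << partition << std::endl;
  return 0;
}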

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-internal.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-internal.h b/be/src/runtime/disk-io-mgr-internal.h
deleted file mode 100644
index cc50af7..0000000
--- a/be/src/runtime/disk-io-mgr-internal.h
+++ /dev/null
@@ -1,76 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_DISK_IO_MGR_INTERNAL_H
-#define IMPALA_RUNTIME_DISK_IO_MGR_INTERNAL_H
-
-#include <unistd.h>
-#include <queue>
-#include <boost/thread/locks.hpp>
-#include <gutil/strings/substitute.h>
-
-#include "common/logging.h"
-#include "runtime/disk-io-mgr-reader-context.h"
-#include "runtime/disk-io-mgr.h"
-#include "runtime/mem-tracker.h"
-#include "runtime/thread-resource-mgr.h"
-#include "util/condition-variable.h"
-#include "util/cpu-info.h"
-#include "util/debug-util.h"
-#include "util/disk-info.h"
-#include "util/filesystem-util.h"
-#include "util/hdfs-util.h"
-#include "util/impalad-metrics.h"
-
-/// This file contains internal structures shared between submodules of the IoMgr. Users
-/// of the IoMgr do not need to include this file.
-namespace impala {
-
-/// Per disk state
-struct DiskIoMgr::DiskQueue {
-  /// Disk id (0-based)
-  int disk_id;
-
-  /// Lock that protects access to 'request_contexts' and 'work_available'
-  boost::mutex lock;
-
-  /// Condition variable to signal the disk threads that there is work to do or the
-  /// thread should shut down.  A disk thread will be woken up when there is a reader
-  /// added to the queue. A reader is only on the queue when it has at least one
-  /// scan range that is not blocked on available buffers.
-  ConditionVariable work_available;
-
-  /// list of all request contexts that have work queued on this disk
-  std::list<DiskIoRequestContext*> request_contexts;
-
-  /// Enqueue the request context to the disk queue.  The DiskQueue lock must not be taken.
-  inline void EnqueueContext(DiskIoRequestContext* worker) {
-    {
-      boost::unique_lock<boost::mutex> disk_lock(lock);
-      // Check that the reader is not already on the queue
-      DCHECK(find(request_contexts.begin(), request_contexts.end(), worker) ==
-          request_contexts.end());
-      request_contexts.push_back(worker);
-    }
-    work_available.NotifyAll();
-  }
-
-  DiskQueue(int id) : disk_id(id) {}
-};
-}
-
-#endif
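
DiskQueue above shows only the producer side: EnqueueContext() pushes under the lock and notifies work_available after releasing it. A sketch of the matching consumer loop a disk thread might run; std::mutex and std::condition_variable stand in for boost::mutex and Impala's ConditionVariable, and the shut_down flag is a hypothetical addition:

#include <condition_variable>
#include <list>
#include <mutex>

struct RequestContext {};  // stand-in for DiskIoRequestContext

struct DiskQueue {
  std::mutex lock;
  std::condition_variable work_available;
  std::list<RequestContext*> request_contexts;
  bool shut_down = false;  // hypothetical; would be set at teardown

  // Producer, mirroring EnqueueContext(): push under the lock, notify after
  // releasing it so woken threads can acquire the lock immediately.
  void Enqueue(RequestContext* context) {
    {
      std::lock_guard<std::mutex> l(lock);
      request_contexts.push_back(context);
    }
    work_available.notify_all();
  }

  // Consumer: a disk thread blocks until a context is queued or shutdown.
  RequestContext* GetNext() {
    std::unique_lock<std::mutex> l(lock);
    work_available.wait(l, [this] { return shut_down || !request_contexts.empty(); });
    if (request_contexts.empty()) return nullptr;  // shutting down
    RequestContext* context = request_contexts.front();
    request_contexts.pop_front();
    return context;
  }
};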

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-reader-context.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-reader-context.cc b/be/src/runtime/disk-io-mgr-reader-context.cc
deleted file mode 100644
index d62545b..0000000
--- a/be/src/runtime/disk-io-mgr-reader-context.cc
+++ /dev/null
@@ -1,292 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/disk-io-mgr-internal.h"
-
-#include "common/names.h"
-
-using namespace impala;
-
-void DiskIoRequestContext::Cancel(const Status& status) {
-  DCHECK(!status.ok());
-
-  // Callbacks are collected in this vector and invoked while no lock is held.
-  vector<WriteRange::WriteDoneCallback> write_callbacks;
-  {
-    lock_guard<mutex> lock(lock_);
-    DCHECK(Validate()) << endl << DebugString();
-
-    // Already being cancelled
-    if (state_ == DiskIoRequestContext::Cancelled) return;
-
-    DCHECK(status_.ok());
-    status_ = status;
-
-    // The reader will be put into a cancelled state until all cleanup is complete.
-    state_ = DiskIoRequestContext::Cancelled;
-
-    // Cancel all scan ranges for this reader. Each range could be on one of
-    // four queues.
-    for (int i = 0; i < disk_states_.size(); ++i) {
-      DiskIoRequestContext::PerDiskState& state = disk_states_[i];
-      RequestRange* range = NULL;
-      while ((range = state.in_flight_ranges()->Dequeue()) != NULL) {
-        if (range->request_type() == RequestType::READ) {
-          static_cast<ScanRange*>(range)->Cancel(status);
-        } else {
-          DCHECK(range->request_type() == RequestType::WRITE);
-          write_callbacks.push_back(static_cast<WriteRange*>(range)->callback_);
-        }
-      }
-
-      ScanRange* scan_range;
-      while ((scan_range = state.unstarted_scan_ranges()->Dequeue()) != NULL) {
-        scan_range->Cancel(status);
-      }
-      WriteRange* write_range;
-      while ((write_range = state.unstarted_write_ranges()->Dequeue()) != NULL) {
-        write_callbacks.push_back(write_range->callback_);
-      }
-    }
-
-    ScanRange* range = NULL;
-    while ((range = ready_to_start_ranges_.Dequeue()) != NULL) {
-      range->Cancel(status);
-    }
-    while ((range = blocked_ranges_.Dequeue()) != NULL) {
-      range->Cancel(status);
-    }
-    while ((range = cached_ranges_.Dequeue()) != NULL) {
-      range->Cancel(status);
-    }
-
-    // Schedule reader on all disks. The disks will notice it is cancelled and do any
-    // required cleanup
-    for (int i = 0; i < disk_states_.size(); ++i) {
-      DiskIoRequestContext::PerDiskState& state = disk_states_[i];
-      state.ScheduleContext(this, i);
-    }
-  }
-
-  for (const WriteRange::WriteDoneCallback& write_callback: write_callbacks) {
-    write_callback(status_);
-  }
-
-  // Signal reader and unblock the GetNext/Read thread.  That read will fail with
-  // a cancelled status.
-  ready_to_start_ranges_cv_.NotifyAll();
-}
-
-void DiskIoRequestContext::CancelAndMarkInactive() {
-  Cancel(Status::CANCELLED);
-
-  boost::unique_lock<boost::mutex> l(lock_);
-  DCHECK_NE(state_, Inactive);
-  DCHECK(Validate()) << endl << DebugString();
-
-  // Wait until the ranges finish up.
-  while (num_disks_with_ranges_ > 0) disks_complete_cond_var_.Wait(l);
-
-  // Validate that no buffers were leaked from this context.
-  DCHECK_EQ(num_buffers_in_reader_.Load(), 0) << endl << DebugString();
-  DCHECK_EQ(num_used_buffers_.Load(), 0) << endl << DebugString();
-  DCHECK(Validate()) << endl << DebugString();
-  state_ = Inactive;
-}
-
-void DiskIoRequestContext::AddRequestRange(
-    DiskIoMgr::RequestRange* range, bool schedule_immediately) {
-  // DCHECK(lock_.is_locked()); // TODO: boost should have this API
-  DiskIoRequestContext::PerDiskState& state = disk_states_[range->disk_id()];
-  if (state.done()) {
-    DCHECK_EQ(state.num_remaining_ranges(), 0);
-    state.set_done(false);
-    ++num_disks_with_ranges_;
-  }
-
-  bool schedule_context;
-  if (range->request_type() == RequestType::READ) {
-    DiskIoMgr::ScanRange* scan_range = static_cast<DiskIoMgr::ScanRange*>(range);
-    if (schedule_immediately) {
-      ScheduleScanRange(scan_range);
-    } else {
-      state.unstarted_scan_ranges()->Enqueue(scan_range);
-      num_unstarted_scan_ranges_.Add(1);
-    }
-    // If next_scan_range_to_start is NULL, schedule this DiskIoRequestContext so that it will
-    // be set. If it's not NULL, this context will be scheduled when GetNextRange() is
-    // invoked.
-    schedule_context = state.next_scan_range_to_start() == NULL;
-  } else {
-    DCHECK(range->request_type() == RequestType::WRITE);
-    DCHECK(!schedule_immediately);
-    DiskIoMgr::WriteRange* write_range = static_cast<DiskIoMgr::WriteRange*>(range);
-    state.unstarted_write_ranges()->Enqueue(write_range);
-
-    // ScheduleContext() has no effect if the context is already scheduled,
-    // so this is safe.
-    schedule_context = true;
-  }
-
-  if (schedule_context) state.ScheduleContext(this, range->disk_id());
-  ++state.num_remaining_ranges();
-}
-
-DiskIoRequestContext::DiskIoRequestContext(
-    DiskIoMgr* parent, int num_disks, MemTracker* tracker)
-  : parent_(parent), mem_tracker_(tracker), disk_states_(num_disks) {}
-
-// Dumps out request context information. Lock should be taken by caller
-string DiskIoRequestContext::DebugString() const {
-  stringstream ss;
-  ss << endl << "  DiskIoRequestContext: " << (void*)this << " (state=";
-  if (state_ == DiskIoRequestContext::Inactive) ss << "Inactive";
-  if (state_ == DiskIoRequestContext::Cancelled) ss << "Cancelled";
-  if (state_ == DiskIoRequestContext::Active) ss << "Active";
-  if (state_ != DiskIoRequestContext::Inactive) {
-    ss << " status_=" << (status_.ok() ? "OK" : status_.GetDetail())
-       << " #ready_buffers=" << num_ready_buffers_.Load()
-       << " #used_buffers=" << num_used_buffers_.Load()
-       << " #num_buffers_in_reader=" << num_buffers_in_reader_.Load()
-       << " #finished_scan_ranges=" << num_finished_ranges_.Load()
-       << " #disk_with_ranges=" << num_disks_with_ranges_
-       << " #disks=" << num_disks_with_ranges_;
-    for (int i = 0; i < disk_states_.size(); ++i) {
-      ss << endl << "   " << i << ": "
-         << "is_on_queue=" << disk_states_[i].is_on_queue()
-         << " done=" << disk_states_[i].done()
-         << " #num_remaining_scan_ranges=" << disk_states_[i].num_remaining_ranges()
-         << " #in_flight_ranges=" << disk_states_[i].in_flight_ranges()->size()
-         << " #unstarted_scan_ranges=" << disk_states_[i].unstarted_scan_ranges()->size()
-         << " #unstarted_write_ranges="
-         << disk_states_[i].unstarted_write_ranges()->size()
-         << " #reading_threads=" << disk_states_[i].num_threads_in_op();
-    }
-  }
-  ss << ")";
-  return ss.str();
-}
-
-bool DiskIoRequestContext::Validate() const {
-  if (state_ == DiskIoRequestContext::Inactive) {
-    LOG(WARNING) << "state_ == DiskIoRequestContext::Inactive";
-    return false;
-  }
-
-  if (num_used_buffers_.Load() < 0) {
-    LOG(WARNING) << "num_used_buffers_ < 0: #used=" << num_used_buffers_.Load();
-    return false;
-  }
-
-  if (num_ready_buffers_.Load() < 0) {
-    LOG(WARNING) << "num_ready_buffers_ < 0: #used=" << num_ready_buffers_.Load();
-    return false;
-  }
-
-  int total_unstarted_ranges = 0;
-  for (int i = 0; i < disk_states_.size(); ++i) {
-    const PerDiskState& state = disk_states_[i];
-    bool on_queue = state.is_on_queue();
-    int num_reading_threads = state.num_threads_in_op();
-
-    total_unstarted_ranges += state.unstarted_scan_ranges()->size();
-
-    if (num_reading_threads < 0) {
-      LOG(WARNING) << "disk_id=" << i
-                   << " state.num_threads_in_op < 0: #threads="
-                   << num_reading_threads;
-      return false;
-    }
-
-    if (state_ != DiskIoRequestContext::Cancelled) {
-      if (state.unstarted_scan_ranges()->size() + state.in_flight_ranges()->size() >
-          state.num_remaining_ranges()) {
-        LOG(WARNING) << "disk_id=" << i
-                     << " state.unstarted_ranges.size() + state.in_flight_ranges.size()"
-                     << " > state.num_remaining_ranges:"
-                     << " #unscheduled=" << state.unstarted_scan_ranges()->size()
-                     << " #in_flight=" << state.in_flight_ranges()->size()
-                     << " #remaining=" << state.num_remaining_ranges();
-        return false;
-      }
-
-      // If we have an in_flight range, the reader must be on the queue or have a
-      // thread actively reading for it.
-      if (!state.in_flight_ranges()->empty() && !on_queue && num_reading_threads == 0) {
-        LOG(WARNING) << "disk_id=" << i
-                     << " reader has inflight ranges but is not on the disk queue."
-                     << " #in_flight_ranges=" << state.in_flight_ranges()->size()
-                     << " #reading_threads=" << num_reading_threads
-                     << " on_queue=" << on_queue;
-        return false;
-      }
-
-      if (state.done() && num_reading_threads > 0) {
-        LOG(WARNING) << "disk_id=" << i
-                     << " state set to done but there are still threads working."
-                     << " #reading_threads=" << num_reading_threads;
-        return false;
-      }
-    } else {
-      // Is Cancelled
-      if (!state.in_flight_ranges()->empty()) {
-        LOG(WARNING) << "disk_id=" << i
-                     << " Reader cancelled but has in flight ranges.";
-        return false;
-      }
-      if (!state.unstarted_scan_ranges()->empty()) {
-        LOG(WARNING) << "disk_id=" << i
-                     << " Reader cancelled but has unstarted ranges.";
-        return false;
-      }
-    }
-
-    if (state.done() && on_queue) {
-      LOG(WARNING) << "disk_id=" << i
-                   << " state set to done but the reader is still on the disk queue."
-                   << " state.done=true and state.is_on_queue=true";
-      return false;
-    }
-  }
-
-  if (state_ != DiskIoRequestContext::Cancelled) {
-    if (total_unstarted_ranges != num_unstarted_scan_ranges_.Load()) {
-      LOG(WARNING) << "total_unstarted_ranges=" << total_unstarted_ranges
-                   << " sum_in_states=" << num_unstarted_scan_ranges_.Load();
-      return false;
-    }
-  } else {
-    if (!ready_to_start_ranges_.empty()) {
-      LOG(WARNING) << "Reader cancelled but has ready to start ranges.";
-      return false;
-    }
-    if (!blocked_ranges_.empty()) {
-      LOG(WARNING) << "Reader cancelled but has blocked ranges.";
-      return false;
-    }
-  }
-
-  return true;
-}
-
-void DiskIoRequestContext::PerDiskState::ScheduleContext(
-    DiskIoRequestContext* context, int disk_id) {
-  if (!is_on_queue_ && !done_) {
-    is_on_queue_ = true;
-    context->parent_->disk_queues_[disk_id]->EnqueueContext(context);
-  }
-}
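
Cancel() above uses a pattern worth noting: write callbacks are collected into a local vector while lock_ is held and invoked only after the lock is released, so a callback that re-enters the context cannot deadlock. A distilled sketch of just that pattern; the names are illustrative, not the Impala API:

#include <functional>
#include <mutex>
#include <utility>
#include <vector>

class Cancellable {
 public:
  void AddCallback(std::function<void()> cb) {
    std::lock_guard<std::mutex> l(lock_);
    callbacks_.push_back(std::move(cb));
  }

  void Cancel() {
    std::vector<std::function<void()>> to_run;
    {
      std::lock_guard<std::mutex> l(lock_);
      if (cancelled_) return;  // already being cancelled
      cancelled_ = true;
      to_run.swap(callbacks_);  // collect while holding the lock
    }
    // Invoke with no lock held: a callback may safely call back into *this.
    for (auto& cb : to_run) cb();
  }

 private:
  std::mutex lock_;
  bool cancelled_ = false;
  std::vector<std::function<void()>> callbacks_;
};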


[04/16] incubator-impala git commit: IMPALA-4252: Min-max runtime filters for Kudu

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
index fe3fd41..ec9f8d8 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
@@ -725,7 +725,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
@@ -739,7 +739,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=718.94MB
    column stats: all
@@ -763,7 +763,7 @@ Per-Host Resources: mem-estimate=388.41MB mem-reservation=34.00MB
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
@@ -783,7 +783,7 @@ Per-Host Resources: mem-estimate=388.41MB mem-reservation=34.00MB
 |
 00:SCAN HDFS [tpch.lineitem, RANDOM]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=718.94MB
    column stats: all
@@ -808,7 +808,7 @@ Per-Host Resources: mem-estimate=776.83MB mem-reservation=68.00MB
 |  hash-table-id=00
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
@@ -835,7 +835,7 @@ Per-Host Resources: mem-estimate=776.83MB mem-reservation=68.00MB
 |
 00:SCAN HDFS [tpch.lineitem, RANDOM]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=718.94MB
    column stats: all
@@ -857,7 +857,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
@@ -871,7 +871,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=718.94MB
    column stats: all
@@ -895,7 +895,7 @@ Per-Host Resources: mem-estimate=100.14MB mem-reservation=34.00MB
 02:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=100.14MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
@@ -921,7 +921,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
 Per-Host Resources: mem-estimate=88.00MB mem-reservation=0B
 00:SCAN HDFS [tpch.lineitem, RANDOM]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=718.94MB
    column stats: all
@@ -946,7 +946,7 @@ Per-Host Resources: mem-estimate=100.14MB mem-reservation=68.00MB
 |  hash-table-id=00
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=50.07MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
@@ -979,7 +979,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
 Per-Host Resources: mem-estimate=176.00MB mem-reservation=0B
 00:SCAN HDFS [tpch.lineitem, RANDOM]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=718.94MB
    column stats: all
@@ -1465,7 +1465,7 @@ PLAN-ROOT SINK
 |--10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  |  runtime filters: RF002 <- o_orderkey
+|  |  runtime filters: RF004[bloom] <- o_orderkey
 |  |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=5,6 row-size=99B cardinality=822530
 |  |
@@ -1480,7 +1480,7 @@ PLAN-ROOT SINK
 |  08:SCAN HDFS [tpch_parquet.lineitem]
 |     partitions=1/1 files=3 size=193.92MB
 |     predicates: l_shipmode = 'F'
-|     runtime filters: RF002 -> l_orderkey
+|     runtime filters: RF004[bloom] -> l_orderkey
 |     stats-rows=6001215 extrapolated-rows=disabled
 |     table stats: rows=6001215 size=193.92MB
 |     column stats: all
@@ -1492,7 +1492,7 @@ PLAN-ROOT SINK
 |--07:HASH JOIN [INNER JOIN]
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  |  runtime filters: RF001 <- o_orderkey
+|  |  runtime filters: RF002[bloom] <- o_orderkey
 |  |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=3,4 row-size=103B cardinality=1151542
 |  |
@@ -1509,7 +1509,7 @@ PLAN-ROOT SINK
 |  |
 |  05:SCAN HDFS [tpch_parquet.lineitem]
 |     partitions=1/1 files=3 size=193.92MB
-|     runtime filters: RF001 -> l_orderkey
+|     runtime filters: RF002[bloom] -> l_orderkey
 |     stats-rows=6001215 extrapolated-rows=disabled
 |     table stats: rows=6001215 size=193.92MB
 |     column stats: all
@@ -1524,7 +1524,7 @@ PLAN-ROOT SINK
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=0,1 row-size=86B cardinality=575772
 |
@@ -1539,7 +1539,7 @@ PLAN-ROOT SINK
 01:SCAN HDFS [tpch_parquet.lineitem]
    partitions=1/1 files=3 size=193.92MB
    predicates: l_tax > 10
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=193.92MB
    column stats: all
@@ -1570,7 +1570,7 @@ Per-Host Resources: mem-estimate=97.00MB mem-reservation=34.00MB
 |--10:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  |  runtime filters: RF002 <- o_orderkey
+|  |  runtime filters: RF004[bloom] <- o_orderkey
 |  |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=5,6 row-size=99B cardinality=822530
 |  |
@@ -1591,7 +1591,7 @@ Per-Host Resources: mem-estimate=97.00MB mem-reservation=34.00MB
 |  08:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
 |     partitions=1/1 files=3 size=193.92MB
 |     predicates: l_shipmode = 'F'
-|     runtime filters: RF002 -> l_orderkey
+|     runtime filters: RF004[bloom] -> l_orderkey
 |     stats-rows=6001215 extrapolated-rows=disabled
 |     table stats: rows=6001215 size=193.92MB
 |     column stats: all
@@ -1603,7 +1603,7 @@ Per-Host Resources: mem-estimate=97.00MB mem-reservation=34.00MB
 |--07:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  |  runtime filters: RF001 <- o_orderkey
+|  |  runtime filters: RF002[bloom] <- o_orderkey
 |  |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=3,4 row-size=103B cardinality=1151542
 |  |
@@ -1626,7 +1626,7 @@ Per-Host Resources: mem-estimate=97.00MB mem-reservation=34.00MB
 |  |
 |  05:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
 |     partitions=1/1 files=3 size=193.92MB
-|     runtime filters: RF001 -> l_orderkey
+|     runtime filters: RF002[bloom] -> l_orderkey
 |     stats-rows=6001215 extrapolated-rows=disabled
 |     table stats: rows=6001215 size=193.92MB
 |     column stats: all
@@ -1652,7 +1652,7 @@ Per-Host Resources: mem-estimate=47.33MB mem-reservation=38.75MB
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=4.75MB mem-reservation=4.75MB spill-buffer=256.00KB
 |  tuple-ids=0,1 row-size=86B cardinality=575772
 |
@@ -1679,7 +1679,7 @@ Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
 01:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
    partitions=1/1 files=3 size=193.92MB
    predicates: l_tax > 10
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=193.92MB
    column stats: all
@@ -1711,7 +1711,7 @@ Per-Host Resources: mem-estimate=194.00MB mem-reservation=68.00MB
 |  |  hash-table-id=01
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  |  runtime filters: RF002 <- o_orderkey
+|  |  runtime filters: RF004[bloom] <- o_orderkey
 |  |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=5,6 row-size=99B cardinality=822530
 |  |
@@ -1739,7 +1739,7 @@ Per-Host Resources: mem-estimate=194.00MB mem-reservation=68.00MB
 |  08:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
 |     partitions=1/1 files=3 size=193.92MB
 |     predicates: l_shipmode = 'F'
-|     runtime filters: RF002 -> l_orderkey
+|     runtime filters: RF004[bloom] -> l_orderkey
 |     stats-rows=6001215 extrapolated-rows=disabled
 |     table stats: rows=6001215 size=193.92MB
 |     column stats: all
@@ -1752,7 +1752,7 @@ Per-Host Resources: mem-estimate=194.00MB mem-reservation=68.00MB
 |  |  hash-table-id=00
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  |  runtime filters: RF001 <- o_orderkey
+|  |  runtime filters: RF002[bloom] <- o_orderkey
 |  |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=3,4 row-size=103B cardinality=1151542
 |  |
@@ -1782,7 +1782,7 @@ Per-Host Resources: mem-estimate=194.00MB mem-reservation=68.00MB
 |  |
 |  05:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
 |     partitions=1/1 files=3 size=193.92MB
-|     runtime filters: RF001 -> l_orderkey
+|     runtime filters: RF002[bloom] -> l_orderkey
 |     stats-rows=6001215 extrapolated-rows=disabled
 |     table stats: rows=6001215 size=193.92MB
 |     column stats: all
@@ -1809,7 +1809,7 @@ Per-Host Resources: mem-estimate=90.91MB mem-reservation=73.75MB
 |  hash-table-id=02
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=2.88MB mem-reservation=2.88MB spill-buffer=128.00KB
 |  tuple-ids=0,1 row-size=86B cardinality=575772
 |
@@ -1843,7 +1843,7 @@ Per-Host Resources: mem-estimate=160.00MB mem-reservation=0B
 01:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
    partitions=1/1 files=3 size=193.92MB
    predicates: l_tax > 10
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=193.92MB
    column stats: all
@@ -1909,7 +1909,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: o_orderkey = l_orderkey
-|  runtime filters: RF000 <- l_orderkey
+|  runtime filters: RF000[bloom] <- l_orderkey
 |  mem-estimate=4.75MB mem-reservation=4.75MB spill-buffer=256.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=575772
 |
@@ -1931,7 +1931,7 @@ PLAN-ROOT SINK
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002[bloom] <- c_custkey
 |  mem-estimate=8.50MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=5757710
 |
@@ -1946,13 +1946,13 @@ PLAN-ROOT SINK
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF002 <- o_orderkey
+|  runtime filters: RF004[bloom] <- o_orderkey
 |  mem-estimate=78.68MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=2,1 row-size=66B cardinality=5757710
 |
 |--01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF000 -> o_orderkey, RF001 -> o_custkey
+|     runtime filters: RF000[bloom] -> o_orderkey, RF002[bloom] -> o_custkey
 |     stats-rows=1500000 extrapolated-rows=disabled
 |     table stats: rows=1500000 size=162.56MB
 |     column stats: all
@@ -1961,7 +1961,7 @@ PLAN-ROOT SINK
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF002 -> l_orderkey
+   runtime filters: RF000[bloom] -> tpch.lineitem.l_orderkey, RF004[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=718.94MB
    column stats: all
@@ -2009,7 +2009,7 @@ Per-Host Resources: mem-estimate=115.78MB mem-reservation=84.12MB
 |
 07:HASH JOIN [LEFT SEMI JOIN, PARTITIONED]
 |  hash predicates: o_orderkey = l_orderkey
-|  runtime filters: RF000 <- l_orderkey
+|  runtime filters: RF000[bloom] <- l_orderkey
 |  mem-estimate=2.88MB mem-reservation=2.88MB spill-buffer=128.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=575772
 |
@@ -2043,7 +2043,7 @@ Per-Host Resources: mem-estimate=115.78MB mem-reservation=84.12MB
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002[bloom] <- c_custkey
 |  mem-estimate=8.50MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=5757710
 |
@@ -2064,7 +2064,7 @@ Per-Host Resources: mem-estimate=115.78MB mem-reservation=84.12MB
 05:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF002 <- o_orderkey
+|  runtime filters: RF004[bloom] <- o_orderkey
 |  mem-estimate=34.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=2,1 row-size=66B cardinality=5757710
 |
@@ -2076,7 +2076,7 @@ Per-Host Resources: mem-estimate=115.78MB mem-reservation=84.12MB
 |  Per-Host Resources: mem-estimate=88.00MB mem-reservation=0B
 |  01:SCAN HDFS [tpch.orders, RANDOM]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF000 -> o_orderkey, RF001 -> o_custkey
+|     runtime filters: RF000[bloom] -> o_orderkey, RF002[bloom] -> o_custkey
 |     stats-rows=1500000 extrapolated-rows=disabled
 |     table stats: rows=1500000 size=162.56MB
 |     column stats: all
@@ -2091,7 +2091,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
 Per-Host Resources: mem-estimate=88.00MB mem-reservation=0B
 02:SCAN HDFS [tpch.lineitem, RANDOM]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF002 -> l_orderkey
+   runtime filters: RF000[bloom] -> tpch.lineitem.l_orderkey, RF004[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=718.94MB
    column stats: all
@@ -2140,7 +2140,7 @@ Per-Host Resources: mem-estimate=175.68MB mem-reservation=122.88MB
 07:HASH JOIN [LEFT SEMI JOIN, PARTITIONED]
 |  hash-table-id=00
 |  hash predicates: o_orderkey = l_orderkey
-|  runtime filters: RF000 <- l_orderkey
+|  runtime filters: RF000[bloom] <- l_orderkey
 |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=575772
 |
@@ -2182,7 +2182,7 @@ Per-Host Resources: mem-estimate=175.68MB mem-reservation=122.88MB
 |  hash-table-id=01
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002[bloom] <- c_custkey
 |  mem-estimate=8.50MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=5757710
 |
@@ -2211,7 +2211,7 @@ Per-Host Resources: mem-estimate=175.68MB mem-reservation=122.88MB
 |  hash-table-id=02
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF002 <- o_orderkey
+|  runtime filters: RF004[bloom] <- o_orderkey
 |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=2,1 row-size=66B cardinality=5757710
 |
@@ -2230,7 +2230,7 @@ Per-Host Resources: mem-estimate=175.68MB mem-reservation=122.88MB
 |  Per-Host Resources: mem-estimate=176.00MB mem-reservation=0B
 |  01:SCAN HDFS [tpch.orders, RANDOM]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF000 -> o_orderkey, RF001 -> o_custkey
+|     runtime filters: RF000[bloom] -> o_orderkey, RF002[bloom] -> o_custkey
 |     stats-rows=1500000 extrapolated-rows=disabled
 |     table stats: rows=1500000 size=162.56MB
 |     column stats: all
@@ -2245,7 +2245,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
 Per-Host Resources: mem-estimate=176.00MB mem-reservation=0B
 02:SCAN HDFS [tpch.lineitem, RANDOM]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF002 -> l_orderkey
+   runtime filters: RF000[bloom] -> tpch.lineitem.l_orderkey, RF004[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=718.94MB
    column stats: all
@@ -2872,21 +2872,21 @@ PLAN-ROOT SINK
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.o_orderkey = t3.o_orderkey
 |  fk/pk conjuncts: t1.o_orderkey = t3.o_orderkey
-|  runtime filters: RF000 <- t3.o_orderkey
+|  runtime filters: RF000[bloom] <- t3.o_orderkey
 |  mem-estimate=37.77MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
 |
 |--05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t2.o_orderkey = t3.o_orderkey
 |  |  fk/pk conjuncts: t2.o_orderkey = t3.o_orderkey
-|  |  runtime filters: RF001 <- t3.o_orderkey
+|  |  runtime filters: RF002[bloom] <- t3.o_orderkey
 |  |  mem-estimate=34.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  |  tuple-ids=1,2,3 row-size=24B cardinality=1500000
 |  |
 |  |--04:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: t3.o_orderkey = t4.o_orderkey
 |  |  |  fk/pk conjuncts: t3.o_orderkey = t4.o_orderkey
-|  |  |  runtime filters: RF002 <- t4.o_orderkey
+|  |  |  runtime filters: RF004[bloom] <- t4.o_orderkey
 |  |  |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  |  tuple-ids=2,3 row-size=16B cardinality=1500000
 |  |  |
@@ -2900,7 +2900,7 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpch_parquet.orders t3]
 |  |     partitions=1/1 files=2 size=54.20MB
-|  |     runtime filters: RF002 -> t3.o_orderkey
+|  |     runtime filters: RF004[bloom] -> t3.o_orderkey
 |  |     stats-rows=1500000 extrapolated-rows=disabled
 |  |     table stats: rows=1500000 size=54.20MB
 |  |     column stats: all
@@ -2909,7 +2909,7 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [tpch_parquet.orders t2]
 |     partitions=1/1 files=2 size=54.20MB
-|     runtime filters: RF001 -> t2.o_orderkey
+|     runtime filters: RF002[bloom] -> t2.o_orderkey
 |     stats-rows=1500000 extrapolated-rows=disabled
 |     table stats: rows=1500000 size=54.20MB
 |     column stats: all
@@ -2918,7 +2918,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch_parquet.orders t1]
    partitions=1/1 files=2 size=54.20MB
-   runtime filters: RF000 -> t1.o_orderkey
+   runtime filters: RF000[bloom] -> t1.o_orderkey
    stats-rows=1500000 extrapolated-rows=disabled
    table stats: rows=1500000 size=54.20MB
    column stats: all
@@ -2942,7 +2942,7 @@ Per-Host Resources: mem-estimate=77.77MB mem-reservation=34.00MB
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: t1.o_orderkey = t3.o_orderkey
 |  fk/pk conjuncts: t1.o_orderkey = t3.o_orderkey
-|  runtime filters: RF000 <- t3.o_orderkey
+|  runtime filters: RF000[bloom] <- t3.o_orderkey
 |  mem-estimate=37.77MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
 |
@@ -2955,14 +2955,14 @@ Per-Host Resources: mem-estimate=77.77MB mem-reservation=34.00MB
 |  05:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  hash predicates: t2.o_orderkey = t3.o_orderkey
 |  |  fk/pk conjuncts: t2.o_orderkey = t3.o_orderkey
-|  |  runtime filters: RF001 <- t3.o_orderkey
+|  |  runtime filters: RF002[bloom] <- t3.o_orderkey
 |  |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=1,2,3 row-size=24B cardinality=1500000
 |  |
 |  |--04:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  |  hash predicates: t3.o_orderkey = t4.o_orderkey
 |  |  |  fk/pk conjuncts: t3.o_orderkey = t4.o_orderkey
-|  |  |  runtime filters: RF002 <- t4.o_orderkey
+|  |  |  runtime filters: RF004[bloom] <- t4.o_orderkey
 |  |  |  mem-estimate=8.50MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  |  |  tuple-ids=2,3 row-size=16B cardinality=1500000
 |  |  |
@@ -2988,7 +2988,7 @@ Per-Host Resources: mem-estimate=77.77MB mem-reservation=34.00MB
 |  |  Per-Host Resources: mem-estimate=40.00MB mem-reservation=0B
 |  |  02:SCAN HDFS [tpch_parquet.orders t3, RANDOM]
 |  |     partitions=1/1 files=2 size=54.20MB
-|  |     runtime filters: RF002 -> t3.o_orderkey
+|  |     runtime filters: RF004[bloom] -> t3.o_orderkey
 |  |     stats-rows=1500000 extrapolated-rows=disabled
 |  |     table stats: rows=1500000 size=54.20MB
 |  |     column stats: all
@@ -3003,7 +3003,7 @@ Per-Host Resources: mem-estimate=77.77MB mem-reservation=34.00MB
 |  Per-Host Resources: mem-estimate=40.00MB mem-reservation=0B
 |  01:SCAN HDFS [tpch_parquet.orders t2, RANDOM]
 |     partitions=1/1 files=2 size=54.20MB
-|     runtime filters: RF001 -> t2.o_orderkey
+|     runtime filters: RF002[bloom] -> t2.o_orderkey
 |     stats-rows=1500000 extrapolated-rows=disabled
 |     table stats: rows=1500000 size=54.20MB
 |     column stats: all
@@ -3012,7 +3012,7 @@ Per-Host Resources: mem-estimate=77.77MB mem-reservation=34.00MB
 |
 00:SCAN HDFS [tpch_parquet.orders t1, RANDOM]
    partitions=1/1 files=2 size=54.20MB
-   runtime filters: RF000 -> t1.o_orderkey
+   runtime filters: RF000[bloom] -> t1.o_orderkey
    stats-rows=1500000 extrapolated-rows=disabled
    table stats: rows=1500000 size=54.20MB
    column stats: all
@@ -3037,7 +3037,7 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=68.00MB
 |  hash-table-id=00
 |  hash predicates: t1.o_orderkey = t3.o_orderkey
 |  fk/pk conjuncts: t1.o_orderkey = t3.o_orderkey
-|  runtime filters: RF000 <- t3.o_orderkey
+|  runtime filters: RF000[bloom] <- t3.o_orderkey
 |  mem-estimate=37.77MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
 |
@@ -3058,7 +3058,7 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=68.00MB
 |  |  hash-table-id=01
 |  |  hash predicates: t2.o_orderkey = t3.o_orderkey
 |  |  fk/pk conjuncts: t2.o_orderkey = t3.o_orderkey
-|  |  runtime filters: RF001 <- t3.o_orderkey
+|  |  runtime filters: RF002[bloom] <- t3.o_orderkey
 |  |  mem-estimate=8.50MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  |  tuple-ids=1,2,3 row-size=24B cardinality=1500000
 |  |
@@ -3073,7 +3073,7 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=68.00MB
 |  |  |  hash-table-id=02
 |  |  |  hash predicates: t3.o_orderkey = t4.o_orderkey
 |  |  |  fk/pk conjuncts: t3.o_orderkey = t4.o_orderkey
-|  |  |  runtime filters: RF002 <- t4.o_orderkey
+|  |  |  runtime filters: RF004[bloom] <- t4.o_orderkey
 |  |  |  mem-estimate=4.75MB mem-reservation=4.75MB spill-buffer=256.00KB
 |  |  |  tuple-ids=2,3 row-size=16B cardinality=1500000
 |  |  |
@@ -3106,7 +3106,7 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=68.00MB
 |  |  Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
 |  |  02:SCAN HDFS [tpch_parquet.orders t3, RANDOM]
 |  |     partitions=1/1 files=2 size=54.20MB
-|  |     runtime filters: RF002 -> t3.o_orderkey
+|  |     runtime filters: RF004[bloom] -> t3.o_orderkey
 |  |     stats-rows=1500000 extrapolated-rows=disabled
 |  |     table stats: rows=1500000 size=54.20MB
 |  |     column stats: all
@@ -3121,7 +3121,7 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=68.00MB
 |  Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
 |  01:SCAN HDFS [tpch_parquet.orders t2, RANDOM]
 |     partitions=1/1 files=2 size=54.20MB
-|     runtime filters: RF001 -> t2.o_orderkey
+|     runtime filters: RF002[bloom] -> t2.o_orderkey
 |     stats-rows=1500000 extrapolated-rows=disabled
 |     table stats: rows=1500000 size=54.20MB
 |     column stats: all
@@ -3130,7 +3130,7 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=68.00MB
 |
 00:SCAN HDFS [tpch_parquet.orders t1, RANDOM]
    partitions=1/1 files=2 size=54.20MB
-   runtime filters: RF000 -> t1.o_orderkey
+   runtime filters: RF000[bloom] -> t1.o_orderkey
    stats-rows=1500000 extrapolated-rows=disabled
    table stats: rows=1500000 size=54.20MB
    column stats: all
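
The renumbering in the expected plans above follows a consistent pattern: within each join, the bloom filters of all equi-join conjuncts are assigned IDs first, and then one more ID per conjunct appears to be consumed by a min-max filter candidate, which is pruned when the target is not a Kudu scan. That would explain why surviving bloom filters land on RF000, RF002, RF004, ... for single-conjunct joins, and why higher explain levels now tag each filter with its type ([bloom] here, with min_max reserved for Kudu scans). A minimal sketch of that assumed allocation order (illustrative C++; the planner itself is Java, so none of these names are Impala's):

#include <cstdio>
#include <vector>

int main() {
  int next_id = 0;
  std::vector<int> conjuncts_per_join = {1, 1, 1};  // three single-conjunct joins
  for (size_t join = 0; join < conjuncts_per_join.size(); ++join) {
    int n = conjuncts_per_join[join];
    // Bloom filters for every conjunct of this join take consecutive IDs.
    for (int c = 0; c < n; ++c) {
      std::printf("join %zu: RF%03d[bloom]\n", join, next_id++);
    }
    // Min-max candidates consume the next n IDs but are pruned here
    // (no Kudu target), leaving the gaps seen in the plans above.
    next_id += n;
  }
  return 0;
}

Run as-is this prints RF000, RF002, RF004, matching the three-join plan above; with two conjuncts per join it yields RF000/RF001, RF004/RF005, RF008/RF009, as in the multi-predicate tests further down.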

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-propagation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-propagation.test b/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-propagation.test
index c1675af..5e60d35 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-propagation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-propagation.test
@@ -54,7 +54,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: t2.id = t3.tinyint_col
-|  runtime filters: RF001 <- t3.tinyint_col
+|  runtime filters: RF002 <- t3.tinyint_col
 |
 |--02:SCAN HDFS [functional.alltypestiny t3]
 |     partitions=4/4 files=4 size=460B
@@ -62,16 +62,16 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.year = t2.int_col
-|  runtime filters: RF002 <- t2.int_col
+|  runtime filters: RF004 <- t2.int_col
 |
 |--01:SCAN HDFS [functional.alltypesagg t2]
 |     partitions=11/11 files=11 size=814.73KB
 |     predicates: t2.bool_col = TRUE
-|     runtime filters: RF001 -> t2.id
+|     runtime filters: RF002 -> t2.id
 |
 00:SCAN HDFS [functional.alltypestiny t1]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF002 -> t1.year
+   runtime filters: RF004 -> t1.year
 ====
 # Two-way join query where multiple runtime filters are generated
 select straight_join * from functional.alltypesagg t1, functional.alltypesnopart t2
@@ -260,14 +260,14 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t2.int_col = t3.int_col
-|  |  runtime filters: RF002 <- t3.int_col
+|  |  runtime filters: RF004 <- t3.int_col
 |  |
 |  |--02:SCAN HDFS [functional.alltypesnopart t3]
 |  |     partitions=1/1 files=0 size=0B
 |  |
 |  01:SCAN HDFS [functional.alltypesnopart t2]
 |     partitions=1/1 files=0 size=0B
-|     runtime filters: RF002 -> t2.int_col
+|     runtime filters: RF004 -> t2.int_col
 |
 00:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
@@ -289,23 +289,23 @@ PLAN-ROOT SINK
 |
 |--05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t2.tinyint_col = t4.tinyint_col, t3.int_col = t4.int_col
-|  |  runtime filters: RF002 <- t4.tinyint_col, RF003 <- t4.int_col
+|  |  runtime filters: RF004 <- t4.tinyint_col, RF005 <- t4.int_col
 |  |
 |  |--03:SCAN HDFS [functional.alltypesnopart t4]
 |  |     partitions=1/1 files=0 size=0B
 |  |
 |  04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t2.id = t3.id
-|  |  runtime filters: RF004 <- t3.id
+|  |  runtime filters: RF008 <- t3.id
 |  |
 |  |--02:SCAN HDFS [functional.alltypesnopart t3]
 |  |     partitions=1/1 files=0 size=0B
-|  |     runtime filters: RF003 -> t3.int_col
+|  |     runtime filters: RF005 -> t3.int_col
 |  |
 |  01:SCAN HDFS [functional.alltypesnopart t2]
 |     partitions=1/1 files=0 size=0B
 |     predicates: t2.int_col = t2.id
-|     runtime filters: RF002 -> t2.tinyint_col, RF004 -> t2.id
+|     runtime filters: RF004 -> t2.tinyint_col, RF008 -> t2.id
 |
 00:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
@@ -329,7 +329,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.year = t3.int_col
-|  runtime filters: RF001 <- t3.int_col
+|  runtime filters: RF002 <- t3.int_col
 |
 |--02:SCAN HDFS [functional.alltypesnopart t3]
 |     partitions=1/1 files=0 size=0B
@@ -338,16 +338,16 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.year = t2.id
-|  runtime filters: RF002 <- t2.id
+|  runtime filters: RF004 <- t2.id
 |
 |--01:SCAN HDFS [functional.alltypesnopart t2]
 |     partitions=1/1 files=0 size=0B
 |     predicates: t2.bool_col = FALSE
-|     runtime filters: RF000 -> t2.id, RF001 -> t2.id
+|     runtime filters: RF000 -> t2.id, RF002 -> t2.id
 |
 00:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> t1.year, RF001 -> t1.year, RF002 -> t1.year
+   runtime filters: RF000 -> t1.year, RF002 -> t1.year, RF004 -> t1.year
 ====
 # Five-way cyclic join query
 select straight_join * from functional.alltypesagg t1, functional.alltypesnopart t2,
@@ -366,7 +366,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: t3.month = t4.bigint_col
-|  runtime filters: RF002 <- t4.bigint_col
+|  runtime filters: RF004 <- t4.bigint_col
 |
 |--03:SCAN HDFS [functional.alltypesnopart t4]
 |     partitions=1/1 files=0 size=0B
@@ -374,23 +374,23 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: t2.int_col = t3.tinyint_col
-|  runtime filters: RF003 <- t3.tinyint_col
+|  runtime filters: RF006 <- t3.tinyint_col
 |
 |--02:SCAN HDFS [functional.alltypessmall t3]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF002 -> t3.month
+|     runtime filters: RF004 -> t3.month
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.year = t2.id
-|  runtime filters: RF004 <- t2.id
+|  runtime filters: RF008 <- t2.id
 |
 |--01:SCAN HDFS [functional.alltypesnopart t2]
 |     partitions=1/1 files=0 size=0B
-|     runtime filters: RF003 -> t2.int_col
+|     runtime filters: RF006 -> t2.int_col
 |
 00:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> t1.month, RF004 -> t1.year
+   runtime filters: RF000 -> t1.month, RF008 -> t1.year
 ====
 # Two-way left outer join query; no runtime filters should be generated from the
 # ON-clause equi-join predicate
@@ -520,14 +520,14 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: t1.year = t2.int_col
-|  runtime filters: RF001 <- t2.int_col
+|  runtime filters: RF002 <- t2.int_col
 |
 |--01:SCAN HDFS [functional.alltypesnopart t2]
 |     partitions=1/1 files=0 size=0B
 |
 00:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> t1.month, RF001 -> t1.year
+   runtime filters: RF000 -> t1.month, RF002 -> t1.year
 ====
 # Query with a subquery that is converted to a null-aware left anti join
 select straight_join * from functional.alltypesagg t1
@@ -621,7 +621,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: month = t2.int_col
-|  runtime filters: RF001 <- t2.int_col
+|  runtime filters: RF002 <- t2.int_col
 |
 |--02:SCAN HDFS [functional.alltypesnopart t2]
 |     partitions=1/1 files=0 size=0B
@@ -631,7 +631,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> t1.year, RF001 -> t1.month
+   runtime filters: RF000 -> t1.year, RF002 -> t1.month
 ====
 # Four-way join query between an inline view with an aggregation and three base tables
 select straight_join 1 from
@@ -652,7 +652,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: year = b.year
-|  runtime filters: RF001 <- b.year
+|  runtime filters: RF002 <- b.year
 |
 |--03:SCAN HDFS [functional.alltypestiny b]
 |     partitions=4/4 files=4 size=460B
@@ -661,18 +661,18 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: year = a.year
-|  runtime filters: RF002 <- a.year
+|  runtime filters: RF004 <- a.year
 |
 |--02:SCAN HDFS [functional.alltypestiny a]
 |     partitions=4/4 files=4 size=460B
-|     runtime filters: RF000 -> a.year, RF001 -> a.year
+|     runtime filters: RF000 -> a.year, RF002 -> a.year
 |
 01:AGGREGATE [FINALIZE]
 |  group by: id, year, month
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> functional.alltypes.year, RF001 -> functional.alltypes.year, RF002 -> functional.alltypes.year
+   runtime filters: RF000 -> functional.alltypes.year, RF002 -> functional.alltypes.year, RF004 -> functional.alltypes.year
 ====
 # Two-way join query with an inline view in the probe side of the join where the
 # scan node to apply the filter is below a top-n (order by with limit) operator
@@ -818,7 +818,7 @@ PLAN-ROOT SINK
 |
 |--06:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t3.month = t4.smallint_col
-|  |  runtime filters: RF001 <- t4.smallint_col
+|  |  runtime filters: RF002 <- t4.smallint_col
 |  |
 |  |--05:SCAN HDFS [functional.alltypesnopart t4]
 |  |     partitions=1/1 files=0 size=0B
@@ -826,7 +826,7 @@ PLAN-ROOT SINK
 |  |
 |  04:SCAN HDFS [functional.alltypes t3]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF001 -> t3.month
+|     runtime filters: RF002 -> t3.month
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.year = t2.int_col
@@ -975,7 +975,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.year = t2.id + t3.id + 1
-|  runtime filters: RF001 <- t2.id + t3.id + 1
+|  runtime filters: RF002 <- t2.id + t3.id + 1
 |
 |--03:HASH JOIN [LEFT OUTER JOIN]
 |  |  hash predicates: t2.id = t3.id
@@ -990,7 +990,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> t1.month, RF001 -> t1.year
+   runtime filters: RF000 -> t1.month, RF002 -> t1.year
 ====
 # Multi-way join query where the slots of all the join predicates belong to the same
 # equivalence class
@@ -1010,7 +1010,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.id = t2.id
-|  runtime filters: RF001 <- t2.id
+|  runtime filters: RF002 <- t2.id
 |
 |--01:SCAN HDFS [functional.alltypestiny t2]
 |     partitions=4/4 files=4 size=460B
@@ -1018,7 +1018,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny t1]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> t1.id, RF001 -> t1.id
+   runtime filters: RF000 -> t1.id, RF002 -> t1.id
 ====
 # Equivalent query to the one above; the same runtime filters should be generated
 select straight_join 1 from functional.alltypestiny t1 join functional.alltypestiny t2 on t1.id = t2.id
@@ -1037,7 +1037,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.id = t2.id
-|  runtime filters: RF001 <- t2.id
+|  runtime filters: RF002 <- t2.id
 |
 |--01:SCAN HDFS [functional.alltypestiny t2]
 |     partitions=4/4 files=4 size=460B
@@ -1045,7 +1045,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny t1]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> t1.id, RF001 -> t1.id
+   runtime filters: RF000 -> t1.id, RF002 -> t1.id
 ====
 # Check that runtime filters are not generated in subplans
 select straight_join 1 from tpch_nested_parquet.customer c,
@@ -1124,7 +1124,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.id = t3.id
-|  runtime filters: RF001 <- t3.id
+|  runtime filters: RF002 <- t3.id
 |
 |--02:SCAN HDFS [functional.alltypestiny t3]
 |     partitions=4/4 files=4 size=460B
@@ -1132,15 +1132,15 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.id = t2.id
-|  runtime filters: RF002 <- t2.id
+|  runtime filters: RF004 <- t2.id
 |
 |--01:SCAN HDFS [functional.alltypestiny t2]
 |     partitions=4/4 files=4 size=460B
-|     runtime filters: RF000 -> t2.id + t2.id, RF001 -> t2.id
+|     runtime filters: RF000 -> t2.id + t2.id, RF002 -> t2.id
 |
 00:SCAN HDFS [functional.alltypestiny t1]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> t1.id + t1.id, RF001 -> t1.id, RF002 -> t1.id
+   runtime filters: RF000 -> t1.id + t1.id, RF002 -> t1.id, RF004 -> t1.id
 ====
 # IMPALA-3074: Generated runtime filter has multiple candidate target nodes not all of
 # which are valid due to type mismatch between the associated source and target
@@ -1160,7 +1160,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: a1.int_col = a3.smallint_col
-|  runtime filters: RF001 <- a3.smallint_col
+|  runtime filters: RF002 <- a3.smallint_col
 |
 |--01:SCAN HDFS [functional.alltypestiny a3]
 |     partitions=4/4 files=4 size=460B
@@ -1168,7 +1168,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny a1]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF001 -> a1.int_col
+   runtime filters: RF002 -> a1.int_col
 ====
 # IMPALA-3574: Runtime filter generated from a target expr that contains a TupleIsNull
 # predicate.
@@ -1195,7 +1195,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: int_col = int_col
-|  runtime filters: RF001 <- int_col
+|  runtime filters: RF002 <- int_col
 |
 |--01:SCAN HDFS [functional.alltypes]
 |     partitions=24/24 files=24 size=478.45KB
@@ -1203,7 +1203,7 @@ PLAN-ROOT SINK
 |
 02:SCAN HDFS [functional.alltypesagg]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> coalesce(int_col, 384), RF001 -> int_col
+   runtime filters: RF000 -> coalesce(int_col, 384), RF002 -> int_col
 ====
 # IMPALA-4076: Test pruning the least selective runtime filters to obey
 # MAX_NUM_RUNTIME_FILTERS in the presence of zero-cardinality plan nodes. This query was
@@ -1363,14 +1363,14 @@ PLAN-ROOT SINK
 |
 |--11:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.bigint_col = b.bigint_col, a.bool_col = b.bool_col, a.double_col = b.double_col, a.float_col = b.float_col, a.id = b.id, a.int_col = b.int_col, a.smallint_col = b.smallint_col, a.tinyint_col = b.tinyint_col
-|  |  runtime filters: RF016 <- b.bigint_col, RF017 <- b.bool_col, RF018 <- b.double_col, RF019 <- b.float_col, RF020 <- b.id, RF021 <- b.int_col, RF022 <- b.smallint_col, RF023 <- b.tinyint_col
+|  |  runtime filters: RF032 <- b.bigint_col, RF033 <- b.bool_col, RF034 <- b.double_col, RF035 <- b.float_col, RF036 <- b.id, RF037 <- b.int_col, RF038 <- b.smallint_col, RF039 <- b.tinyint_col
 |  |
 |  |--10:SCAN HDFS [functional.alltypestiny b]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  09:SCAN HDFS [functional.alltypes a]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF016 -> a.bigint_col, RF017 -> a.bool_col, RF018 -> a.double_col, RF019 -> a.float_col, RF020 -> a.id, RF021 -> a.int_col, RF022 -> a.smallint_col, RF023 -> a.tinyint_col
+|     runtime filters: RF032 -> a.bigint_col, RF033 -> a.bool_col, RF034 -> a.double_col, RF035 -> a.float_col, RF036 -> a.id, RF037 -> a.int_col, RF038 -> a.smallint_col, RF039 -> a.tinyint_col
 |
 30:NESTED LOOP JOIN [CROSS JOIN]
 |
@@ -1387,14 +1387,14 @@ PLAN-ROOT SINK
 |
 |--05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.bool_col = b.bool_col, a.id = b.id
-|  |  runtime filters: RF006 <- b.bool_col, RF007 <- b.id
+|  |  runtime filters: RF012 <- b.bool_col, RF013 <- b.id
 |  |
 |  |--04:SCAN HDFS [functional.alltypestiny b]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  03:SCAN HDFS [functional.alltypes a]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF006 -> a.bool_col, RF007 -> a.id
+|     runtime filters: RF012 -> a.bool_col, RF013 -> a.id
 |
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: a.bigint_col = b.bigint_col, a.bool_col = b.bool_col, a.id = b.id, a.int_col = b.int_col, a.smallint_col = b.smallint_col, a.tinyint_col = b.tinyint_col

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-query-options.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-query-options.test b/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-query-options.test
index a8909be..5188d25 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-query-options.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/runtime-filter-query-options.test
@@ -20,21 +20,21 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF002 <- c.int_col, RF003 <- c.month
+|  runtime filters: RF004 <- c.int_col, RF005 <- c.month
 |
 |--02:SCAN HDFS [functional.alltypes c]
 |     partitions=24/24 files=24 size=478.45KB
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = b.id, a.date_string_col = b.date_string_col
-|  runtime filters: RF004 <- b.id, RF005 <- b.date_string_col
+|  runtime filters: RF008 <- b.id, RF009 <- b.date_string_col
 |
 |--01:SCAN HDFS [functional.alltypes b]
 |     partitions=24/24 files=24 size=478.45KB
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> a.bool_col, RF001 -> a.year, RF002 -> a.int_col, RF003 -> a.month, RF004 -> a.id, RF005 -> a.date_string_col
+   runtime filters: RF000 -> a.bool_col, RF001 -> a.year, RF004 -> a.int_col, RF005 -> a.month, RF008 -> a.id, RF009 -> a.date_string_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -59,7 +59,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF002 <- c.int_col, RF003 <- c.month
+|  runtime filters: RF004 <- c.int_col, RF005 <- c.month
 |
 |--09:EXCHANGE [BROADCAST]
 |  |
@@ -68,7 +68,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.id = b.id, a.date_string_col = b.date_string_col
-|  runtime filters: RF004 <- b.id, RF005 <- b.date_string_col
+|  runtime filters: RF008 <- b.id, RF009 <- b.date_string_col
 |
 |--08:EXCHANGE [BROADCAST]
 |  |
@@ -77,7 +77,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> a.bool_col, RF001 -> a.year, RF002 -> a.int_col, RF003 -> a.month, RF004 -> a.id, RF005 -> a.date_string_col
+   runtime filters: RF000 -> a.bool_col, RF001 -> a.year, RF004 -> a.int_col, RF005 -> a.month, RF008 -> a.id, RF009 -> a.date_string_col
 ====
 # Keep only MAX_NUM_RUNTIME_FILTERS most selective filters, remove the rest.
 # In this query RF000 (<- d.bool_col) and RF001 (<- d.year) are the least selective
@@ -113,7 +113,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF002 <- c.int_col, RF003 <- c.month
+|  runtime filters: RF004 <- c.int_col, RF005 <- c.month
 |
 |--09:EXCHANGE [BROADCAST]
 |  |
@@ -122,7 +122,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.id = b.id, a.date_string_col = b.date_string_col
-|  runtime filters: RF004 <- b.id, RF005 <- b.date_string_col
+|  runtime filters: RF008 <- b.id, RF009 <- b.date_string_col
 |
 |--08:EXCHANGE [BROADCAST]
 |  |
@@ -131,7 +131,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF004 -> a.id, RF005 -> a.date_string_col, RF002 -> a.int_col, RF003 -> a.month
+   runtime filters: RF008 -> a.id, RF009 -> a.date_string_col, RF004 -> a.int_col, RF005 -> a.month
 ====
 # DISABLE_ROW_RUNTIME_FILTERING is set: only partition column filters are applied.
 select /* +straight_join */ count(*) from functional.alltypes a
@@ -157,7 +157,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF003 <- c.month
+|  runtime filters: RF005 <- c.month
 |
 |--02:SCAN HDFS [functional.alltypes c]
 |     partitions=24/24 files=24 size=478.45KB
@@ -170,7 +170,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF001 -> a.year, RF003 -> a.month
+   runtime filters: RF001 -> a.year, RF005 -> a.month
 ====
 # DISABLE_ROW_RUNTIME_FILTERING is set and MAX_NUM_RUNTIME_FILTERS is set to 2: only the 2
 # partition column filters are applied
@@ -198,7 +198,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF003 <- c.month
+|  runtime filters: RF005 <- c.month
 |
 |--02:SCAN HDFS [functional.alltypes c]
 |     partitions=24/24 files=24 size=478.45KB
@@ -211,7 +211,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF001 -> a.year, RF003 -> a.month
+   runtime filters: RF001 -> a.year, RF005 -> a.month
 ====
 # RUNTIME_FILTER_MODE is set to LOCAL: only local filters are applied
 select /* +straight_join */ count(*) from functional.alltypes a
@@ -237,21 +237,21 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF002 <- c.int_col, RF003 <- c.month
+|  runtime filters: RF004 <- c.int_col, RF005 <- c.month
 |
 |--02:SCAN HDFS [functional.alltypes c]
 |     partitions=24/24 files=24 size=478.45KB
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = b.id, a.date_string_col = b.date_string_col
-|  runtime filters: RF004 <- b.id, RF005 <- b.date_string_col
+|  runtime filters: RF008 <- b.id, RF009 <- b.date_string_col
 |
 |--01:SCAN HDFS [functional.alltypes b]
 |     partitions=24/24 files=24 size=478.45KB
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> a.bool_col, RF001 -> a.year, RF002 -> a.int_col, RF003 -> a.month, RF004 -> a.id, RF005 -> a.date_string_col
+   runtime filters: RF000 -> a.bool_col, RF001 -> a.year, RF004 -> a.int_col, RF005 -> a.month, RF008 -> a.id, RF009 -> a.date_string_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -275,7 +275,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF002 <- c.int_col, RF003 <- c.month
+|  runtime filters: RF004 <- c.int_col, RF005 <- c.month
 |
 |--09:EXCHANGE [BROADCAST]
 |  |
@@ -284,7 +284,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.id = b.id, a.date_string_col = b.date_string_col
-|  runtime filters: RF004 <- b.id, RF005 <- b.date_string_col
+|  runtime filters: RF008 <- b.id, RF009 <- b.date_string_col
 |
 |--08:EXCHANGE [BROADCAST]
 |  |
@@ -293,7 +293,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF002 -> a.int_col, RF003 -> a.month, RF004 -> a.id, RF005 -> a.date_string_col
+   runtime filters: RF004 -> a.int_col, RF005 -> a.month, RF008 -> a.id, RF009 -> a.date_string_col
 ====
 # RUNTIME_FILTER_MODE is set to LOCAL and MAX_NUM_RUNTIME_FILTERS is set to 3: only 3
 # local filters are kept, which means that both local and non-local filters are removed
@@ -321,21 +321,21 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF002 <- c.int_col
+|  runtime filters: RF004 <- c.int_col
 |
 |--02:SCAN HDFS [functional.alltypes c]
 |     partitions=24/24 files=24 size=478.45KB
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = b.id, a.date_string_col = b.date_string_col
-|  runtime filters: RF004 <- b.id, RF005 <- b.date_string_col
+|  runtime filters: RF008 <- b.id, RF009 <- b.date_string_col
 |
 |--01:SCAN HDFS [functional.alltypes b]
 |     partitions=24/24 files=24 size=478.45KB
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF004 -> a.id, RF005 -> a.date_string_col, RF002 -> a.int_col
+   runtime filters: RF008 -> a.id, RF009 -> a.date_string_col, RF004 -> a.int_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -359,7 +359,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF002 <- c.int_col
+|  runtime filters: RF004 <- c.int_col
 |
 |--09:EXCHANGE [BROADCAST]
 |  |
@@ -368,7 +368,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.id = b.id, a.date_string_col = b.date_string_col
-|  runtime filters: RF004 <- b.id, RF005 <- b.date_string_col
+|  runtime filters: RF008 <- b.id, RF009 <- b.date_string_col
 |
 |--08:EXCHANGE [BROADCAST]
 |  |
@@ -377,7 +377,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF004 -> a.id, RF005 -> a.date_string_col, RF002 -> a.int_col
+   runtime filters: RF008 -> a.id, RF009 -> a.date_string_col, RF004 -> a.int_col
 ====
 # DISABLE_ROW_RUNTIME_FILTERING is set and RUNTIME_FILTER_MODE is set to LOCAL: only local
 # partition column filters are applied
@@ -405,7 +405,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF003 <- c.month
+|  runtime filters: RF005 <- c.month
 |
 |--02:SCAN HDFS [functional.alltypes c]
 |     partitions=24/24 files=24 size=478.45KB
@@ -418,7 +418,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF001 -> a.year, RF003 -> a.month
+   runtime filters: RF001 -> a.year, RF005 -> a.month
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -442,7 +442,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.int_col = c.int_col, a.month = c.month
-|  runtime filters: RF003 <- c.month
+|  runtime filters: RF005 <- c.month
 |
 |--09:EXCHANGE [BROADCAST]
 |  |
@@ -459,7 +459,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF003 -> a.month
+   runtime filters: RF005 -> a.month
 ====
 # RUNTIME_FILTER_MODE is OFF: no filters are applied
 select /* +straight_join */ count(*) from functional.alltypes a
@@ -533,3 +533,21 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
 ====
+# DISABLE_ROW_RUNTIME_FILTERING completely disables filters for Kudu.
+select /* +straight_join */ count(*) from functional_kudu.alltypes a
+  join functional_kudu.alltypes b on a.id = b.id
+---- QUERYOPTIONS
+DISABLE_ROW_RUNTIME_FILTERING=TRUE
+---- PLAN
+PLAN-ROOT SINK
+|
+03:AGGREGATE [FINALIZE]
+|  output: count(*)
+|
+02:HASH JOIN [INNER JOIN]
+|  hash predicates: a.id = b.id
+|
+|--01:SCAN KUDU [functional_kudu.alltypes b]
+|
+00:SCAN KUDU [functional_kudu.alltypes a]
+====
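
The new test case above pins down an interaction worth spelling out: filters targeting a Kudu scan are applied per row (a Kudu table exposes no HDFS-style partition columns to prune), so when DISABLE_ROW_RUNTIME_FILTERING is set no filter can legally survive, and the plan shows none. A hedged sketch of that eligibility rule follows; the names are illustrative, not Impala's actual planner API:

#include <cstdio>

struct FilterTarget {
  bool is_bound_by_partition_columns;  // only meaningful for HDFS scans
  bool is_kudu_scan;
};

// With DISABLE_ROW_RUNTIME_FILTERING, only partition-pruning filters survive;
// Kudu targets are always row-level, so none of their filters qualify.
bool KeepFilter(const FilterTarget& t, bool disable_row_runtime_filtering) {
  if (!disable_row_runtime_filtering) return true;
  return t.is_bound_by_partition_columns && !t.is_kudu_scan;
}

int main() {
  FilterTarget kudu{false, true};
  std::printf("kudu filter kept: %s\n",
      KeepFilter(kudu, /*disable_row_runtime_filtering=*/true) ? "yes" : "no");
  return 0;  // prints "no", matching the filter-free Kudu plan above
}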

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test b/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
index 719b07d..6520238 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
@@ -20,7 +20,7 @@ Per-Host Resources: mem-estimate=25.94MB mem-reservation=1.94MB
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c_nationkey = n_nationkey
 |  fk/pk conjuncts: c_nationkey = n_nationkey
-|  runtime filters: RF000 <- n_nationkey
+|  runtime filters: RF000[bloom] <- n_nationkey
 |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  tuple-ids=0,1 row-size=355B cardinality=150000
 |
@@ -40,7 +40,7 @@ Per-Host Resources: mem-estimate=25.94MB mem-reservation=1.94MB
 |
 00:SCAN HDFS [tpch_parquet.customer, RANDOM]
    partitions=1/1 files=1 size=12.34MB
-   runtime filters: RF000 -> c_nationkey
+   runtime filters: RF000[bloom] -> c_nationkey
    stats-rows=150000 extrapolated-rows=disabled
    table stats: rows=150000 size=12.34MB
    column stats: all
@@ -65,7 +65,7 @@ Per-Host Resources: mem-estimate=51.88MB mem-reservation=3.88MB
 |  hash-table-id=00
 |  hash predicates: c_nationkey = n_nationkey
 |  fk/pk conjuncts: c_nationkey = n_nationkey
-|  runtime filters: RF000 <- n_nationkey
+|  runtime filters: RF000[bloom] <- n_nationkey
 |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  tuple-ids=0,1 row-size=355B cardinality=150000
 |
@@ -92,7 +92,7 @@ Per-Host Resources: mem-estimate=51.88MB mem-reservation=3.88MB
 |
 00:SCAN HDFS [tpch_parquet.customer, RANDOM]
    partitions=1/1 files=1 size=12.34MB
-   runtime filters: RF000 -> c_nationkey
+   runtime filters: RF000[bloom] -> c_nationkey
    stats-rows=150000 extrapolated-rows=disabled
    table stats: rows=150000 size=12.34MB
    column stats: all
@@ -218,7 +218,7 @@ Per-Host Resources: mem-estimate=34.00MB mem-reservation=34.00MB
 02:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
-|  runtime filters: RF000 <- c_custkey
+|  runtime filters: RF000[bloom] <- c_custkey
 |  mem-estimate=34.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
@@ -244,7 +244,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=2
 Per-Host Resources: mem-estimate=40.00MB mem-reservation=0B
 00:SCAN HDFS [tpch_parquet.orders, RANDOM]
    partitions=1/1 files=2 size=54.20MB
-   runtime filters: RF000 -> o_custkey
+   runtime filters: RF000[bloom] -> o_custkey
    stats-rows=1500000 extrapolated-rows=disabled
    table stats: rows=1500000 size=54.20MB
    column stats: all
@@ -269,7 +269,7 @@ Per-Host Resources: mem-estimate=34.00MB mem-reservation=34.00MB
 |  hash-table-id=00
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
-|  runtime filters: RF000 <- c_custkey
+|  runtime filters: RF000[bloom] <- c_custkey
 |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
@@ -302,7 +302,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
 Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
 00:SCAN HDFS [tpch_parquet.orders, RANDOM]
    partitions=1/1 files=2 size=54.20MB
-   runtime filters: RF000 -> o_custkey
+   runtime filters: RF000[bloom] -> o_custkey
    stats-rows=1500000 extrapolated-rows=disabled
    table stats: rows=1500000 size=54.20MB
    column stats: all
@@ -331,7 +331,7 @@ Per-Host Resources: mem-estimate=77.38MB mem-reservation=34.00MB
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
-|  runtime filters: RF000 <- c_custkey
+|  runtime filters: RF000[bloom] <- c_custkey
 |  mem-estimate=37.38MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
@@ -351,7 +351,7 @@ Per-Host Resources: mem-estimate=77.38MB mem-reservation=34.00MB
 |
 00:SCAN HDFS [tpch_parquet.orders, RANDOM]
    partitions=1/1 files=2 size=54.20MB
-   runtime filters: RF000 -> o_custkey
+   runtime filters: RF000[bloom] -> o_custkey
    stats-rows=1500000 extrapolated-rows=disabled
    table stats: rows=1500000 size=54.20MB
    column stats: all
@@ -376,7 +376,7 @@ Per-Host Resources: mem-estimate=154.76MB mem-reservation=68.00MB
 |  hash-table-id=00
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
-|  runtime filters: RF000 <- c_custkey
+|  runtime filters: RF000[bloom] <- c_custkey
 |  mem-estimate=37.38MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
@@ -403,7 +403,7 @@ Per-Host Resources: mem-estimate=154.76MB mem-reservation=68.00MB
 |
 00:SCAN HDFS [tpch_parquet.orders, RANDOM]
    partitions=1/1 files=2 size=54.20MB
-   runtime filters: RF000 -> o_custkey
+   runtime filters: RF000[bloom] -> o_custkey
    stats-rows=1500000 extrapolated-rows=disabled
    table stats: rows=1500000 size=54.20MB
    column stats: all
@@ -639,7 +639,7 @@ Per-Host Resources: mem-estimate=71.12MB mem-reservation=51.00MB
 02:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=0,1 row-size=33B cardinality=5757710
 |
@@ -665,7 +665,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
 Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
    partitions=1/1 files=3 size=193.92MB
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=193.92MB
    column stats: all
@@ -709,7 +709,7 @@ Per-Host Resources: mem-estimate=85.00MB mem-reservation=85.00MB
 |  hash-table-id=00
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=8.50MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=0,1 row-size=33B cardinality=5757710
 |
@@ -742,7 +742,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
 Per-Host Resources: mem-estimate=160.00MB mem-reservation=0B
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
    partitions=1/1 files=3 size=193.92MB
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=193.92MB
    column stats: all
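
The only change in the spillable-buffer-sizing expectations above is cosmetic: at EXTENDED explain levels each runtime filter is now printed with a type tag. Bloom is the only type that appears for HDFS scans in these plans; the Kudu min-max filters this commit introduces would presumably render as min_max. A small sketch of the assumed display convention (the real formatting code in Impala may differ):

#include <cstdio>
#include <string>

enum class RuntimeFilterType { BLOOM, MIN_MAX };

// Hypothetical formatter reproducing the strings in the expected plans.
std::string FilterDebugString(int id, RuntimeFilterType type) {
  char buf[32];
  std::snprintf(buf, sizeof(buf), "RF%03d[%s]", id,
      type == RuntimeFilterType::BLOOM ? "bloom" : "min_max");
  return std::string(buf);
}

int main() {
  // -> RF000[bloom], as in the scan nodes above.
  std::printf("%s\n", FilterDebugString(0, RuntimeFilterType::BLOOM).c_str());
  return 0;
}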

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
index 9823a8b..790bcc3 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
@@ -191,14 +191,14 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = t.id
-|  runtime filters: RF001 <- t.id
+|  runtime filters: RF002 <- t.id
 |
 |--01:SCAN HDFS [functional.alltypes t]
 |     partitions=24/24 files=24 size=478.45KB
 |
 00:SCAN HDFS [functional.alltypesagg a]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> a.int_col, RF001 -> a.id
+   runtime filters: RF000 -> a.int_col, RF002 -> a.id
 ====
 # Multiple tables in the subquery
 select count(*)
@@ -220,14 +220,14 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s.int_col = t.int_col
-|  |  runtime filters: RF002 <- t.int_col
+|  |  runtime filters: RF004 <- t.int_col
 |  |
 |  |--02:SCAN HDFS [functional.alltypestiny t]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  01:SCAN HDFS [functional.alltypessmall s]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF002 -> s.int_col
+|     runtime filters: RF004 -> s.int_col
 |
 00:SCAN HDFS [functional.alltypesagg a]
    partitions=11/11 files=11 size=814.73KB
@@ -321,7 +321,7 @@ PLAN-ROOT SINK
 |
 |--05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s.bigint_col = n.bigint_col
-|  |  runtime filters: RF001 <- n.bigint_col
+|  |  runtime filters: RF002 <- n.bigint_col
 |  |
 |  |--03:SCAN HDFS [functional.alltypestiny n]
 |  |     partitions=4/4 files=4 size=460B
@@ -329,15 +329,15 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t.id = s.id
-|  |  runtime filters: RF002 <- s.id
+|  |  runtime filters: RF004 <- s.id
 |  |
 |  |--02:SCAN HDFS [functional.alltypessmall s]
 |  |     partitions=4/4 files=4 size=6.32KB
-|  |     runtime filters: RF001 -> s.bigint_col
+|  |     runtime filters: RF002 -> s.bigint_col
 |  |
 |  01:SCAN HDFS [functional.alltypes t]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF002 -> t.id
+|     runtime filters: RF004 -> t.id
 |
 00:SCAN HDFS [functional.alltypesagg a]
    partitions=11/11 files=11 size=814.73KB
@@ -359,7 +359,7 @@ PLAN-ROOT SINK
 |
 |--04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: id = a.id
-|  |  runtime filters: RF001 <- a.id
+|  |  runtime filters: RF002 <- a.id
 |  |
 |  |--01:SCAN HDFS [functional.alltypestiny a]
 |  |     partitions=4/4 files=4 size=460B
@@ -371,7 +371,7 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [functional.alltypessmall]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF001 -> functional.alltypessmall.id
+|     runtime filters: RF002 -> functional.alltypessmall.id
 |
 00:SCAN HDFS [functional.alltypes t]
    partitions=24/24 files=24 size=478.45KB
@@ -416,22 +416,22 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: g.string_col = a.string_col
-|  runtime filters: RF001 <- a.string_col
+|  runtime filters: RF002 <- a.string_col
 |
 |--02:HASH JOIN [LEFT SEMI JOIN]
 |  |  hash predicates: id = id
-|  |  runtime filters: RF003 <- id
+|  |  runtime filters: RF006 <- id
 |  |
 |  |--01:SCAN HDFS [functional.alltypestiny]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  00:SCAN HDFS [functional.alltypes a]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF000 -> a.int_col, RF003 -> id
+|     runtime filters: RF000 -> a.int_col, RF006 -> id
 |
 05:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: g.id = id
-|  runtime filters: RF002 <- id
+|  runtime filters: RF004 <- id
 |
 |--04:SCAN HDFS [functional.alltypes]
 |     partitions=24/24 files=24 size=478.45KB
@@ -439,7 +439,7 @@ PLAN-ROOT SINK
 03:SCAN HDFS [functional.alltypesagg g]
    partitions=11/11 files=11 size=814.73KB
    predicates: g.bool_col = FALSE
-   runtime filters: RF001 -> g.string_col, RF002 -> g.id
+   runtime filters: RF002 -> g.string_col, RF004 -> g.id
 ====
 # Correlated subqueries
 select *
@@ -479,7 +479,7 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [LEFT SEMI JOIN]
 |  |  hash predicates: int_col = int_col
-|  |  runtime filters: RF001 <- int_col
+|  |  runtime filters: RF002 <- int_col
 |  |
 |  |--02:SCAN HDFS [functional.alltypestiny]
 |  |     partitions=4/4 files=4 size=460B
@@ -487,7 +487,7 @@ PLAN-ROOT SINK
 |  01:SCAN HDFS [functional.alltypesagg]
 |     partitions=11/11 files=11 size=814.73KB
 |     predicates: bool_col = FALSE
-|     runtime filters: RF001 -> int_col
+|     runtime filters: RF002 -> int_col
 |
 00:SCAN HDFS [functional.alltypes t]
    partitions=24/24 files=24 size=478.45KB
@@ -511,14 +511,14 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [LEFT SEMI JOIN]
 |  |  hash predicates: a.bigint_col = s.bigint_col, a.tinyint_col = tinyint_col
-|  |  runtime filters: RF002 <- s.bigint_col, RF003 <- tinyint_col
+|  |  runtime filters: RF004 <- s.bigint_col, RF005 <- tinyint_col
 |  |
 |  |--02:SCAN HDFS [functional.alltypestiny s]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  01:SCAN HDFS [functional.alltypesagg a]
 |     partitions=11/11 files=11 size=814.73KB
-|     runtime filters: RF002 -> a.bigint_col, RF003 -> a.tinyint_col
+|     runtime filters: RF004 -> a.bigint_col, RF005 -> a.tinyint_col
 |
 00:SCAN HDFS [functional.alltypes t]
    partitions=24/24 files=24 size=478.45KB
@@ -539,14 +539,14 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [LEFT SEMI JOIN]
 |  |  hash predicates: a.bigint_col = s.bigint_col, a.int_col = int_col
-|  |  runtime filters: RF001 <- s.bigint_col, RF002 <- int_col
+|  |  runtime filters: RF002 <- s.bigint_col, RF003 <- int_col
 |  |
 |  |--02:SCAN HDFS [functional.alltypestiny s]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  01:SCAN HDFS [functional.alltypesagg a]
 |     partitions=11/11 files=11 size=814.73KB
-|     runtime filters: RF001 -> a.bigint_col, RF002 -> a.int_col
+|     runtime filters: RF002 -> a.bigint_col, RF003 -> a.int_col
 |
 00:SCAN HDFS [functional.alltypes t]
    partitions=24/24 files=24 size=478.45KB
@@ -591,7 +591,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = t.id
-|  runtime filters: RF002 <- t.id
+|  runtime filters: RF004 <- t.id
 |
 |--01:SCAN HDFS [functional.alltypes t]
 |     partitions=24/24 files=24 size=478.45KB
@@ -600,7 +600,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypesagg a]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> a.id, RF002 -> a.id
+   runtime filters: RF000 -> a.id, RF004 -> a.id
 ====
 # Correlated EXISTS
 select count(*)
@@ -846,7 +846,7 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [RIGHT SEMI JOIN]
 |  |  hash predicates: g.int_col = t.int_col
-|  |  runtime filters: RF001 <- t.int_col
+|  |  runtime filters: RF002 <- t.int_col
 |  |
 |  |--01:SCAN HDFS [functional.alltypestiny t]
 |  |     partitions=4/4 files=4 size=460B
@@ -854,7 +854,7 @@ PLAN-ROOT SINK
 |  02:SCAN HDFS [functional.alltypesagg g]
 |     partitions=11/11 files=11 size=814.73KB
 |     predicates: g.bool_col = FALSE
-|     runtime filters: RF001 -> g.int_col
+|     runtime filters: RF002 -> g.int_col
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
@@ -893,11 +893,11 @@ PLAN-ROOT SINK
 |
 |--08:HASH JOIN [RIGHT SEMI JOIN]
 |  |  hash predicates: bigint_col = g.bigint_col, s.id = g.id
-|  |  runtime filters: RF001 <- g.bigint_col, RF002 <- g.id
+|  |  runtime filters: RF002 <- g.bigint_col, RF003 <- g.id
 |  |
 |  |--07:HASH JOIN [LEFT SEMI JOIN]
 |  |  |  hash predicates: g.id = t.id
-|  |  |  runtime filters: RF003 <- t.id
+|  |  |  runtime filters: RF006 <- t.id
 |  |  |
 |  |  |--02:SCAN HDFS [functional.alltypestiny t]
 |  |  |     partitions=4/4 files=4 size=460B
@@ -905,21 +905,21 @@ PLAN-ROOT SINK
 |  |  |
 |  |  06:HASH JOIN [RIGHT OUTER JOIN]
 |  |  |  hash predicates: a.id = g.id
-|  |  |  runtime filters: RF004 <- g.id
+|  |  |  runtime filters: RF008 <- g.id
 |  |  |
 |  |  |--00:SCAN HDFS [functional.alltypesagg g]
 |  |  |     partitions=11/11 files=11 size=814.73KB
 |  |  |     predicates: g.int_col < 100
-|  |  |     runtime filters: RF003 -> g.id
+|  |  |     runtime filters: RF006 -> g.id
 |  |  |
 |  |  01:SCAN HDFS [functional.alltypes a]
 |  |     partitions=24/24 files=24 size=478.45KB
-|  |     runtime filters: RF003 -> a.id, RF004 -> a.id
+|  |     runtime filters: RF006 -> a.id, RF008 -> a.id
 |  |
 |  03:SCAN HDFS [functional.alltypessmall s]
 |     partitions=4/4 files=4 size=6.32KB
 |     predicates: s.int_col > 10
-|     runtime filters: RF001 -> bigint_col, RF002 -> s.id
+|     runtime filters: RF002 -> bigint_col, RF003 -> s.id
 |
 05:AGGREGATE [FINALIZE]
 |  output: count(*)
@@ -1121,7 +1121,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = t.id
-|  runtime filters: RF001 <- t.id
+|  runtime filters: RF002 <- t.id
 |
 |--01:SCAN HDFS [functional.alltypes t]
 |     partitions=24/24 files=24 size=478.45KB
@@ -1130,7 +1130,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypesagg a]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF001 -> a.id
+   runtime filters: RF002 -> a.id
 ====
 # Multiple nesting levels with aggregation subqueries
 select *
@@ -1650,14 +1650,14 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.id = t2.id
-|  runtime filters: RF001 <- t2.id
+|  runtime filters: RF002 <- t2.id
 |
 |--02:SCAN HDFS [functional.alltypes t2]
 |     partitions=24/24 files=24 size=478.45KB
 |
 01:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF001 -> t1.id
+   runtime filters: RF002 -> t1.id
 ====
 # Correlated scalar subquery with complex correlated predicate (IMPALA-1335)
 select 1
@@ -1681,14 +1681,14 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.id = t2.id
-|  runtime filters: RF002 <- t2.id
+|  runtime filters: RF004 <- t2.id
 |
 |--02:SCAN HDFS [functional.alltypes t2]
 |     partitions=24/24 files=24 size=478.45KB
 |
 01:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF002 -> t1.id
+   runtime filters: RF004 -> t1.id
 ====
 # Outer query block with multiple tables and a correlated scalar subquery with
 # complex correlated predicate that references multiple subquery tables and multiple
@@ -1708,14 +1708,14 @@ PLAN-ROOT SINK
 |
 |--06:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t2.id = t1.id
-|  |  runtime filters: RF002 <- t1.id
+|  |  runtime filters: RF004 <- t1.id
 |  |
 |  |--00:SCAN HDFS [functional.alltypestiny t1]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  01:SCAN HDFS [functional.alltypessmall t2]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF002 -> t2.id
+|     runtime filters: RF004 -> t2.id
 |
 05:AGGREGATE [FINALIZE]
 |  output: sum(tt1.id)
@@ -1723,14 +1723,14 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: tt1.int_col = tt2.int_col
-|  runtime filters: RF001 <- tt2.int_col
+|  runtime filters: RF002 <- tt2.int_col
 |
 |--03:SCAN HDFS [functional.alltypes tt2]
 |     partitions=24/24 files=24 size=478.45KB
 |
 02:SCAN HDFS [functional.alltypesagg tt1]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF001 -> tt1.int_col
+   runtime filters: RF002 -> tt1.int_col
 ====
 # IMPALA-1550/IMPALA-4423: Correlated EXISTS and NOT EXISTS subqueries with aggregates
 # that can be evaluated at query compile time. All predicates evaluate to FALSE.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test b/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
index a4c06e0..34efe63 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
@@ -160,7 +160,7 @@ PLAN-ROOT SINK
 |
 02:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: t1.id = t2.id
-|  runtime filters: RF000 <- t2.id
+|  runtime filters: RF000[bloom] <- t2.id
 |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  tuple-ids=0 row-size=4B cardinality=10
 |
@@ -174,7 +174,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes t1]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> t1.id
+   runtime filters: RF000[bloom] -> t1.id
    stats-rows=7300 extrapolated-rows=disabled
    table stats: rows=7300 size=478.45KB
    column stats: all


[11/16] incubator-impala git commit: IMPALA-4835 (prep only): create io subfolder and namespace

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/disk-io-mgr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr-test.cc b/be/src/runtime/io/disk-io-mgr-test.cc
new file mode 100644
index 0000000..b03ec31
--- /dev/null
+++ b/be/src/runtime/io/disk-io-mgr-test.cc
@@ -0,0 +1,1129 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sched.h>
+#include <boost/bind.hpp>
+#include <boost/thread/thread.hpp>
+#include <sys/stat.h>
+
+#include "codegen/llvm-codegen.h"
+#include "common/init.h"
+#include "runtime/io/request-context.h"
+#include "runtime/io/disk-io-mgr-stress.h"
+#include "runtime/io/disk-io-mgr.h"
+#include "runtime/mem-tracker.h"
+#include "runtime/thread-resource-mgr.h"
+#include "testutil/gtest-util.h"
+#include "util/condition-variable.h"
+#include "util/cpu-info.h"
+#include "util/disk-info.h"
+#include "util/thread.h"
+
+#include "common/names.h"
+
+DECLARE_int32(num_remote_hdfs_io_threads);
+DECLARE_int32(num_s3_io_threads);
+DECLARE_int32(num_adls_io_threads);
+
+const int MIN_BUFFER_SIZE = 512;
+const int MAX_BUFFER_SIZE = 1024;
+const int LARGE_MEM_LIMIT = 1024 * 1024 * 1024;
+
+namespace impala {
+namespace io {
+
+class DiskIoMgrTest : public testing::Test {
+ public:
+
+  virtual void SetUp() {}
+
+  virtual void TearDown() {
+    pool_.Clear();
+  }
+  void WriteValidateCallback(int num_writes, WriteRange** written_range,
+      DiskIoMgr* io_mgr, RequestContext* reader, int32_t* data,
+      Status expected_status, const Status& status) {
+    if (expected_status.code() == TErrorCode::CANCELLED) {
+      EXPECT_TRUE(status.ok() || status.IsCancelled()) << "Error: " << status.GetDetail();
+    } else {
+      EXPECT_EQ(status.code(), expected_status.code());
+    }
+    if (status.ok()) {
+      ScanRange* scan_range = pool_.Add(new ScanRange());
+      scan_range->Reset(nullptr, (*written_range)->file(), (*written_range)->len(),
+          (*written_range)->offset(), 0, false, BufferOpts::Uncached());
+      ValidateSyncRead(io_mgr, reader, scan_range, reinterpret_cast<const char*>(data),
+          sizeof(int32_t));
+    }
+
+    {
+      lock_guard<mutex> l(written_mutex_);
+      ++num_ranges_written_;
+      if (num_ranges_written_ == num_writes) writes_done_.NotifyOne();
+    }
+  }
+
+  void WriteCompleteCallback(int num_writes, const Status& status) {
+    EXPECT_OK(status);
+    {
+      lock_guard<mutex> l(written_mutex_);
+      ++num_ranges_written_;
+      if (num_ranges_written_ == num_writes) writes_done_.NotifyAll();
+    }
+  }
+
+ protected:
+  void CreateTempFile(const char* filename, const char* data) {
+    FILE* file = fopen(filename, "w");
+    EXPECT_TRUE(file != nullptr);
+    fwrite(data, 1, strlen(data), file);
+    fclose(file);
+  }
+
+  int CreateTempFile(const char* filename, int file_size) {
+    FILE* file = fopen(filename, "w");
+    EXPECT_TRUE(file != nullptr);
+    int success = fclose(file);
+    if (success != 0) {
+      LOG(ERROR) << "Error closing file " << filename;
+      return success;
+    }
+    return truncate(filename, file_size);
+  }
+
+  // Validates that buffer[i] is \0 or expected[i]
+  static void ValidateEmptyOrCorrect(const char* expected, const char* buffer, int len) {
+    for (int i = 0; i < len; ++i) {
+      if (buffer[i] != '\0') {
+        EXPECT_EQ(expected[i], buffer[i]) << (int)expected[i] << " != " << (int)buffer[i];
+      }
+    }
+  }
+
+  static void ValidateSyncRead(DiskIoMgr* io_mgr, RequestContext* reader,
+      ScanRange* range, const char* expected, int expected_len = -1) {
+    unique_ptr<BufferDescriptor> buffer;
+    ASSERT_OK(io_mgr->Read(reader, range, &buffer));
+    ASSERT_TRUE(buffer != nullptr);
+    EXPECT_EQ(buffer->len(), range->len());
+    if (expected_len < 0) expected_len = strlen(expected);
+    int cmp = memcmp(buffer->buffer(), expected, expected_len);
+    EXPECT_TRUE(cmp == 0);
+    io_mgr->ReturnBuffer(move(buffer));
+  }
+
+  static void ValidateScanRange(DiskIoMgr* io_mgr, ScanRange* range,
+      const char* expected, int expected_len, const Status& expected_status) {
+    char result[expected_len + 1];
+    memset(result, 0, expected_len + 1);
+
+    while (true) {
+      unique_ptr<BufferDescriptor> buffer;
+      Status status = range->GetNext(&buffer);
+      ASSERT_TRUE(status.ok() || status.code() == expected_status.code());
+      if (buffer == nullptr || !status.ok()) {
+        if (buffer != nullptr) io_mgr->ReturnBuffer(move(buffer));
+        break;
+      }
+      ASSERT_LE(buffer->len(), expected_len);
+      memcpy(result + range->offset() + buffer->scan_range_offset(),
+          buffer->buffer(), buffer->len());
+      io_mgr->ReturnBuffer(move(buffer));
+    }
+    ValidateEmptyOrCorrect(expected, result, expected_len);
+  }
+
+  // Continues pulling scan ranges from the io mgr until they are all done.
+  // Updates num_ranges_processed with the number of ranges seen by this thread.
+  static void ScanRangeThread(DiskIoMgr* io_mgr, RequestContext* reader,
+      const char* expected_result, int expected_len, const Status& expected_status,
+      int max_ranges, AtomicInt32* num_ranges_processed) {
+    int num_ranges = 0;
+    while (max_ranges == 0 || num_ranges < max_ranges) {
+      ScanRange* range;
+      Status status = io_mgr->GetNextRange(reader, &range);
+      ASSERT_TRUE(status.ok() || status.code() == expected_status.code());
+      if (range == nullptr) break;
+      ValidateScanRange(io_mgr, range, expected_result, expected_len, expected_status);
+      num_ranges_processed->Add(1);
+      ++num_ranges;
+    }
+  }
+
+  ScanRange* AllocateRange() {
+    return pool_.Add(new ScanRange);
+  }
+
+  ScanRange* InitRange(const char* file_path, int offset, int len,
+      int disk_id, int64_t mtime, void* meta_data = nullptr, bool is_cached = false) {
+    ScanRange* range = AllocateRange();
+    range->Reset(nullptr, file_path, len, offset, disk_id, true,
+        BufferOpts(is_cached, mtime), meta_data);
+    EXPECT_EQ(mtime, range->mtime());
+    return range;
+  }
+
+  ObjectPool pool_;
+
+  mutex written_mutex_;
+  ConditionVariable writes_done_;
+  int num_ranges_written_;
+};
+
+// Test a single writer with multiple disks and threads per disk. Each WriteRange
+// writes random 4-byte integers, and upon completion, the written data is validated
+// by reading the data back via a separate IoMgr instance. All writes are expected to
+// complete successfully.
+TEST_F(DiskIoMgrTest, SingleWriter) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  num_ranges_written_ = 0;
+  string tmp_file = "/tmp/disk_io_mgr_test.txt";
+  int num_ranges = 100;
+  int64_t file_size = 1024 * 1024;
+  int64_t cur_offset = 0;
+  int success = CreateTempFile(tmp_file.c_str(), file_size);
+  if (success != 0) {
+    LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size " <<
+        file_size;
+    EXPECT_TRUE(false);
+  }
+
+  scoped_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 1, 10));
+  MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
+  ASSERT_OK(read_io_mgr->Init(&reader_mem_tracker));
+  unique_ptr<RequestContext> reader =
+      read_io_mgr->RegisterContext(&reader_mem_tracker);
+  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
+    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
+      pool_.Clear(); // Destroy scan ranges from previous iterations.
+      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 10);
+      ASSERT_OK(io_mgr.Init(&mem_tracker));
+      unique_ptr<RequestContext> writer = io_mgr.RegisterContext(&mem_tracker);
+      for (int i = 0; i < num_ranges; ++i) {
+        int32_t* data = pool_.Add(new int32_t);
+        *data = rand();
+        WriteRange** new_range = pool_.Add(new WriteRange*);
+        WriteRange::WriteDoneCallback callback =
+            bind(mem_fn(&DiskIoMgrTest::WriteValidateCallback), this, num_ranges,
+                new_range, read_io_mgr.get(), reader.get(), data, Status::OK(), _1);
+        *new_range = pool_.Add(new WriteRange(
+            tmp_file, cur_offset, num_ranges % num_disks, callback));
+        (*new_range)->SetData(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
+        EXPECT_OK(io_mgr.AddWriteRange(writer.get(), *new_range));
+        cur_offset += sizeof(int32_t);
+      }
+
+      {
+        unique_lock<mutex> lock(written_mutex_);
+        while (num_ranges_written_ < num_ranges) writes_done_.Wait(lock);
+      }
+      num_ranges_written_ = 0;
+      io_mgr.UnregisterContext(writer.get());
+    }
+  }
+
+  read_io_mgr->UnregisterContext(reader.get());
+  read_io_mgr.reset();
+}
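
Stripped of the validation machinery, the write path this test exercises follows
the shape below (a minimal sketch against an Init()'ed io_mgr and a registered
writer context, as above; the lambda callback and path are illustrative, and the
data must stay live until the callback runs):

  int32_t* data = pool_.Add(new int32_t);
  *data = rand();
  WriteRange::WriteDoneCallback callback = [](const Status& status) {
    EXPECT_OK(status);  // On success the bytes are on disk; notify waiters here.
  };
  WriteRange* range = pool_.Add(
      new WriteRange("/tmp/some_file", /* offset */ 0, /* disk_id */ 0, callback));
  range->SetData(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
  EXPECT_OK(io_mgr.AddWriteRange(writer.get(), range));
  // Block (e.g. on written_mutex_/writes_done_) until the callback has fired
  // before unregistering the writer context.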
+
+// Perform invalid writes (e.g. file in non-existent directory, negative offset) and
+// validate that an error status is returned via the write callback.
+TEST_F(DiskIoMgrTest, InvalidWrite) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  num_ranges_written_ = 0;
+  string tmp_file = "/non-existent/file.txt";
+  DiskIoMgr io_mgr(1, 1, 1, 1, 10);
+  ASSERT_OK(io_mgr.Init(&mem_tracker));
+  unique_ptr<RequestContext> writer = io_mgr.RegisterContext(nullptr);
+  int32_t* data = pool_.Add(new int32_t);
+  *data = rand();
+
+  // Write to file in non-existent directory.
+  WriteRange** new_range = pool_.Add(new WriteRange*);
+  WriteRange::WriteDoneCallback callback =
+      bind(mem_fn(&DiskIoMgrTest::WriteValidateCallback), this, 2, new_range,
+          (DiskIoMgr*)nullptr, (RequestContext*)nullptr, data,
+          Status(TErrorCode::DISK_IO_ERROR, "Test Failure"), _1);
+  *new_range = pool_.Add(new WriteRange(tmp_file, rand(), 0, callback));
+
+  (*new_range)->SetData(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
+  EXPECT_OK(io_mgr.AddWriteRange(writer.get(), *new_range));
+
+  // Write to a bad location in a file that exists.
+  tmp_file = "/tmp/disk_io_mgr_test.txt";
+  int success = CreateTempFile(tmp_file.c_str(), 100);
+  if (success != 0) {
+    LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size 100";
+    EXPECT_TRUE(false);
+  }
+
+  new_range = pool_.Add(new WriteRange*);
+  callback = bind(mem_fn(&DiskIoMgrTest::WriteValidateCallback), this, 2,
+      new_range, (DiskIoMgr*)nullptr, (RequestContext*)nullptr,
+      data, Status(TErrorCode::DISK_IO_ERROR, "Test Failure"), _1);
+
+  *new_range = pool_.Add(new WriteRange(tmp_file, -1, 0, callback));
+  (*new_range)->SetData(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
+  EXPECT_OK(io_mgr.AddWriteRange(writer.get(), *new_range));
+
+  {
+    unique_lock<mutex> lock(written_mutex_);
+    while (num_ranges_written_ < 2) writes_done_.Wait(lock);
+  }
+  num_ranges_written_ = 0;
+  io_mgr.UnregisterContext(writer.get());
+}
+
+// Issue a number of writes, cancel the writer context and issue more writes.
+// AddWriteRange() is expected to succeed before the cancel and fail after it.
+// The writes themselves may finish with either CANCELLED or OK status.
+TEST_F(DiskIoMgrTest, SingleWriterCancel) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  num_ranges_written_ = 0;
+  string tmp_file = "/tmp/disk_io_mgr_test.txt";
+  int num_ranges = 100;
+  int num_ranges_before_cancel = 25;
+  int64_t file_size = 1024 * 1024;
+  int64_t cur_offset = 0;
+  int success = CreateTempFile(tmp_file.c_str(), file_size);
+  if (success != 0) {
+    LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size " <<
+        file_size;
+    EXPECT_TRUE(false);
+  }
+
+  scoped_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 1, 10));
+  MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
+  ASSERT_OK(read_io_mgr->Init(&reader_mem_tracker));
+  unique_ptr<RequestContext> reader =
+      read_io_mgr->RegisterContext(&reader_mem_tracker);
+  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
+    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
+      pool_.Clear(); // Destroy scan ranges from previous iterations.
+      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 10);
+      ASSERT_OK(io_mgr.Init(&mem_tracker));
+      unique_ptr<RequestContext> writer = io_mgr.RegisterContext(&mem_tracker);
+      Status validate_status = Status::OK();
+      for (int i = 0; i < num_ranges; ++i) {
+        if (i == num_ranges_before_cancel) {
+          io_mgr.CancelContext(writer.get());
+          validate_status = Status::CANCELLED;
+        }
+        int32_t* data = pool_.Add(new int32_t);
+        *data = rand();
+        WriteRange** new_range = pool_.Add(new WriteRange*);
+        WriteRange::WriteDoneCallback callback = bind(
+            mem_fn(&DiskIoMgrTest::WriteValidateCallback), this, num_ranges_before_cancel,
+            new_range, read_io_mgr.get(), reader.get(), data, Status::CANCELLED, _1);
+        *new_range = pool_.Add(new WriteRange(
+            tmp_file, cur_offset, num_ranges % num_disks, callback));
+        (*new_range)->SetData(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
+        cur_offset += sizeof(int32_t);
+        Status add_status = io_mgr.AddWriteRange(writer.get(), *new_range);
+        EXPECT_TRUE(add_status.code() == validate_status.code());
+      }
+
+      {
+        unique_lock<mutex> lock(written_mutex_);
+        while (num_ranges_written_ < num_ranges_before_cancel) writes_done_.Wait(lock);
+      }
+      num_ranges_written_ = 0;
+      io_mgr.UnregisterContext(writer.get());
+    }
+  }
+
+  read_io_mgr->UnregisterContext(reader.get());
+  read_io_mgr.reset();
+}
+
+// Basic test with a single reader, exercising varying numbers of disks, threads
+// per disk and reading threads.
+TEST_F(DiskIoMgrTest, SingleReader) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
+  const char* data = "abcdefghijklm";
+  int len = strlen(data);
+  CreateTempFile(tmp_file, data);
+
+  // Get mtime for file
+  struct stat stat_val;
+  stat(tmp_file, &stat_val);
+
+  int64_t iters = 0;
+  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
+    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
+      for (int num_read_threads = 1; num_read_threads <= 5; ++num_read_threads) {
+        ObjectPool pool;
+        LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
+                  << " num_disk=" << num_disks
+                  << " num_read_threads=" << num_read_threads;
+
+        if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
+        DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 1);
+
+        ASSERT_OK(io_mgr.Init(&mem_tracker));
+        MemTracker reader_mem_tracker;
+        unique_ptr<RequestContext> reader =
+            io_mgr.RegisterContext(&reader_mem_tracker);
+
+        vector<ScanRange*> ranges;
+        for (int i = 0; i < len; ++i) {
+          int disk_id = i % num_disks;
+          ranges.push_back(InitRange(tmp_file, 0, len, disk_id, stat_val.st_mtime));
+        }
+        ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
+
+        AtomicInt32 num_ranges_processed;
+        thread_group threads;
+        for (int i = 0; i < num_read_threads; ++i) {
+          threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data, len,
+              Status::OK(), 0, &num_ranges_processed));
+        }
+        threads.join_all();
+
+        EXPECT_EQ(num_ranges_processed.Load(), ranges.size());
+        io_mgr.UnregisterContext(reader.get());
+        EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+      }
+    }
+  }
+  EXPECT_EQ(mem_tracker.consumption(), 0);
+}
+
+// This test adds additional scan ranges while some are still in flight.
+TEST_F(DiskIoMgrTest, AddScanRangeTest) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
+  const char* data = "abcdefghijklm";
+  int len = strlen(data);
+  CreateTempFile(tmp_file, data);
+
+  // Get mtime for file
+  struct stat stat_val;
+  stat(tmp_file, &stat_val);
+
+  int64_t iters = 0;
+  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
+    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
+      pool_.Clear(); // Destroy scan ranges from previous iterations.
+      LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
+                << " num_disk=" << num_disks;
+
+      if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
+      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 1);
+
+      ASSERT_OK(io_mgr.Init(&mem_tracker));
+      MemTracker reader_mem_tracker;
+      unique_ptr<RequestContext> reader =
+          io_mgr.RegisterContext(&reader_mem_tracker);
+
+      vector<ScanRange*> ranges_first_half;
+      vector<ScanRange*> ranges_second_half;
+      for (int i = 0; i < len; ++i) {
+        int disk_id = i % num_disks;
+        if (i > len / 2) {
+          ranges_second_half.push_back(
+              InitRange(tmp_file, i, 1, disk_id, stat_val.st_mtime));
+        } else {
+          ranges_first_half.push_back(
+              InitRange(tmp_file, i, 1, disk_id, stat_val.st_mtime));
+        }
+      }
+      AtomicInt32 num_ranges_processed;
+
+      // Issue the first half of the scan ranges.
+      ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges_first_half));
+
+      // Read a couple of them
+      ScanRangeThread(&io_mgr, reader.get(), data, strlen(data), Status::OK(), 2,
+          &num_ranges_processed);
+
+      // Issue second half
+      ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges_second_half));
+
+      // Start up some threads and then cancel
+      thread_group threads;
+      for (int i = 0; i < 3; ++i) {
+        threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data,
+            strlen(data), Status::CANCELLED, 0, &num_ranges_processed));
+      }
+
+      threads.join_all();
+      EXPECT_EQ(num_ranges_processed.Load(), len);
+      io_mgr.UnregisterContext(reader.get());
+      EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+    }
+  }
+  EXPECT_EQ(mem_tracker.consumption(), 0);
+}
+
+// Tests that sync reads and async reads work together.
+// Note: this test is constructed so that the number of buffers is greater than the
+// number of scan ranges.
+TEST_F(DiskIoMgrTest, SyncReadTest) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
+  const char* data = "abcdefghijklm";
+  int len = strlen(data);
+  CreateTempFile(tmp_file, data);
+
+  // Get mtime for file
+  struct stat stat_val;
+  stat(tmp_file, &stat_val);
+
+  int64_t iters = 0;
+  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
+    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
+      pool_.Clear(); // Destroy scan ranges from previous iterations.
+      LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
+                << " num_disk=" << num_disks;
+
+      if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
+      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk,
+          MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
+
+      ASSERT_OK(io_mgr.Init(&mem_tracker));
+      MemTracker reader_mem_tracker;
+      unique_ptr<RequestContext> reader =
+          io_mgr.RegisterContext(&reader_mem_tracker);
+
+      ScanRange* complete_range =
+          InitRange(tmp_file, 0, strlen(data), 0, stat_val.st_mtime);
+
+      // Issue some reads before the async ones are issued
+      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+
+      vector<ScanRange*> ranges;
+      for (int i = 0; i < len; ++i) {
+        int disk_id = i % num_disks;
+        ranges.push_back(InitRange(tmp_file, 0, len, disk_id, stat_val.st_mtime));
+      }
+      ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
+
+      AtomicInt32 num_ranges_processed;
+      thread_group threads;
+      for (int i = 0; i < 5; ++i) {
+        threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data,
+            strlen(data), Status::OK(), 0, &num_ranges_processed));
+      }
+
+      // Issue some more sync ranges
+      for (int i = 0; i < 5; ++i) {
+        sched_yield();
+        ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+      }
+
+      threads.join_all();
+
+      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+
+      EXPECT_EQ(num_ranges_processed.Load(), ranges.size());
+      io_mgr.UnregisterContext(reader.get());
+      EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+    }
+  }
+  EXPECT_EQ(mem_tracker.consumption(), 0);
+}
+
+// Tests a single reader cancelling halfway through its scan ranges.
+TEST_F(DiskIoMgrTest, SingleReaderCancel) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
+  const char* data = "abcdefghijklm";
+  int len = strlen(data);
+  CreateTempFile(tmp_file, data);
+
+  // Get mtime for file
+  struct stat stat_val;
+  stat(tmp_file, &stat_val);
+
+  int64_t iters = 0;
+  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
+    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
+      pool_.Clear(); // Destroy scan ranges from previous iterations.
+      LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
+                << " num_disk=" << num_disks;
+
+      if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
+      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 1);
+
+      ASSERT_OK(io_mgr.Init(&mem_tracker));
+      MemTracker reader_mem_tracker;
+      unique_ptr<RequestContext> reader =
+          io_mgr.RegisterContext(&reader_mem_tracker);
+
+      vector<ScanRange*> ranges;
+      for (int i = 0; i < len; ++i) {
+        int disk_id = i % num_disks;
+        ranges.push_back(InitRange(tmp_file, 0, len, disk_id, stat_val.st_mtime));
+      }
+      ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
+
+      AtomicInt32 num_ranges_processed;
+      int num_succesful_ranges = ranges.size() / 2;
+      // Read half the ranges
+      for (int i = 0; i < num_succesful_ranges; ++i) {
+        ScanRangeThread(&io_mgr, reader.get(), data, strlen(data), Status::OK(), 1,
+            &num_ranges_processed);
+      }
+      EXPECT_EQ(num_ranges_processed.Load(), num_succesful_ranges);
+
+      // Start up some threads and then cancel
+      thread_group threads;
+      for (int i = 0; i < 3; ++i) {
+        threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data,
+            strlen(data), Status::CANCELLED, 0, &num_ranges_processed));
+      }
+
+      io_mgr.CancelContext(reader.get());
+      sched_yield();
+
+      threads.join_all();
+      EXPECT_TRUE(io_mgr.context_status(reader.get()).IsCancelled());
+      io_mgr.UnregisterContext(reader.get());
+      EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+    }
+  }
+  EXPECT_EQ(mem_tracker.consumption(), 0);
+}
+
+// Tests behavior when the reader exceeds its memory limit.
+TEST_F(DiskIoMgrTest, MemLimits) {
+  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
+  const char* data = "abcdefghijklm";
+  int len = strlen(data);
+  CreateTempFile(tmp_file, data);
+
+  // Get mtime for file
+  struct stat stat_val;
+  stat(tmp_file, &stat_val);
+
+  const int mem_limit_num_buffers = 2;
+  // Allocate enough ranges so that the total buffers exceeds the mem limit.
+  const int num_ranges = 25;
+  {
+    MemTracker root_mem_tracker(mem_limit_num_buffers * MAX_BUFFER_SIZE);
+    DiskIoMgr io_mgr(1, 1, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
+
+    ASSERT_OK(io_mgr.Init(&root_mem_tracker));
+    MemTracker reader_mem_tracker(-1, "Reader", &root_mem_tracker);
+    unique_ptr<RequestContext> reader = io_mgr.RegisterContext(&reader_mem_tracker);
+
+    vector<ScanRange*> ranges;
+    for (int i = 0; i < num_ranges; ++i) {
+      ranges.push_back(InitRange(tmp_file, 0, len, 0, stat_val.st_mtime));
+    }
+    ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
+
+    // Don't return buffers to force memory pressure
+    vector<unique_ptr<BufferDescriptor>> buffers;
+
+    AtomicInt32 num_ranges_processed;
+    ScanRangeThread(&io_mgr, reader.get(), data, strlen(data), Status::MemLimitExceeded(),
+        1, &num_ranges_processed);
+
+    char result[strlen(data) + 1];
+    // Keep reading new ranges without returning buffers. This forces us
+    // to go over the limit eventually.
+    while (true) {
+      memset(result, 0, strlen(data) + 1);
+      ScanRange* range = nullptr;
+      Status status = io_mgr.GetNextRange(reader.get(), &range);
+      ASSERT_TRUE(status.ok() || status.IsMemLimitExceeded());
+      if (range == nullptr) break;
+
+      while (true) {
+        unique_ptr<BufferDescriptor> buffer;
+        Status status = range->GetNext(&buffer);
+        ASSERT_TRUE(status.ok() || status.IsMemLimitExceeded());
+        if (buffer == nullptr) break;
+        memcpy(result + range->offset() + buffer->scan_range_offset(),
+            buffer->buffer(), buffer->len());
+        buffers.push_back(move(buffer));
+      }
+      ValidateEmptyOrCorrect(data, result, strlen(data));
+    }
+
+    for (int i = 0; i < buffers.size(); ++i) {
+      io_mgr.ReturnBuffer(move(buffers[i]));
+    }
+
+    EXPECT_TRUE(io_mgr.context_status(reader.get()).IsMemLimitExceeded());
+    io_mgr.UnregisterContext(reader.get());
+    EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+  }
+}
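
For concreteness on why this trips the limit: with MIN_BUFFER_SIZE = 512 and
MAX_BUFFER_SIZE = 1024, the root tracker's limit is mem_limit_num_buffers *
MAX_BUFFER_SIZE = 2 * 1024 = 2048 bytes, so hoarding a fifth min-size buffer (or
a third max-size one) necessarily pushes consumption over the limit, well before
the 25 queued ranges are exhausted.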
+
+// Test when some scan ranges are marked as being cached.
+// Since these files are not in HDFS, the cached path always fails, so this
+// only tests the fallback mechanism.
+// TODO: we can fake the cached read path without HDFS
+TEST_F(DiskIoMgrTest, CachedReads) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
+  const char* data = "abcdefghijklm";
+  int len = strlen(data);
+  CreateTempFile(tmp_file, data);
+
+  // Get mtime for file
+  struct stat stat_val;
+  stat(tmp_file, &stat_val);
+
+  const int num_disks = 2;
+  {
+    DiskIoMgr io_mgr(num_disks, 1, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
+
+    ASSERT_OK(io_mgr.Init(&mem_tracker));
+    MemTracker reader_mem_tracker;
+    unique_ptr<RequestContext> reader = io_mgr.RegisterContext(&reader_mem_tracker);
+
+    ScanRange* complete_range =
+        InitRange(tmp_file, 0, strlen(data), 0, stat_val.st_mtime, nullptr, true);
+
+    // Issue some reads before the async ones are issued
+    ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+    ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+
+    vector<ScanRange*> ranges;
+    for (int i = 0; i < len; ++i) {
+      int disk_id = i % num_disks;
+      ranges.push_back(
+          InitRange(tmp_file, 0, len, disk_id, stat_val.st_mtime, nullptr, true));
+    }
+    ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
+
+    AtomicInt32 num_ranges_processed;
+    thread_group threads;
+    for (int i = 0; i < 5; ++i) {
+      threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data,
+          strlen(data), Status::OK(), 0, &num_ranges_processed));
+    }
+
+    // Issue some more sync ranges
+    for (int i = 0; i < 5; ++i) {
+      sched_yield();
+      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+    }
+
+    threads.join_all();
+
+    ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+    ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
+
+    EXPECT_EQ(num_ranges_processed.Load(), ranges.size());
+    io_mgr.UnregisterContext(reader.get());
+    EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+  }
+  EXPECT_EQ(mem_tracker.consumption(), 0);
+}
+
+TEST_F(DiskIoMgrTest, MultipleReaderWriter) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const int ITERATIONS = 1;
+  const char* data = "abcdefghijklmnopqrstuvwxyz";
+  const int num_contexts = 5;
+  const int file_size = 4 * 1024;
+  const int num_writes_queued = 5;
+  const int num_reads_queued = 5;
+
+  string file_name = "/tmp/disk_io_mgr_test.txt";
+  int success = CreateTempFile(file_name.c_str(), file_size);
+  if (success != 0) {
+    LOG(ERROR) << "Error creating temp file " << file_name.c_str() << " of size " <<
+        file_size;
+    ASSERT_TRUE(false);
+  }
+
+  // Get mtime for file
+  struct stat stat_val;
+  stat(file_name.c_str(), &stat_val);
+
+  int64_t iters = 0;
+  vector<unique_ptr<RequestContext>> contexts(num_contexts);
+  Status status;
+  for (int iteration = 0; iteration < ITERATIONS; ++iteration) {
+    for (int threads_per_disk = 1; threads_per_disk <= 5; ++threads_per_disk) {
+      for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
+        DiskIoMgr io_mgr(num_disks, threads_per_disk, threads_per_disk, MIN_BUFFER_SIZE,
+            MAX_BUFFER_SIZE);
+        ASSERT_OK(io_mgr.Init(&mem_tracker));
+        for (int file_index = 0; file_index < num_contexts; ++file_index) {
+          contexts[file_index] = io_mgr.RegisterContext(&mem_tracker);
+        }
+        pool_.Clear();
+        int read_offset = 0;
+        int write_offset = 0;
+        while (read_offset < file_size) {
+          for (int context_index = 0; context_index < num_contexts; ++context_index) {
+            if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
+            AtomicInt32 num_ranges_processed;
+            thread_group threads;
+            vector<ScanRange*> ranges;
+            int num_scan_ranges = min<int>(num_reads_queued, write_offset - read_offset);
+            for (int i = 0; i < num_scan_ranges; ++i) {
+              ranges.push_back(InitRange(
+                  file_name.c_str(), read_offset, 1, i % num_disks, stat_val.st_mtime));
+              threads.add_thread(
+                  new thread(ScanRangeThread, &io_mgr, contexts[context_index].get(),
+                      reinterpret_cast<const char*>(data + (read_offset % strlen(data))),
+                      1, Status::OK(), num_scan_ranges, &num_ranges_processed));
+              ++read_offset;
+            }
+
+            num_ranges_written_ = 0;
+            int num_write_ranges = min<int>(num_writes_queued, file_size - write_offset);
+            for (int i = 0; i < num_write_ranges; ++i) {
+              WriteRange::WriteDoneCallback callback =
+                  bind(mem_fn(&DiskIoMgrTest::WriteCompleteCallback),
+                      this, num_write_ranges, _1);
+              WriteRange* new_range = pool_.Add(new WriteRange(
+                  file_name, write_offset, i % num_disks, callback));
+              new_range->SetData(
+                  reinterpret_cast<const uint8_t*>(data + (write_offset % strlen(data))),
+                  1);
+              status = io_mgr.AddWriteRange(contexts[context_index].get(), new_range);
+              ++write_offset;
+            }
+
+            {
+              unique_lock<mutex> lock(written_mutex_);
+              while (num_ranges_written_ < num_write_ranges) writes_done_.Wait(lock);
+            }
+
+            threads.join_all();
+          } // for (int context_index
+        } // while (read_offset < file_size)
+
+        for (int file_index = 0; file_index < num_contexts; ++file_index) {
+          io_mgr.UnregisterContext(contexts[file_index].get());
+        }
+      } // for (int num_disks
+    } // for (int threads_per_disk
+  } // for (int iteration
+}
+
+// Tests multiple concurrent readers, each reading a different file.
+TEST_F(DiskIoMgrTest, MultipleReader) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const int NUM_READERS = 5;
+  const int DATA_LEN = 50;
+  const int ITERATIONS = 25;
+  const int NUM_THREADS_PER_READER = 3;
+
+  vector<string> file_names;
+  vector<int64_t> mtimes;
+  vector<string> data;
+  vector<unique_ptr<RequestContext>> readers;
+  vector<char*> results;
+
+  file_names.resize(NUM_READERS);
+  readers.resize(NUM_READERS);
+  mtimes.resize(NUM_READERS);
+  data.resize(NUM_READERS);
+  results.resize(NUM_READERS);
+
+  // Initialize data for each reader.  The data will be
+  // 'abcd...' for reader one, 'bcde...' for reader two (wrapping around at 'z').
+  for (int i = 0; i < NUM_READERS; ++i) {
+    char buf[DATA_LEN];
+    for (int j = 0; j < DATA_LEN; ++j) {
+      int c = (j + i) % 26;
+      buf[j] = 'a' + c;
+    }
+    data[i] = string(buf, DATA_LEN);
+
+    stringstream ss;
+    ss << "/tmp/disk_io_mgr_test" << i << ".txt";
+    file_names[i] = ss.str();
+    CreateTempFile(ss.str().c_str(), data[i].c_str());
+
+    // Get mtime for file
+    struct stat stat_val;
+    stat(file_names[i].c_str(), &stat_val);
+    mtimes[i] = stat_val.st_mtime;
+
+    results[i] = new char[DATA_LEN + 1];
+    memset(results[i], 0, DATA_LEN + 1);
+  }
+
+  // This exercises concurrency, so run the test multiple times.
+  int64_t iters = 0;
+  for (int iteration = 0; iteration < ITERATIONS; ++iteration) {
+    for (int threads_per_disk = 1; threads_per_disk <= 5; ++threads_per_disk) {
+      for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
+        pool_.Clear(); // Destroy scan ranges from previous iterations.
+        LOG(INFO) << "Starting test with num_threads_per_disk=" << threads_per_disk
+                  << " num_disk=" << num_disks;
+        if (++iters % 2500 == 0) LOG(ERROR) << "Starting iteration " << iters;
+
+        DiskIoMgr io_mgr(num_disks, threads_per_disk, threads_per_disk, MIN_BUFFER_SIZE,
+            MAX_BUFFER_SIZE);
+        EXPECT_OK(io_mgr.Init(&mem_tracker));
+
+        for (int i = 0; i < NUM_READERS; ++i) {
+          readers[i] = io_mgr.RegisterContext(&mem_tracker);
+
+          vector<ScanRange*> ranges;
+          for (int j = 0; j < DATA_LEN; ++j) {
+            int disk_id = j % num_disks;
+            ranges.push_back(InitRange(file_names[i].c_str(), j, 1, disk_id, mtimes[i]));
+          }
+          ASSERT_OK(io_mgr.AddScanRanges(readers[i].get(), ranges));
+        }
+
+        AtomicInt32 num_ranges_processed;
+        thread_group threads;
+        for (int i = 0; i < NUM_READERS; ++i) {
+          for (int j = 0; j < NUM_THREADS_PER_READER; ++j) {
+            threads.add_thread(new thread(ScanRangeThread, &io_mgr, readers[i].get(),
+                data[i].c_str(), data[i].size(), Status::OK(), 0, &num_ranges_processed));
+          }
+        }
+        threads.join_all();
+        EXPECT_EQ(num_ranges_processed.Load(), DATA_LEN * NUM_READERS);
+        for (int i = 0; i < NUM_READERS; ++i) {
+          io_mgr.UnregisterContext(readers[i].get());
+        }
+      }
+    }
+  }
+  EXPECT_EQ(mem_tracker.consumption(), 0);
+}
+
+// Stress test for multiple clients with cancellation
+// TODO: the stress app should be expanded to include sync reads and adding scan
+// ranges in the middle.
+TEST_F(DiskIoMgrTest, StressTest) {
+  // Run the test with 5 disks, 5 threads per disk, 10 clients and with cancellation
+  DiskIoMgrStress test(5, 5, 10, true);
+  test.Run(2); // In seconds
+}
+
+TEST_F(DiskIoMgrTest, Buffers) {
+  // Test default min/max buffer size
+  int min_buffer_size = 1024;
+  int max_buffer_size = 8 * 1024 * 1024; // 8 MB
+  MemTracker root_mem_tracker(max_buffer_size * 2);
+
+  DiskIoMgr io_mgr(1, 1, 1, min_buffer_size, max_buffer_size);
+  ASSERT_OK(io_mgr.Init(&root_mem_tracker));
+  ASSERT_EQ(root_mem_tracker.consumption(), 0);
+
+  MemTracker reader_mem_tracker(-1, "Reader", &root_mem_tracker);
+  unique_ptr<RequestContext> reader;
+  reader = io_mgr.RegisterContext(&reader_mem_tracker);
+
+  ScanRange* dummy_range = InitRange("dummy", 0, 0, 0, 0);
+
+  // buffer length should be rounded up to min buffer size
+  int64_t buffer_len = 1;
+  unique_ptr<BufferDescriptor> buffer_desc;
+  buffer_desc = io_mgr.GetFreeBuffer(reader.get(), dummy_range, buffer_len);
+  EXPECT_TRUE(buffer_desc->buffer() != nullptr);
+  EXPECT_EQ(min_buffer_size, buffer_desc->buffer_len());
+  EXPECT_EQ(1, io_mgr.num_allocated_buffers_.Load());
+  io_mgr.FreeBufferMemory(buffer_desc.get());
+  io_mgr.ReturnBuffer(move(buffer_desc));
+  EXPECT_EQ(min_buffer_size, root_mem_tracker.consumption());
+
+  // reuse buffer
+  buffer_len = min_buffer_size;
+  buffer_desc = io_mgr.GetFreeBuffer(reader.get(), dummy_range, buffer_len);
+  EXPECT_TRUE(buffer_desc->buffer() != nullptr);
+  EXPECT_EQ(min_buffer_size, buffer_desc->buffer_len());
+  EXPECT_EQ(1, io_mgr.num_allocated_buffers_.Load());
+  io_mgr.FreeBufferMemory(buffer_desc.get());
+  io_mgr.ReturnBuffer(move(buffer_desc));
+  EXPECT_EQ(min_buffer_size, root_mem_tracker.consumption());
+
+  // bump up to next buffer size
+  buffer_len = min_buffer_size + 1;
+  buffer_desc = io_mgr.GetFreeBuffer(reader.get(), dummy_range, buffer_len);
+  EXPECT_TRUE(buffer_desc->buffer() != nullptr);
+  EXPECT_EQ(min_buffer_size * 2, buffer_desc->buffer_len());
+  EXPECT_EQ(2, io_mgr.num_allocated_buffers_.Load());
+  EXPECT_EQ(min_buffer_size * 3, root_mem_tracker.consumption());
+
+  // gc unused buffer
+  io_mgr.GcIoBuffers();
+  EXPECT_EQ(1, io_mgr.num_allocated_buffers_.Load());
+  EXPECT_EQ(min_buffer_size * 2, root_mem_tracker.consumption());
+
+  io_mgr.FreeBufferMemory(buffer_desc.get());
+  io_mgr.ReturnBuffer(move(buffer_desc));
+
+  // max buffer size
+  buffer_len = max_buffer_size;
+  buffer_desc = io_mgr.GetFreeBuffer(reader.get(), dummy_range, buffer_len);
+  EXPECT_TRUE(buffer_desc->buffer() != nullptr);
+  EXPECT_EQ(max_buffer_size, buffer_desc->buffer_len());
+  EXPECT_EQ(2, io_mgr.num_allocated_buffers_.Load());
+  io_mgr.FreeBufferMemory(buffer_desc.get());
+  io_mgr.ReturnBuffer(move(buffer_desc));
+  EXPECT_EQ(min_buffer_size * 2 + max_buffer_size, root_mem_tracker.consumption());
+
+  // gc buffers
+  io_mgr.GcIoBuffers();
+  EXPECT_EQ(io_mgr.num_allocated_buffers_.Load(), 0);
+  EXPECT_EQ(root_mem_tracker.consumption(), 0);
+  io_mgr.UnregisterContext(reader.get());
+}
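
The assertions above pin down the rounding behavior: a 1-byte request is served
with the 1024-byte minimum, min_buffer_size + 1 bumps up to 2 * min_buffer_size,
and max_buffer_size is served exactly. That is consistent with rounding a request
up to the next power-of-two multiple of the minimum, capped at the maximum. A
hypothetical helper expressing that inference (not a documented DiskIoMgr
contract):

  int64_t RoundedBufferLen(int64_t requested, int64_t min_size, int64_t max_size) {
    int64_t len = min_size;
    while (len < requested) len *= 2;  // Double until the request fits.
    return min(len, max_size);         // Never exceed the max buffer size.
  }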
+
+// IMPALA-2366: handle partial read where range goes past end of file.
+TEST_F(DiskIoMgrTest, PartialRead) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
+  const char* data = "the quick brown fox jumped over the lazy dog";
+  int len = strlen(data);
+  int read_len = len + 1000; // Read past end of file.
+  CreateTempFile(tmp_file, data);
+
+  // Get mtime for file
+  struct stat stat_val;
+  stat(tmp_file, &stat_val);
+
+  scoped_ptr<DiskIoMgr> io_mgr(new DiskIoMgr(1, 1, 1, read_len, read_len));
+
+  ASSERT_OK(io_mgr->Init(&mem_tracker));
+  MemTracker reader_mem_tracker;
+  unique_ptr<RequestContext> reader;
+  reader = io_mgr->RegisterContext(&reader_mem_tracker);
+
+  // We should not read past the end of file.
+  ScanRange* range = InitRange(tmp_file, 0, read_len, 0, stat_val.st_mtime);
+  unique_ptr<BufferDescriptor> buffer;
+  ASSERT_OK(io_mgr->Read(reader.get(), range, &buffer));
+  ASSERT_TRUE(buffer->eosr());
+  ASSERT_EQ(len, buffer->len());
+  ASSERT_TRUE(memcmp(buffer->buffer(), data, len) == 0);
+  io_mgr->ReturnBuffer(move(buffer));
+
+  io_mgr->UnregisterContext(reader.get());
+  pool_.Clear();
+  io_mgr.reset();
+  EXPECT_EQ(reader_mem_tracker.consumption(), 0);
+  EXPECT_EQ(mem_tracker.consumption(), 0);
+}
+
+// Test reading into a client-allocated buffer.
+TEST_F(DiskIoMgrTest, ReadIntoClientBuffer) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
+  const char* data = "the quick brown fox jumped over the lazy dog";
+  int len = strlen(data);
+  int read_len = 4; // Make buffer size smaller than client-provided buffer.
+  CreateTempFile(tmp_file, data);
+
+  scoped_ptr<DiskIoMgr> io_mgr(new DiskIoMgr(1, 1, 1, read_len, read_len));
+
+  ASSERT_OK(io_mgr->Init(&mem_tracker));
+  // Reader doesn't need to provide mem tracker if it's providing buffers.
+  MemTracker* reader_mem_tracker = nullptr;
+  unique_ptr<RequestContext> reader;
+  reader = io_mgr->RegisterContext(reader_mem_tracker);
+
+  for (int buffer_len : vector<int>({len - 1, len, len + 1})) {
+    vector<uint8_t> client_buffer(buffer_len);
+    int scan_len = min(len, buffer_len);
+    ScanRange* range = AllocateRange();
+    range->Reset(nullptr, tmp_file, scan_len, 0, 0, true,
+        BufferOpts::ReadInto(client_buffer.data(), buffer_len));
+    ASSERT_OK(io_mgr->AddScanRange(reader.get(), range, true));
+
+    unique_ptr<BufferDescriptor> io_buffer;
+    ASSERT_OK(range->GetNext(&io_buffer));
+    ASSERT_TRUE(io_buffer->eosr());
+    ASSERT_EQ(scan_len, io_buffer->len());
+    ASSERT_EQ(client_buffer.data(), io_buffer->buffer());
+    ASSERT_EQ(memcmp(io_buffer->buffer(), data, scan_len), 0);
+
+    // DiskIoMgr should not have allocated memory.
+    EXPECT_EQ(mem_tracker.consumption(), 0);
+    io_mgr->ReturnBuffer(move(io_buffer));
+  }
+
+  io_mgr->UnregisterContext(reader.get());
+  pool_.Clear();
+  io_mgr.reset();
+  EXPECT_EQ(mem_tracker.consumption(), 0);
+}
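
A usage note implied by the loop above: with BufferOpts::ReadInto() the range
must fit the client buffer, so the test clamps scan_len = min(len, buffer_len).
For example, with buffer_len = len - 1, the range is reset to len - 1 bytes and
GetNext() returns exactly that many bytes directly into client_buffer, with
eosr() set and no IoMgr-side allocation (hence the consumption check).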
+
+// Test reading into a client-allocated buffer where the read fails.
+TEST_F(DiskIoMgrTest, ReadIntoClientBufferError) {
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const char* tmp_file = "/file/that/does/not/exist";
+  const int SCAN_LEN = 128;
+
+  scoped_ptr<DiskIoMgr> io_mgr(new DiskIoMgr(1, 1, 1, SCAN_LEN, SCAN_LEN));
+
+  ASSERT_OK(io_mgr->Init(&mem_tracker));
+  // Reader doesn't need to provide mem tracker if it's providing buffers.
+  MemTracker* reader_mem_tracker = nullptr;
+  unique_ptr<RequestContext> reader;
+  vector<uint8_t> client_buffer(SCAN_LEN);
+  for (int i = 0; i < 1000; ++i) {
+    reader = io_mgr->RegisterContext(reader_mem_tracker);
+    ScanRange* range = AllocateRange();
+    range->Reset(nullptr, tmp_file, SCAN_LEN, 0, 0, true,
+        BufferOpts::ReadInto(client_buffer.data(), SCAN_LEN));
+    ASSERT_OK(io_mgr->AddScanRange(reader.get(), range, true));
+
+    // Also test the cancellation path. Run multiple iterations since it is racy
+    // whether the read fails before the cancellation.
+    if (i >= 1) io_mgr->CancelContext(reader.get());
+
+    unique_ptr<BufferDescriptor> io_buffer;
+    ASSERT_FALSE(range->GetNext(&io_buffer).ok());
+
+    // DiskIoMgr should not have allocated memory.
+    EXPECT_EQ(mem_tracker.consumption(), 0);
+
+    io_mgr->UnregisterContext(reader.get());
+  }
+
+  pool_.Clear();
+  io_mgr.reset();
+  EXPECT_EQ(mem_tracker.consumption(), 0);
+}
+
+// Verifies the configuration parameters for the number of I/O threads per disk.
+TEST_F(DiskIoMgrTest, VerifyNumThreadsParameter) {
+  const int num_io_threads_for_remote_disks = FLAGS_num_remote_hdfs_io_threads
+      + FLAGS_num_s3_io_threads + FLAGS_num_adls_io_threads;
+
+  // Verify num_io_threads_per_rotational_disk and num_io_threads_per_solid_state_disk.
+  // Since we do not have control over which disk is used, we check for either type
+  // (rotational/solid state).
+  MemTracker mem_tracker(LARGE_MEM_LIMIT);
+  const int num_io_threads_per_rotational_or_ssd = 2;
+  DiskIoMgr io_mgr(1, num_io_threads_per_rotational_or_ssd,
+      num_io_threads_per_rotational_or_ssd, 1, 10);
+  ASSERT_OK(io_mgr.Init(&mem_tracker));
+  const int num_io_threads = io_mgr.disk_thread_group_.Size();
+  ASSERT_TRUE(num_io_threads ==
+      num_io_threads_per_rotational_or_ssd + num_io_threads_for_remote_disks);
+}
+}
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
+  return RUN_ALL_TESTS();
+}



[14/16] incubator-impala git commit: IMPALA-4835 (prep only): create io subfolder and namespace

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-test.cc b/be/src/runtime/disk-io-mgr-test.cc
deleted file mode 100644
index eb8d3c7..0000000
--- a/be/src/runtime/disk-io-mgr-test.cc
+++ /dev/null
@@ -1,1127 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <sched.h>
-#include <boost/bind.hpp>
-#include <boost/thread/thread.hpp>
-#include <sys/stat.h>
-
-#include "codegen/llvm-codegen.h"
-#include "common/init.h"
-#include "runtime/disk-io-mgr-reader-context.h"
-#include "runtime/disk-io-mgr-stress.h"
-#include "runtime/disk-io-mgr.h"
-#include "runtime/mem-tracker.h"
-#include "runtime/thread-resource-mgr.h"
-#include "testutil/gtest-util.h"
-#include "util/condition-variable.h"
-#include "util/cpu-info.h"
-#include "util/disk-info.h"
-#include "util/thread.h"
-
-#include "common/names.h"
-
-DECLARE_int32(num_remote_hdfs_io_threads);
-DECLARE_int32(num_s3_io_threads);
-DECLARE_int32(num_adls_io_threads);
-
-const int MIN_BUFFER_SIZE = 512;
-const int MAX_BUFFER_SIZE = 1024;
-const int LARGE_MEM_LIMIT = 1024 * 1024 * 1024;
-
-namespace impala {
-
-class DiskIoMgrTest : public testing::Test {
- public:
-
-  virtual void SetUp() {}
-
-  virtual void TearDown() {
-    pool_.Clear();
-  }
-  void WriteValidateCallback(int num_writes, DiskIoMgr::WriteRange** written_range,
-      DiskIoMgr* io_mgr, DiskIoRequestContext* reader, int32_t* data,
-      Status expected_status, const Status& status) {
-    if (expected_status.code() == TErrorCode::CANCELLED) {
-      EXPECT_TRUE(status.ok() || status.IsCancelled()) << "Error: " << status.GetDetail();
-    } else {
-      EXPECT_EQ(status.code(), expected_status.code());
-    }
-    if (status.ok()) {
-      DiskIoMgr::ScanRange* scan_range = pool_.Add(new DiskIoMgr::ScanRange());
-      scan_range->Reset(nullptr, (*written_range)->file(), (*written_range)->len(),
-          (*written_range)->offset(), 0, false, DiskIoMgr::BufferOpts::Uncached());
-      ValidateSyncRead(io_mgr, reader, scan_range, reinterpret_cast<const char*>(data),
-          sizeof(int32_t));
-    }
-
-    {
-      lock_guard<mutex> l(written_mutex_);
-      ++num_ranges_written_;
-      if (num_ranges_written_ == num_writes) writes_done_.NotifyOne();
-    }
-  }
-
-  void WriteCompleteCallback(int num_writes, const Status& status) {
-    EXPECT_OK(status);
-    {
-      lock_guard<mutex> l(written_mutex_);
-      ++num_ranges_written_;
-      if (num_ranges_written_ == num_writes) writes_done_.NotifyAll();
-    }
-  }
-
- protected:
-  void CreateTempFile(const char* filename, const char* data) {
-    FILE* file = fopen(filename, "w");
-    EXPECT_TRUE(file != nullptr);
-    fwrite(data, 1, strlen(data), file);
-    fclose(file);
-  }
-
-  int CreateTempFile(const char* filename, int file_size) {
-    FILE* file = fopen(filename, "w");
-    EXPECT_TRUE(file != nullptr);
-    int success = fclose(file);
-    if (success != 0) {
-      LOG(ERROR) << "Error closing file " << filename;
-      return success;
-    }
-    return truncate(filename, file_size);
-  }
-
-  // Validates that buffer[i] is \0 or expected[i]
-  static void ValidateEmptyOrCorrect(const char* expected, const char* buffer, int len) {
-    for (int i = 0; i < len; ++i) {
-      if (buffer[i] != '\0') {
-        EXPECT_EQ(expected[i], buffer[i]) << (int)expected[i] << " != " << (int)buffer[i];
-      }
-    }
-  }
-
-  static void ValidateSyncRead(DiskIoMgr* io_mgr, DiskIoRequestContext* reader,
-      DiskIoMgr::ScanRange* range, const char* expected, int expected_len = -1) {
-    unique_ptr<DiskIoMgr::BufferDescriptor> buffer;
-    ASSERT_OK(io_mgr->Read(reader, range, &buffer));
-    ASSERT_TRUE(buffer != nullptr);
-    EXPECT_EQ(buffer->len(), range->len());
-    if (expected_len < 0) expected_len = strlen(expected);
-    int cmp = memcmp(buffer->buffer(), expected, expected_len);
-    EXPECT_TRUE(cmp == 0);
-    io_mgr->ReturnBuffer(move(buffer));
-  }
-
-  static void ValidateScanRange(DiskIoMgr* io_mgr, DiskIoMgr::ScanRange* range,
-      const char* expected, int expected_len, const Status& expected_status) {
-    char result[expected_len + 1];
-    memset(result, 0, expected_len + 1);
-
-    while (true) {
-      unique_ptr<DiskIoMgr::BufferDescriptor> buffer;
-      Status status = range->GetNext(&buffer);
-      ASSERT_TRUE(status.ok() || status.code() == expected_status.code());
-      if (buffer == nullptr || !status.ok()) {
-        if (buffer != nullptr) io_mgr->ReturnBuffer(move(buffer));
-        break;
-      }
-      ASSERT_LE(buffer->len(), expected_len);
-      memcpy(result + range->offset() + buffer->scan_range_offset(),
-          buffer->buffer(), buffer->len());
-      io_mgr->ReturnBuffer(move(buffer));
-    }
-    ValidateEmptyOrCorrect(expected, result, expected_len);
-  }
-
-  // Continues pulling scan ranges from the io mgr until they are all done.
-  // Updates num_ranges_processed with the number of ranges seen by this thread.
-  static void ScanRangeThread(DiskIoMgr* io_mgr, DiskIoRequestContext* reader,
-      const char* expected_result, int expected_len, const Status& expected_status,
-      int max_ranges, AtomicInt32* num_ranges_processed) {
-    int num_ranges = 0;
-    while (max_ranges == 0 || num_ranges < max_ranges) {
-      DiskIoMgr::ScanRange* range;
-      Status status = io_mgr->GetNextRange(reader, &range);
-      ASSERT_TRUE(status.ok() || status.code() == expected_status.code());
-      if (range == nullptr) break;
-      ValidateScanRange(io_mgr, range, expected_result, expected_len, expected_status);
-      num_ranges_processed->Add(1);
-      ++num_ranges;
-    }
-  }
-
-  DiskIoMgr::ScanRange* AllocateRange() {
-    return pool_.Add(new DiskIoMgr::ScanRange);
-  }
-
-  DiskIoMgr::ScanRange* InitRange(const char* file_path, int offset, int len,
-      int disk_id, int64_t mtime, void* meta_data = nullptr, bool is_cached = false) {
-    DiskIoMgr::ScanRange* range = AllocateRange();
-    range->Reset(nullptr, file_path, len, offset, disk_id, true,
-        DiskIoMgr::BufferOpts(is_cached, mtime), meta_data);
-    EXPECT_EQ(mtime, range->mtime());
-    return range;
-  }
-
-  ObjectPool pool_;
-
-  mutex written_mutex_;
-  ConditionVariable writes_done_;
-  int num_ranges_written_;
-};
-
-// Test a single writer with multiple disks and threads per disk. Each WriteRange
-// writes random 4-byte integers, and upon completion, the written data is validated
-// by reading the data back via a separate IoMgr instance. All writes are expected to
-// complete successfully.
-TEST_F(DiskIoMgrTest, SingleWriter) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  num_ranges_written_ = 0;
-  string tmp_file = "/tmp/disk_io_mgr_test.txt";
-  int num_ranges = 100;
-  int64_t file_size = 1024 * 1024;
-  int64_t cur_offset = 0;
-  int success = CreateTempFile(tmp_file.c_str(), file_size);
-  if (success != 0) {
-    LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size " <<
-        file_size;
-    EXPECT_TRUE(false);
-  }
-
-  scoped_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 1, 10));
-  MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
-  ASSERT_OK(read_io_mgr->Init(&reader_mem_tracker));
-  unique_ptr<DiskIoRequestContext> reader =
-      read_io_mgr->RegisterContext(&reader_mem_tracker);
-  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-      pool_.Clear(); // Destroy scan ranges from previous iterations.
-      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 10);
-      ASSERT_OK(io_mgr.Init(&mem_tracker));
-      unique_ptr<DiskIoRequestContext> writer = io_mgr.RegisterContext(&mem_tracker);
-      for (int i = 0; i < num_ranges; ++i) {
-        int32_t* data = pool_.Add(new int32_t);
-        *data = rand();
-        DiskIoMgr::WriteRange** new_range = pool_.Add(new DiskIoMgr::WriteRange*);
-        DiskIoMgr::WriteRange::WriteDoneCallback callback =
-            bind(mem_fn(&DiskIoMgrTest::WriteValidateCallback), this, num_ranges,
-                new_range, read_io_mgr.get(), reader.get(), data, Status::OK(), _1);
-        *new_range = pool_.Add(new DiskIoMgr::WriteRange(
-            tmp_file, cur_offset, num_ranges % num_disks, callback));
-        (*new_range)->SetData(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
-        EXPECT_OK(io_mgr.AddWriteRange(writer.get(), *new_range));
-        cur_offset += sizeof(int32_t);
-      }
-
-      {
-        unique_lock<mutex> lock(written_mutex_);
-        while (num_ranges_written_ < num_ranges) writes_done_.Wait(lock);
-      }
-      num_ranges_written_ = 0;
-      io_mgr.UnregisterContext(writer.get());
-    }
-  }
-
-  read_io_mgr->UnregisterContext(reader.get());
-  read_io_mgr.reset();
-}
-
-// Perform invalid writes (e.g. file in non-existent directory, negative offset) and
-// validate that an error status is returned via the write callback.
-TEST_F(DiskIoMgrTest, InvalidWrite) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  num_ranges_written_ = 0;
-  string tmp_file = "/non-existent/file.txt";
-  DiskIoMgr io_mgr(1, 1, 1, 1, 10);
-  ASSERT_OK(io_mgr.Init(&mem_tracker));
-  unique_ptr<DiskIoRequestContext> writer = io_mgr.RegisterContext(nullptr);
-  int32_t* data = pool_.Add(new int32_t);
-  *data = rand();
-
-  // Write to file in non-existent directory.
-  DiskIoMgr::WriteRange** new_range = pool_.Add(new DiskIoMgr::WriteRange*);
-  DiskIoMgr::WriteRange::WriteDoneCallback callback =
-      bind(mem_fn(&DiskIoMgrTest::WriteValidateCallback), this, 2, new_range,
-          (DiskIoMgr*)nullptr, (DiskIoRequestContext*)nullptr, data,
-          Status(TErrorCode::DISK_IO_ERROR, "Test Failure"), _1);
-  *new_range = pool_.Add(new DiskIoMgr::WriteRange(tmp_file, rand(), 0, callback));
-
-  (*new_range)->SetData(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
-  EXPECT_OK(io_mgr.AddWriteRange(writer.get(), *new_range));
-
-  // Write to a bad location in a file that exists.
-  tmp_file = "/tmp/disk_io_mgr_test.txt";
-  int success = CreateTempFile(tmp_file.c_str(), 100);
-  if (success != 0) {
-    LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size 100";
-    EXPECT_TRUE(false);
-  }
-
-  new_range = pool_.Add(new DiskIoMgr::WriteRange*);
-  callback = bind(mem_fn(&DiskIoMgrTest::WriteValidateCallback), this, 2,
-      new_range, (DiskIoMgr*)nullptr, (DiskIoRequestContext*)nullptr,
-      data, Status(TErrorCode::DISK_IO_ERROR, "Test Failure"), _1);
-
-  *new_range = pool_.Add(new DiskIoMgr::WriteRange(tmp_file, -1, 0, callback));
-  (*new_range)->SetData(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
-  EXPECT_OK(io_mgr.AddWriteRange(writer.get(), *new_range));
-
-  {
-    unique_lock<mutex> lock(written_mutex_);
-    while (num_ranges_written_ < 2) writes_done_.Wait(lock);
-  }
-  num_ranges_written_ = 0;
-  io_mgr.UnregisterContext(writer.get());
-}
-
-// Issue a number of writes, cancel the writer context and issue more writes.
-// AddWriteRange() is expected to succeed before the cancel and fail after it.
-// The writes themselves may finish with either CANCELLED or OK status.
-TEST_F(DiskIoMgrTest, SingleWriterCancel) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  num_ranges_written_ = 0;
-  string tmp_file = "/tmp/disk_io_mgr_test.txt";
-  int num_ranges = 100;
-  int num_ranges_before_cancel = 25;
-  int64_t file_size = 1024 * 1024;
-  int64_t cur_offset = 0;
-  int success = CreateTempFile(tmp_file.c_str(), file_size);
-  if (success != 0) {
-    LOG(ERROR) << "Error creating temp file " << tmp_file.c_str() << " of size " <<
-        file_size;
-    EXPECT_TRUE(false);
-  }
-
-  scoped_ptr<DiskIoMgr> read_io_mgr(new DiskIoMgr(1, 1, 1, 1, 10));
-  MemTracker reader_mem_tracker(LARGE_MEM_LIMIT);
-  ASSERT_OK(read_io_mgr->Init(&reader_mem_tracker));
-  unique_ptr<DiskIoRequestContext> reader =
-      read_io_mgr->RegisterContext(&reader_mem_tracker);
-  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-      pool_.Clear(); // Destroy scan ranges from previous iterations.
-      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 10);
-      ASSERT_OK(io_mgr.Init(&mem_tracker));
-      unique_ptr<DiskIoRequestContext> writer = io_mgr.RegisterContext(&mem_tracker);
-      Status validate_status = Status::OK();
-      for (int i = 0; i < num_ranges; ++i) {
-        if (i == num_ranges_before_cancel) {
-          io_mgr.CancelContext(writer.get());
-          validate_status = Status::CANCELLED;
-        }
-        int32_t* data = pool_.Add(new int32_t);
-        *data = rand();
-        DiskIoMgr::WriteRange** new_range = pool_.Add(new DiskIoMgr::WriteRange*);
-        DiskIoMgr::WriteRange::WriteDoneCallback callback = bind(
-            mem_fn(&DiskIoMgrTest::WriteValidateCallback), this, num_ranges_before_cancel,
-            new_range, read_io_mgr.get(), reader.get(), data, Status::CANCELLED, _1);
-        *new_range = pool_.Add(new DiskIoMgr::WriteRange(
-            tmp_file, cur_offset, num_ranges % num_disks, callback));
-        (*new_range)->SetData(reinterpret_cast<uint8_t*>(data), sizeof(int32_t));
-        cur_offset += sizeof(int32_t);
-        Status add_status = io_mgr.AddWriteRange(writer.get(), *new_range);
-        EXPECT_TRUE(add_status.code() == validate_status.code());
-      }
-
-      {
-        unique_lock<mutex> lock(written_mutex_);
-        while (num_ranges_written_ < num_ranges_before_cancel) writes_done_.Wait(lock);
-      }
-      num_ranges_written_ = 0;
-      io_mgr.UnregisterContext(writer.get());
-    }
-  }
-
-  read_io_mgr->UnregisterContext(reader.get());
-  read_io_mgr.reset();
-}
-
-// Basic test with a single reader, exercising varying numbers of disks, threads
-// per disk and reading threads.
-TEST_F(DiskIoMgrTest, SingleReader) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-  const char* data = "abcdefghijklm";
-  int len = strlen(data);
-  CreateTempFile(tmp_file, data);
-
-  // Get mtime for file
-  struct stat stat_val;
-  stat(tmp_file, &stat_val);
-
-  int64_t iters = 0;
-  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-      for (int num_read_threads = 1; num_read_threads <= 5; ++num_read_threads) {
-        ObjectPool pool;
-        LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
-                  << " num_disk=" << num_disks
-                  << " num_read_threads=" << num_read_threads;
-
-        if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
-        DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 1);
-
-        ASSERT_OK(io_mgr.Init(&mem_tracker));
-        MemTracker reader_mem_tracker;
-        unique_ptr<DiskIoRequestContext> reader =
-            io_mgr.RegisterContext(&reader_mem_tracker);
-
-        vector<DiskIoMgr::ScanRange*> ranges;
-        for (int i = 0; i < len; ++i) {
-          int disk_id = i % num_disks;
-          ranges.push_back(InitRange(tmp_file, 0, len, disk_id, stat_val.st_mtime));
-        }
-        ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
-
-        AtomicInt32 num_ranges_processed;
-        thread_group threads;
-        for (int i = 0; i < num_read_threads; ++i) {
-          threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data, len,
-              Status::OK(), 0, &num_ranges_processed));
-        }
-        threads.join_all();
-
-        EXPECT_EQ(num_ranges_processed.Load(), ranges.size());
-        io_mgr.UnregisterContext(reader.get());
-        EXPECT_EQ(reader_mem_tracker.consumption(), 0);
-      }
-    }
-  }
-  EXPECT_EQ(mem_tracker.consumption(), 0);
-}
-
-// This test adds additional scan ranges while some are still in flight.
-TEST_F(DiskIoMgrTest, AddScanRangeTest) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-  const char* data = "abcdefghijklm";
-  int len = strlen(data);
-  CreateTempFile(tmp_file, data);
-
-  // Get mtime for file
-  struct stat stat_val;
-  stat(tmp_file, &stat_val);
-
-  int64_t iters = 0;
-  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-      pool_.Clear(); // Destroy scan ranges from previous iterations.
-      LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
-                << " num_disk=" << num_disks;
-
-      if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
-      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 1);
-
-      ASSERT_OK(io_mgr.Init(&mem_tracker));
-      MemTracker reader_mem_tracker;
-      unique_ptr<DiskIoRequestContext> reader =
-          io_mgr.RegisterContext(&reader_mem_tracker);
-
-      vector<DiskIoMgr::ScanRange*> ranges_first_half;
-      vector<DiskIoMgr::ScanRange*> ranges_second_half;
-      for (int i = 0; i < len; ++i) {
-        int disk_id = i % num_disks;
-        if (i > len / 2) {
-          ranges_second_half.push_back(
-              InitRange(tmp_file, i, 1, disk_id, stat_val.st_mtime));
-        } else {
-          ranges_first_half.push_back(
-              InitRange(tmp_file, i, 1, disk_id, stat_val.st_mtime));
-        }
-      }
-      AtomicInt32 num_ranges_processed;
-
-      // Issue the first half of the scan ranges.
-      ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges_first_half));
-
-      // Read a couple of them
-      ScanRangeThread(&io_mgr, reader.get(), data, strlen(data), Status::OK(), 2,
-          &num_ranges_processed);
-
-      // Issue second half
-      ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges_second_half));
-
-      // Start up some threads and then cancel
-      thread_group threads;
-      for (int i = 0; i < 3; ++i) {
-        threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data,
-            strlen(data), Status::CANCELLED, 0, &num_ranges_processed));
-      }
-
-      threads.join_all();
-      EXPECT_EQ(num_ranges_processed.Load(), len);
-      io_mgr.UnregisterContext(reader.get());
-      EXPECT_EQ(reader_mem_tracker.consumption(), 0);
-    }
-  }
-  EXPECT_EQ(mem_tracker.consumption(), 0);
-}
-
-// Tests that sync reads and async reads work together.
-// Note: this test is constructed so that the number of buffers is greater than the
-// number of scan ranges.
-TEST_F(DiskIoMgrTest, SyncReadTest) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-  const char* data = "abcdefghijklm";
-  int len = strlen(data);
-  CreateTempFile(tmp_file, data);
-
-  // Get mtime for file
-  struct stat stat_val;
-  stat(tmp_file, &stat_val);
-
-  int64_t iters = 0;
-  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-      pool_.Clear(); // Destroy scan ranges from previous iterations.
-      LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
-                << " num_disk=" << num_disks;
-
-      if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
-      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk,
-          MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-
-      ASSERT_OK(io_mgr.Init(&mem_tracker));
-      MemTracker reader_mem_tracker;
-      unique_ptr<DiskIoRequestContext> reader =
-          io_mgr.RegisterContext(&reader_mem_tracker);
-
-      DiskIoMgr::ScanRange* complete_range =
-          InitRange(tmp_file, 0, strlen(data), 0, stat_val.st_mtime);
-
-      // Do a couple of sync reads before the async ranges are issued
-      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-
-      vector<DiskIoMgr::ScanRange*> ranges;
-      for (int i = 0; i < len; ++i) {
-        int disk_id = i % num_disks;
-        ranges.push_back(InitRange(tmp_file, 0, len, disk_id, stat_val.st_mtime));
-      }
-      ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
-
-      AtomicInt32 num_ranges_processed;
-      thread_group threads;
-      for (int i = 0; i < 5; ++i) {
-        threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data,
-            strlen(data), Status::OK(), 0, &num_ranges_processed));
-      }
-
-      // Issue some more sync ranges
-      for (int i = 0; i < 5; ++i) {
-        sched_yield();
-        ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-      }
-
-      threads.join_all();
-
-      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-
-      EXPECT_EQ(num_ranges_processed.Load(), ranges.size());
-      io_mgr.UnregisterContext(reader.get());
-      EXPECT_EQ(reader_mem_tracker.consumption(), 0);
-    }
-  }
-  EXPECT_EQ(mem_tracker.consumption(), 0);
-}
-
-// Tests a single reader cancelling half way through scan ranges.
-TEST_F(DiskIoMgrTest, SingleReaderCancel) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-  const char* data = "abcdefghijklm";
-  int len = strlen(data);
-  CreateTempFile(tmp_file, data);
-
-  // Get mtime for file
-  struct stat stat_val;
-  stat(tmp_file, &stat_val);
-
-  int64_t iters = 0;
-  for (int num_threads_per_disk = 1; num_threads_per_disk <= 5; ++num_threads_per_disk) {
-    for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-      pool_.Clear(); // Destroy scan ranges from previous iterations.
-      LOG(INFO) << "Starting test with num_threads_per_disk=" << num_threads_per_disk
-                << " num_disk=" << num_disks;
-
-      if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
-      DiskIoMgr io_mgr(num_disks, num_threads_per_disk, num_threads_per_disk, 1, 1);
-
-      ASSERT_OK(io_mgr.Init(&mem_tracker));
-      MemTracker reader_mem_tracker;
-      unique_ptr<DiskIoRequestContext> reader =
-          io_mgr.RegisterContext(&reader_mem_tracker);
-
-      vector<DiskIoMgr::ScanRange*> ranges;
-      for (int i = 0; i < len; ++i) {
-        int disk_id = i % num_disks;
-        ranges.push_back(InitRange(tmp_file, 0, len, disk_id, stat_val.st_mtime));
-      }
-      ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
-
-      AtomicInt32 num_ranges_processed;
-      int num_successful_ranges = ranges.size() / 2;
-      // Read half the ranges
-      for (int i = 0; i < num_successful_ranges; ++i) {
-        ScanRangeThread(&io_mgr, reader.get(), data, strlen(data), Status::OK(), 1,
-            &num_ranges_processed);
-      }
-      EXPECT_EQ(num_ranges_processed.Load(), num_successful_ranges);
-
-      // Start up some threads and then cancel
-      thread_group threads;
-      for (int i = 0; i < 3; ++i) {
-        threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data,
-            strlen(data), Status::CANCELLED, 0, &num_ranges_processed));
-      }
-
-      io_mgr.CancelContext(reader.get());
-      sched_yield();
-
-      threads.join_all();
-      EXPECT_TRUE(io_mgr.context_status(reader.get()).IsCancelled());
-      io_mgr.UnregisterContext(reader.get());
-      EXPECT_EQ(reader_mem_tracker.consumption(), 0);
-    }
-  }
-  EXPECT_EQ(mem_tracker.consumption(), 0);
-}
-
-// Test when the reader goes over the mem limit
-TEST_F(DiskIoMgrTest, MemLimits) {
-  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-  const char* data = "abcdefghijklm";
-  int len = strlen(data);
-  CreateTempFile(tmp_file, data);
-
-  // Get mtime for file
-  struct stat stat_val;
-  stat(tmp_file, &stat_val);
-
-  const int mem_limit_num_buffers = 2;
-  // Allocate enough ranges so that the total buffer memory exceeds the mem limit.
-  const int num_ranges = 25;
-  {
-    MemTracker root_mem_tracker(mem_limit_num_buffers * MAX_BUFFER_SIZE);
-    DiskIoMgr io_mgr(1, 1, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-
-    ASSERT_OK(io_mgr.Init(&root_mem_tracker));
-    MemTracker reader_mem_tracker(-1, "Reader", &root_mem_tracker);
-    unique_ptr<DiskIoRequestContext> reader = io_mgr.RegisterContext(&reader_mem_tracker);
-
-    vector<DiskIoMgr::ScanRange*> ranges;
-    for (int i = 0; i < num_ranges; ++i) {
-      ranges.push_back(InitRange(tmp_file, 0, len, 0, stat_val.st_mtime));
-    }
-    ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
-
-    // Don't return buffers to force memory pressure
-    vector<unique_ptr<DiskIoMgr::BufferDescriptor>> buffers;
-
-    AtomicInt32 num_ranges_processed;
-    ScanRangeThread(&io_mgr, reader.get(), data, strlen(data), Status::MemLimitExceeded(),
-        1, &num_ranges_processed);
-
-    char result[strlen(data) + 1];
-    // Keep reading new ranges without returning buffers. This forces us
-    // to go over the limit eventually.
-    while (true) {
-      memset(result, 0, strlen(data) + 1);
-      DiskIoMgr::ScanRange* range = nullptr;
-      Status status = io_mgr.GetNextRange(reader.get(), &range);
-      ASSERT_TRUE(status.ok() || status.IsMemLimitExceeded());
-      if (range == nullptr) break;
-
-      while (true) {
-        unique_ptr<DiskIoMgr::BufferDescriptor> buffer;
-        Status status = range->GetNext(&buffer);
-        ASSERT_TRUE(status.ok() || status.IsMemLimitExceeded());
-        if (buffer == nullptr) break;
-        memcpy(result + range->offset() + buffer->scan_range_offset(),
-            buffer->buffer(), buffer->len());
-        buffers.push_back(move(buffer));
-      }
-      ValidateEmptyOrCorrect(data, result, strlen(data));
-    }
-
-    for (int i = 0; i < buffers.size(); ++i) {
-      io_mgr.ReturnBuffer(move(buffers[i]));
-    }
-
-    EXPECT_TRUE(io_mgr.context_status(reader.get()).IsMemLimitExceeded());
-    io_mgr.UnregisterContext(reader.get());
-    EXPECT_EQ(reader_mem_tracker.consumption(), 0);
-  }
-}
-
-// Test when some scan ranges are marked as being cached.
-// Since these files are not in HDFS, the cached path always fails, so this
-// only tests the fallback mechanism.
-// TODO: we can fake the cached read path without HDFS
-TEST_F(DiskIoMgrTest, CachedReads) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-  const char* data = "abcdefghijklm";
-  int len = strlen(data);
-  CreateTempFile(tmp_file, data);
-
-  // Get mtime for file
-  struct stat stat_val;
-  stat(tmp_file, &stat_val);
-
-  const int num_disks = 2;
-  {
-    DiskIoMgr io_mgr(num_disks, 1, 1, MIN_BUFFER_SIZE, MAX_BUFFER_SIZE);
-
-    ASSERT_OK(io_mgr.Init(&mem_tracker));
-    MemTracker reader_mem_tracker;
-    unique_ptr<DiskIoRequestContext> reader = io_mgr.RegisterContext(&reader_mem_tracker);
-
-    DiskIoMgr::ScanRange* complete_range =
-        InitRange(tmp_file, 0, strlen(data), 0, stat_val.st_mtime, nullptr, true);
-
-    // Do a couple of sync reads before the async ranges are issued
-    ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-    ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-
-    vector<DiskIoMgr::ScanRange*> ranges;
-    for (int i = 0; i < len; ++i) {
-      int disk_id = i % num_disks;
-      ranges.push_back(
-          InitRange(tmp_file, 0, len, disk_id, stat_val.st_mtime, nullptr, true));
-    }
-    ASSERT_OK(io_mgr.AddScanRanges(reader.get(), ranges));
-
-    AtomicInt32 num_ranges_processed;
-    thread_group threads;
-    for (int i = 0; i < 5; ++i) {
-      threads.add_thread(new thread(ScanRangeThread, &io_mgr, reader.get(), data,
-          strlen(data), Status::OK(), 0, &num_ranges_processed));
-    }
-
-    // Issue some more sync ranges
-    for (int i = 0; i < 5; ++i) {
-      sched_yield();
-      ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-    }
-
-    threads.join_all();
-
-    ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-    ValidateSyncRead(&io_mgr, reader.get(), complete_range, data);
-
-    EXPECT_EQ(num_ranges_processed.Load(), ranges.size());
-    io_mgr.UnregisterContext(reader.get());
-    EXPECT_EQ(reader_mem_tracker.consumption(), 0);
-  }
-  EXPECT_EQ(mem_tracker.consumption(), 0);
-}
-
-TEST_F(DiskIoMgrTest, MultipleReaderWriter) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const int ITERATIONS = 1;
-  const char* data = "abcdefghijklmnopqrstuvwxyz";
-  const int num_contexts = 5;
-  const int file_size = 4 * 1024;
-  const int num_writes_queued = 5;
-  const int num_reads_queued = 5;
-
-  string file_name = "/tmp/disk_io_mgr_test.txt";
-  int success = CreateTempFile(file_name.c_str(), file_size);
-  ASSERT_EQ(0, success) << "Error creating temp file " << file_name << " of size "
-      << file_size;
-
-  // Get mtime for file
-  struct stat stat_val;
-  stat(file_name.c_str(), &stat_val);
-
-  int64_t iters = 0;
-  vector<unique_ptr<DiskIoRequestContext>> contexts(num_contexts);
-  Status status;
-  for (int iteration = 0; iteration < ITERATIONS; ++iteration) {
-    for (int threads_per_disk = 1; threads_per_disk <= 5; ++threads_per_disk) {
-      for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-        DiskIoMgr io_mgr(num_disks, threads_per_disk, threads_per_disk, MIN_BUFFER_SIZE,
-            MAX_BUFFER_SIZE);
-        ASSERT_OK(io_mgr.Init(&mem_tracker));
-        for (int file_index = 0; file_index < num_contexts; ++file_index) {
-          contexts[file_index] = io_mgr.RegisterContext(&mem_tracker);
-        }
-        pool_.Clear();
-        int read_offset = 0;
-        int write_offset = 0;
-        while (read_offset < file_size) {
-          for (int context_index = 0; context_index < num_contexts; ++context_index) {
-            if (++iters % 5000 == 0) LOG(ERROR) << "Starting iteration " << iters;
-            AtomicInt32 num_ranges_processed;
-            thread_group threads;
-            vector<DiskIoMgr::ScanRange*> ranges;
-            int num_scan_ranges = min<int>(num_reads_queued, write_offset - read_offset);
-            for (int i = 0; i < num_scan_ranges; ++i) {
-              ranges.push_back(InitRange(
-                  file_name.c_str(), read_offset, 1, i % num_disks, stat_val.st_mtime));
-              threads.add_thread(
-                  new thread(ScanRangeThread, &io_mgr, contexts[context_index].get(),
-                      reinterpret_cast<const char*>(data + (read_offset % strlen(data))),
-                      1, Status::OK(), num_scan_ranges, &num_ranges_processed));
-              ++read_offset;
-            }
-
-            num_ranges_written_ = 0;
-            int num_write_ranges = min<int>(num_writes_queued, file_size - write_offset);
-            for (int i = 0; i < num_write_ranges; ++i) {
-              DiskIoMgr::WriteRange::WriteDoneCallback callback =
-                  bind(mem_fn(&DiskIoMgrTest::WriteCompleteCallback),
-                      this, num_write_ranges, _1);
-              DiskIoMgr::WriteRange* new_range = pool_.Add(new DiskIoMgr::WriteRange(
-                  file_name, write_offset, i % num_disks, callback));
-              new_range->SetData(
-                  reinterpret_cast<const uint8_t*>(data + (write_offset % strlen(data))),
-                  1);
-              status = io_mgr.AddWriteRange(contexts[context_index].get(), new_range);
-              ++write_offset;
-            }
-
-            {
-              unique_lock<mutex> lock(written_mutex_);
-              while (num_ranges_written_ < num_write_ranges) writes_done_.Wait(lock);
-            }
-
-            threads.join_all();
-          } // for (int context_index
-        } // while (read_offset < file_size)
-
-        for (int file_index = 0; file_index < num_contexts; ++file_index) {
-          io_mgr.UnregisterContext(contexts[file_index].get());
-        }
-      } // for (int num_disks
-    } // for (int threads_per_disk
-  } // for (int iteration
-}
-
-// This test runs multiple concurrent readers, each reading a different file.
-TEST_F(DiskIoMgrTest, MultipleReader) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const int NUM_READERS = 5;
-  const int DATA_LEN = 50;
-  const int ITERATIONS = 25;
-  const int NUM_THREADS_PER_READER = 3;
-
-  vector<string> file_names;
-  vector<int64_t> mtimes;
-  vector<string> data;
-  vector<unique_ptr<DiskIoRequestContext>> readers;
-  vector<char*> results;
-
-  file_names.resize(NUM_READERS);
-  readers.resize(NUM_READERS);
-  mtimes.resize(NUM_READERS);
-  data.resize(NUM_READERS);
-  results.resize(NUM_READERS);
-
-  // Initialize data for each reader. The data will be 'abcd...' for reader one,
-  // 'bcde...' for reader two, and so on (wrapping around at 'z').
-  for (int i = 0; i < NUM_READERS; ++i) {
-    char buf[DATA_LEN];
-    for (int j = 0; j < DATA_LEN; ++j) {
-      int c = (j + i) % 26;
-      buf[j] = 'a' + c;
-    }
-    data[i] = string(buf, DATA_LEN);
-
-    stringstream ss;
-    ss << "/tmp/disk_io_mgr_test" << i << ".txt";
-    file_names[i] = ss.str();
-    CreateTempFile(ss.str().c_str(), data[i].c_str());
-
-    // Get mtime for file
-    struct stat stat_val;
-    stat(file_names[i].c_str(), &stat_val);
-    mtimes[i] = stat_val.st_mtime;
-
-    results[i] = new char[DATA_LEN + 1];
-    memset(results[i], 0, DATA_LEN + 1);
-  }
-
-  // This exercises concurrency; run the test multiple times.
-  int64_t iters = 0;
-  for (int iteration = 0; iteration < ITERATIONS; ++iteration) {
-    for (int threads_per_disk = 1; threads_per_disk <= 5; ++threads_per_disk) {
-      for (int num_disks = 1; num_disks <= 5; num_disks += 2) {
-        pool_.Clear(); // Destroy scan ranges from previous iterations.
-        LOG(INFO) << "Starting test with num_threads_per_disk=" << threads_per_disk
-                  << " num_disk=" << num_disks;
-        if (++iters % 2500 == 0) LOG(ERROR) << "Starting iteration " << iters;
-
-        DiskIoMgr io_mgr(num_disks, threads_per_disk, threads_per_disk, MIN_BUFFER_SIZE,
-            MAX_BUFFER_SIZE);
-        EXPECT_OK(io_mgr.Init(&mem_tracker));
-
-        for (int i = 0; i < NUM_READERS; ++i) {
-          readers[i] = io_mgr.RegisterContext(&mem_tracker);
-
-          vector<DiskIoMgr::ScanRange*> ranges;
-          for (int j = 0; j < DATA_LEN; ++j) {
-            int disk_id = j % num_disks;
-            ranges.push_back(InitRange(file_names[i].c_str(), j, 1, disk_id, mtimes[i]));
-          }
-          ASSERT_OK(io_mgr.AddScanRanges(readers[i].get(), ranges));
-        }
-
-        AtomicInt32 num_ranges_processed;
-        thread_group threads;
-        for (int i = 0; i < NUM_READERS; ++i) {
-          for (int j = 0; j < NUM_THREADS_PER_READER; ++j) {
-            threads.add_thread(new thread(ScanRangeThread, &io_mgr, readers[i].get(),
-                data[i].c_str(), data[i].size(), Status::OK(), 0, &num_ranges_processed));
-          }
-        }
-        threads.join_all();
-        EXPECT_EQ(num_ranges_processed.Load(), DATA_LEN * NUM_READERS);
-        for (int i = 0; i < NUM_READERS; ++i) {
-          io_mgr.UnregisterContext(readers[i].get());
-        }
-      }
-    }
-  }
-  EXPECT_EQ(mem_tracker.consumption(), 0);
-}
-
-// Stress test for multiple clients with cancellation
-// TODO: the stress app should be expanded to include sync reads and adding scan
-// ranges in the middle.
-TEST_F(DiskIoMgrTest, StressTest) {
-  // Run the test with 5 disks, 5 threads per disk, 10 clients, and cancellation enabled
-  DiskIoMgrStress test(5, 5, 10, true);
-  test.Run(2); // In seconds
-}
-
-TEST_F(DiskIoMgrTest, Buffers) {
-  // Test default min/max buffer size
-  int min_buffer_size = 1024;
-  int max_buffer_size = 8 * 1024 * 1024; // 8 MB
-  MemTracker root_mem_tracker(max_buffer_size * 2);
-
-  DiskIoMgr io_mgr(1, 1, 1, min_buffer_size, max_buffer_size);
-  ASSERT_OK(io_mgr.Init(&root_mem_tracker));
-  ASSERT_EQ(root_mem_tracker.consumption(), 0);
-
-  MemTracker reader_mem_tracker(-1, "Reader", &root_mem_tracker);
-  unique_ptr<DiskIoRequestContext> reader;
-  reader = io_mgr.RegisterContext(&reader_mem_tracker);
-
-  DiskIoMgr::ScanRange* dummy_range = InitRange("dummy", 0, 0, 0, 0);
-
-  // buffer length should be rounded up to min buffer size
-  int64_t buffer_len = 1;
-  unique_ptr<DiskIoMgr::BufferDescriptor> buffer_desc;
-  buffer_desc = io_mgr.GetFreeBuffer(reader.get(), dummy_range, buffer_len);
-  EXPECT_TRUE(buffer_desc->buffer() != nullptr);
-  EXPECT_EQ(min_buffer_size, buffer_desc->buffer_len());
-  EXPECT_EQ(1, io_mgr.num_allocated_buffers_.Load());
-  io_mgr.FreeBufferMemory(buffer_desc.get());
-  io_mgr.ReturnBuffer(move(buffer_desc));
-  EXPECT_EQ(min_buffer_size, root_mem_tracker.consumption());
-
-  // reuse buffer
-  buffer_len = min_buffer_size;
-  buffer_desc = io_mgr.GetFreeBuffer(reader.get(), dummy_range, buffer_len);
-  EXPECT_TRUE(buffer_desc->buffer() != nullptr);
-  EXPECT_EQ(min_buffer_size, buffer_desc->buffer_len());
-  EXPECT_EQ(1, io_mgr.num_allocated_buffers_.Load());
-  io_mgr.FreeBufferMemory(buffer_desc.get());
-  io_mgr.ReturnBuffer(move(buffer_desc));
-  EXPECT_EQ(min_buffer_size, root_mem_tracker.consumption());
-
-  // bump up to next buffer size
-  buffer_len = min_buffer_size + 1;
-  buffer_desc = io_mgr.GetFreeBuffer(reader.get(), dummy_range, buffer_len);
-  EXPECT_TRUE(buffer_desc->buffer() != nullptr);
-  EXPECT_EQ(min_buffer_size * 2, buffer_desc->buffer_len());
-  EXPECT_EQ(2, io_mgr.num_allocated_buffers_.Load());
-  EXPECT_EQ(min_buffer_size * 3, root_mem_tracker.consumption());
-
-  // gc unused buffer
-  io_mgr.GcIoBuffers();
-  EXPECT_EQ(1, io_mgr.num_allocated_buffers_.Load());
-  EXPECT_EQ(min_buffer_size * 2, root_mem_tracker.consumption());
-
-  io_mgr.FreeBufferMemory(buffer_desc.get());
-  io_mgr.ReturnBuffer(move(buffer_desc));
-
-  // max buffer size
-  buffer_len = max_buffer_size;
-  buffer_desc = io_mgr.GetFreeBuffer(reader.get(), dummy_range, buffer_len);
-  EXPECT_TRUE(buffer_desc->buffer() != nullptr);
-  EXPECT_EQ(max_buffer_size, buffer_desc->buffer_len());
-  EXPECT_EQ(2, io_mgr.num_allocated_buffers_.Load());
-  io_mgr.FreeBufferMemory(buffer_desc.get());
-  io_mgr.ReturnBuffer(move(buffer_desc));
-  EXPECT_EQ(min_buffer_size * 2 + max_buffer_size, root_mem_tracker.consumption());
-
-  // gc buffers
-  io_mgr.GcIoBuffers();
-  EXPECT_EQ(io_mgr.num_allocated_buffers_.Load(), 0);
-  EXPECT_EQ(root_mem_tracker.consumption(), 0);
-  io_mgr.UnregisterContext(reader.get());
-}
-
-// IMPALA-2366: handle partial read where range goes past end of file.
-TEST_F(DiskIoMgrTest, PartialRead) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-  const char* data = "the quick brown fox jumped over the lazy dog";
-  int len = strlen(data);
-  int read_len = len + 1000; // Read past end of file.
-  CreateTempFile(tmp_file, data);
-
-  // Get mtime for file
-  struct stat stat_val;
-  stat(tmp_file, &stat_val);
-
-  scoped_ptr<DiskIoMgr> io_mgr(new DiskIoMgr(1, 1, 1, read_len, read_len));
-
-  ASSERT_OK(io_mgr->Init(&mem_tracker));
-  MemTracker reader_mem_tracker;
-  unique_ptr<DiskIoRequestContext> reader;
-  reader = io_mgr->RegisterContext(&reader_mem_tracker);
-
-  // We should not read past the end of file.
-  DiskIoMgr::ScanRange* range = InitRange(tmp_file, 0, read_len, 0, stat_val.st_mtime);
-  unique_ptr<DiskIoMgr::BufferDescriptor> buffer;
-  ASSERT_OK(io_mgr->Read(reader.get(), range, &buffer));
-  ASSERT_TRUE(buffer->eosr());
-  ASSERT_EQ(len, buffer->len());
-  ASSERT_TRUE(memcmp(buffer->buffer(), data, len) == 0);
-  io_mgr->ReturnBuffer(move(buffer));
-
-  io_mgr->UnregisterContext(reader.get());
-  pool_.Clear();
-  io_mgr.reset();
-  EXPECT_EQ(reader_mem_tracker.consumption(), 0);
-  EXPECT_EQ(mem_tracker.consumption(), 0);
-}
-
-// Test reading into a client-allocated buffer.
-TEST_F(DiskIoMgrTest, ReadIntoClientBuffer) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const char* tmp_file = "/tmp/disk_io_mgr_test.txt";
-  const char* data = "the quick brown fox jumped over the lazy dog";
-  int len = strlen(data);
-  int read_len = 4; // Make buffer size smaller than client-provided buffer.
-  CreateTempFile(tmp_file, data);
-
-  scoped_ptr<DiskIoMgr> io_mgr(new DiskIoMgr(1, 1, 1, read_len, read_len));
-
-  ASSERT_OK(io_mgr->Init(&mem_tracker));
-  // The reader doesn't need to provide a mem tracker if it's providing buffers.
-  MemTracker* reader_mem_tracker = nullptr;
-  unique_ptr<DiskIoRequestContext> reader;
-  reader = io_mgr->RegisterContext(reader_mem_tracker);
-
-  for (int buffer_len : vector<int>({len - 1, len, len + 1})) {
-    vector<uint8_t> client_buffer(buffer_len);
-    int scan_len = min(len, buffer_len);
-    DiskIoMgr::ScanRange* range = AllocateRange();
-    range->Reset(nullptr, tmp_file, scan_len, 0, 0, true,
-        DiskIoMgr::BufferOpts::ReadInto(client_buffer.data(), buffer_len));
-    ASSERT_OK(io_mgr->AddScanRange(reader.get(), range, true));
-
-    unique_ptr<DiskIoMgr::BufferDescriptor> io_buffer;
-    ASSERT_OK(range->GetNext(&io_buffer));
-    ASSERT_TRUE(io_buffer->eosr());
-    ASSERT_EQ(scan_len, io_buffer->len());
-    ASSERT_EQ(client_buffer.data(), io_buffer->buffer());
-    ASSERT_EQ(memcmp(io_buffer->buffer(), data, scan_len), 0);
-
-    // DiskIoMgr should not have allocated memory.
-    EXPECT_EQ(mem_tracker.consumption(), 0);
-    io_mgr->ReturnBuffer(move(io_buffer));
-  }
-
-  io_mgr->UnregisterContext(reader.get());
-  pool_.Clear();
-  io_mgr.reset();
-  EXPECT_EQ(mem_tracker.consumption(), 0);
-}
-
-// Test reading into a client-allocated buffer where the read fails.
-TEST_F(DiskIoMgrTest, ReadIntoClientBufferError) {
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const char* tmp_file = "/file/that/does/not/exist";
-  const int SCAN_LEN = 128;
-
-  scoped_ptr<DiskIoMgr> io_mgr(new DiskIoMgr(1, 1, 1, SCAN_LEN, SCAN_LEN));
-
-  ASSERT_OK(io_mgr->Init(&mem_tracker));
-  // The reader doesn't need to provide a mem tracker if it's providing buffers.
-  MemTracker* reader_mem_tracker = nullptr;
-  unique_ptr<DiskIoRequestContext> reader;
-  vector<uint8_t> client_buffer(SCAN_LEN);
-  for (int i = 0; i < 1000; ++i) {
-    reader = io_mgr->RegisterContext(reader_mem_tracker);
-    DiskIoMgr::ScanRange* range = AllocateRange();
-    range->Reset(nullptr, tmp_file, SCAN_LEN, 0, 0, true,
-        DiskIoMgr::BufferOpts::ReadInto(client_buffer.data(), SCAN_LEN));
-    ASSERT_OK(io_mgr->AddScanRange(reader.get(), range, true));
-
-    // Also test the cancellation path. Run multiple iterations since it is racy
-    // whether the read fails before the cancellation.
-    if (i >= 1) io_mgr->CancelContext(reader.get());
-
-    unique_ptr<DiskIoMgr::BufferDescriptor> io_buffer;
-    ASSERT_FALSE(range->GetNext(&io_buffer).ok());
-
-    // DiskIoMgr should not have allocated memory.
-    EXPECT_EQ(mem_tracker.consumption(), 0);
-
-    io_mgr->UnregisterContext(reader.get());
-  }
-
-  pool_.Clear();
-  io_mgr.reset();
-  EXPECT_EQ(mem_tracker.consumption(), 0);
-}
-
-// Test to verify configuration parameters for number of I/O threads per disk.
-TEST_F(DiskIoMgrTest, VerifyNumThreadsParameter) {
-  const int num_io_threads_for_remote_disks = FLAGS_num_remote_hdfs_io_threads
-      + FLAGS_num_s3_io_threads + FLAGS_num_adls_io_threads;
-
-  // Verify num_io_threads_per_rotational_disk and num_io_threads_per_solid_state_disk.
-  // Since we do not have control over which disk is used, we check for either type
-  // (rotational/solid state)
-  MemTracker mem_tracker(LARGE_MEM_LIMIT);
-  const int num_io_threads_per_rotational_or_ssd = 2;
-  DiskIoMgr io_mgr(1, num_io_threads_per_rotational_or_ssd,
-      num_io_threads_per_rotational_or_ssd, 1, 10);
-  ASSERT_OK(io_mgr.Init(&mem_tracker));
-  const int num_io_threads = io_mgr.disk_thread_group_.Size();
-  ASSERT_EQ(num_io_threads_per_rotational_or_ssd + num_io_threads_for_remote_disks,
-      num_io_threads);
-}
-}
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
-  return RUN_ALL_TESTS();
-}
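
All of the deleted tests above drive DiskIoMgr through the same life cycle:
register a context, queue scan ranges, pull ranges and buffers, return the
buffers, and unregister. The sketch below condenses that pattern. It is a
minimal illustration assuming the pre-refactor API exercised by these tests,
not code from the patch; 'ranges' stands in for scan ranges built the way the
tests build them with their InitRange() helper.

// Minimal sketch (not part of the patch) of the reader life cycle the tests
// above exercise, assuming the pre-refactor DiskIoMgr API shown in the
// deleted code.
Status ReadAllRanges(DiskIoMgr* io_mgr,
    const vector<DiskIoMgr::ScanRange*>& ranges) {
  MemTracker reader_mem_tracker;
  unique_ptr<DiskIoRequestContext> reader =
      io_mgr->RegisterContext(&reader_mem_tracker);
  RETURN_IF_ERROR(io_mgr->AddScanRanges(reader.get(), ranges));
  while (true) {
    DiskIoMgr::ScanRange* range = nullptr;
    // Blocks until a queued range is ready; sets 'range' to nullptr once all
    // queued ranges have been handed out.
    RETURN_IF_ERROR(io_mgr->GetNextRange(reader.get(), &range));
    if (range == nullptr) break;
    while (true) {
      unique_ptr<DiskIoMgr::BufferDescriptor> buffer;
      RETURN_IF_ERROR(range->GetNext(&buffer));
      if (buffer == nullptr) break;
      bool eosr = buffer->eosr();
      // ... consume buffer->buffer() / buffer->len() here ...
      // Return buffers promptly: hoarding them creates the memory pressure
      // that the MemLimits test above provokes deliberately.
      io_mgr->ReturnBuffer(move(buffer));
      if (eosr) break;
    }
  }
  // The reader's tracked consumption must be back to zero by this point.
  io_mgr->UnregisterContext(reader.get());
  return Status::OK();
}

For brevity the sketch leaks the registered context if an early
RETURN_IF_ERROR fires; the real tests unregister on every path.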



[05/16] incubator-impala git commit: IMPALA-4252: Min-max runtime filters for Kudu

Posted by ta...@apache.org.
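
The planner-test hunks below are mechanical renumberings: with a min-max
filter now generated alongside each bloom filter, runtime filter IDs appear
to be allocated in pairs, so RF001 becomes RF002, RF002 becomes RF004, RF003
becomes RF006, and so on. The producer (<-) and consumer (->) structure of
each plan is otherwise unchanged.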
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
index 2c866e3..55660cb 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view.test
@@ -336,7 +336,7 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.smallint_col = b.id
-|  |  runtime filters: RF001 <- b.id
+|  |  runtime filters: RF002 <- b.id
 |  |
 |  |--02:SCAN HDFS [functional.alltypessmall b]
 |  |     partitions=4/4 files=4 size=6.32KB
@@ -345,7 +345,7 @@ PLAN-ROOT SINK
 |  01:SCAN HDFS [functional.alltypesagg a]
 |     partitions=1/11 files=1 size=73.39KB
 |     predicates: a.int_col > 899
-|     runtime filters: RF001 -> a.smallint_col
+|     runtime filters: RF002 -> a.smallint_col
 |
 00:SCAN HDFS [functional.alltypessmall c]
    partitions=4/4 files=4 size=6.32KB
@@ -378,7 +378,7 @@ PLAN-ROOT SINK
 |  |
 |  03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  hash predicates: b.id = a.smallint_col
-|  |  runtime filters: RF001 <- a.smallint_col
+|  |  runtime filters: RF002 <- a.smallint_col
 |  |
 |  |--06:EXCHANGE [HASH(a.smallint_col)]
 |  |  |
@@ -391,7 +391,7 @@ PLAN-ROOT SINK
 |  02:SCAN HDFS [functional.alltypessmall b]
 |     partitions=4/4 files=4 size=6.32KB
 |     predicates: b.float_col > 4.5
-|     runtime filters: RF001 -> b.id
+|     runtime filters: RF002 -> b.id
 |
 07:EXCHANGE [HASH(c.id)]
 |
@@ -742,14 +742,14 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: a.smallint_col = b.id
-|  runtime filters: RF001 <- b.id
+|  runtime filters: RF002 <- b.id
 |
 |--02:SCAN HDFS [functional.alltypessmall b]
 |     partitions=4/4 files=4 size=6.32KB
 |
 01:SCAN HDFS [functional.alltypesagg a]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> a.tinyint_col, RF001 -> a.smallint_col
+   runtime filters: RF000 -> a.tinyint_col, RF002 -> a.smallint_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -778,7 +778,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.smallint_col = b.id
-|  runtime filters: RF001 <- b.id
+|  runtime filters: RF002 <- b.id
 |
 |--06:EXCHANGE [BROADCAST]
 |  |
@@ -787,7 +787,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [functional.alltypesagg a]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> a.tinyint_col, RF001 -> a.smallint_col
+   runtime filters: RF000 -> a.tinyint_col, RF002 -> a.smallint_col
 ====
 # complex join, having joined subquery on the lhs, and predicate
 # at multiple subquery level
@@ -823,7 +823,7 @@ PLAN-ROOT SINK
 |
 |--02:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.smallint_col = b.id
-|  |  runtime filters: RF001 <- b.id
+|  |  runtime filters: RF002 <- b.id
 |  |
 |  |--01:SCAN HDFS [functional.alltypessmall b]
 |  |     partitions=4/4 files=4 size=6.32KB
@@ -832,7 +832,7 @@ PLAN-ROOT SINK
 |  00:SCAN HDFS [functional.alltypesagg a]
 |     partitions=1/11 files=1 size=73.39KB
 |     predicates: a.int_col > 899
-|     runtime filters: RF001 -> a.smallint_col
+|     runtime filters: RF002 -> a.smallint_col
 |
 03:SCAN HDFS [functional.alltypessmall c]
    partitions=4/4 files=4 size=6.32KB
@@ -852,7 +852,7 @@ PLAN-ROOT SINK
 |  |
 |  02:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  hash predicates: b.id = a.smallint_col
-|  |  runtime filters: RF001 <- a.smallint_col
+|  |  runtime filters: RF002 <- a.smallint_col
 |  |
 |  |--06:EXCHANGE [HASH(a.smallint_col)]
 |  |  |
@@ -865,7 +865,7 @@ PLAN-ROOT SINK
 |  01:SCAN HDFS [functional.alltypessmall b]
 |     partitions=4/4 files=4 size=6.32KB
 |     predicates: b.float_col > 4.5
-|     runtime filters: RF001 -> b.id
+|     runtime filters: RF002 -> b.id
 |
 07:EXCHANGE [HASH(c.id)]
 |
@@ -902,14 +902,14 @@ PLAN-ROOT SINK
 |  |
 |  03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.smallint_col = b.id
-|  |  runtime filters: RF001 <- b.id
+|  |  runtime filters: RF002 <- b.id
 |  |
 |  |--02:SCAN HDFS [functional.alltypessmall b]
 |  |     partitions=4/4 files=4 size=6.32KB
 |  |
 |  01:SCAN HDFS [functional.alltypesagg a]
 |     partitions=11/11 files=11 size=814.73KB
-|     runtime filters: RF001 -> a.smallint_col
+|     runtime filters: RF002 -> a.smallint_col
 |
 00:SCAN HDFS [functional.alltypessmall c]
    partitions=4/4 files=4 size=6.32KB
@@ -947,7 +947,7 @@ PLAN-ROOT SINK
 |  |
 |  03:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: a.smallint_col = b.id
-|  |  runtime filters: RF001 <- b.id
+|  |  runtime filters: RF002 <- b.id
 |  |
 |  |--07:EXCHANGE [BROADCAST]
 |  |  |
@@ -956,7 +956,7 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [functional.alltypesagg a]
 |     partitions=11/11 files=11 size=814.73KB
-|     runtime filters: RF001 -> a.smallint_col
+|     runtime filters: RF002 -> a.smallint_col
 |
 00:SCAN HDFS [functional.alltypessmall c]
    partitions=4/4 files=4 size=6.32KB
@@ -1103,14 +1103,14 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: x.id = a
-|  runtime filters: RF001 <- a
+|  runtime filters: RF002 <- a
 |
 |--01:UNION
 |     constant-operands=2
 |
 00:SCAN HDFS [functional.alltypessmall x]
    partitions=4/4 files=4 size=6.32KB
-   runtime filters: RF000 -> x.id + 2, RF001 -> x.id
+   runtime filters: RF000 -> x.id + 2, RF002 -> x.id
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1127,7 +1127,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [LEFT SEMI JOIN, BROADCAST]
 |  hash predicates: x.id = a
-|  runtime filters: RF001 <- a
+|  runtime filters: RF002 <- a
 |
 |--05:EXCHANGE [BROADCAST]
 |  |
@@ -1136,7 +1136,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypessmall x]
    partitions=4/4 files=4 size=6.32KB
-   runtime filters: RF000 -> x.id + 2, RF001 -> x.id
+   runtime filters: RF000 -> x.id + 2, RF002 -> x.id
 ====
 # Tests that views correctly reanalyze cloned exprs. (IMPALA-984)
 select b.* from functional.decimal_tbl a left outer join
@@ -1224,7 +1224,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = b.id
-|  runtime filters: RF001 <- b.id
+|  runtime filters: RF002 <- b.id
 |
 |--01:SCAN HDFS [functional.alltypes b]
 |     partitions=24/24 files=24 size=478.45KB
@@ -1232,7 +1232,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> a.id, RF001 -> a.id
+   runtime filters: RF000 -> a.id, RF002 -> a.id
 ====
 # IMPALA-2665: Test correct assignment of On-clause predicate from an enclosing block
 # inside an inline view with an outer join.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test b/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test
index a525f91..3330b2e 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/join-order.test
@@ -42,7 +42,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: l.l_orderkey = o.o_orderkey
-|  runtime filters: RF001 <- o.o_orderkey
+|  runtime filters: RF002 <- o.o_orderkey
 |
 |--01:SCAN HDFS [tpch.orders o]
 |     partitions=1/1 files=1 size=162.56MB
@@ -52,7 +52,7 @@ PLAN-ROOT SINK
 02:SCAN HDFS [tpch.lineitem l]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate > '1995-03-15'
-   runtime filters: RF001 -> l.l_orderkey
+   runtime filters: RF002 -> l.l_orderkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -85,7 +85,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l.l_orderkey = o.o_orderkey
-|  runtime filters: RF001 <- o.o_orderkey
+|  runtime filters: RF002 <- o.o_orderkey
 |
 |--07:EXCHANGE [BROADCAST]
 |  |
@@ -97,7 +97,7 @@ PLAN-ROOT SINK
 02:SCAN HDFS [tpch.lineitem l]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate > '1995-03-15'
-   runtime filters: RF001 -> l.l_orderkey
+   runtime filters: RF002 -> l.l_orderkey
 ====
 # Q3 - Shipping Priority Query
 # straight_join prevents join order optimization
@@ -143,7 +143,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: c.c_custkey = o.o_custkey
-|  runtime filters: RF001 <- o.o_custkey
+|  runtime filters: RF002 <- o.o_custkey
 |
 |--01:SCAN HDFS [tpch.orders o]
 |     partitions=1/1 files=1 size=162.56MB
@@ -153,7 +153,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [tpch.customer c]
    partitions=1/1 files=1 size=23.08MB
    predicates: c.c_mktsegment = 'BUILDING'
-   runtime filters: RF001 -> c.c_custkey
+   runtime filters: RF002 -> c.c_custkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -186,7 +186,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c.c_custkey = o.o_custkey
-|  runtime filters: RF001 <- o.o_custkey
+|  runtime filters: RF002 <- o.o_custkey
 |
 |--07:EXCHANGE [BROADCAST]
 |  |
@@ -198,7 +198,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [tpch.customer c]
    partitions=1/1 files=1 size=23.08MB
    predicates: c.c_mktsegment = 'BUILDING'
-   runtime filters: RF001 -> c.c_custkey
+   runtime filters: RF002 -> c.c_custkey
 ====
 # Q5 - Local Supplier Volume Query
 # Modifications: Added round() call, converted selects from multiple tables
@@ -246,7 +246,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF001 <- n_nationkey
+|  runtime filters: RF002 <- n_nationkey
 |
 |--04:SCAN HDFS [tpch.nation]
 |     partitions=1/1 files=1 size=2.15KB
@@ -254,32 +254,32 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: c_nationkey = s_nationkey, l_suppkey = s_suppkey
-|  runtime filters: RF002 <- s_nationkey, RF003 <- s_suppkey
+|  runtime filters: RF004 <- s_nationkey, RF005 <- s_suppkey
 |
 |--03:SCAN HDFS [tpch.supplier s]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF004 <- c_custkey
+|  runtime filters: RF008 <- c_custkey
 |
 |--00:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
-|     runtime filters: RF001 -> tpch.customer.c_nationkey, RF002 -> c_nationkey
+|     runtime filters: RF002 -> tpch.customer.c_nationkey, RF004 -> c_nationkey
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF005 <- o_orderkey
+|  runtime filters: RF010 <- o_orderkey
 |
 |--01:SCAN HDFS [tpch.orders o]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate < '1995-01-01', o_orderdate >= '1994-01-01'
-|     runtime filters: RF004 -> o_custkey
+|     runtime filters: RF008 -> o_custkey
 |
 02:SCAN HDFS [tpch.lineitem l]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF003 -> l_suppkey, RF005 -> l_orderkey
+   runtime filters: RF005 -> l_suppkey, RF010 -> l_orderkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -312,7 +312,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF001 <- n_nationkey
+|  runtime filters: RF002 <- n_nationkey
 |
 |--16:EXCHANGE [BROADCAST]
 |  |
@@ -322,38 +322,38 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c_nationkey = s_nationkey, l_suppkey = s_suppkey
-|  runtime filters: RF002 <- s_nationkey, RF003 <- s_suppkey
+|  runtime filters: RF004 <- s_nationkey, RF005 <- s_suppkey
 |
 |--15:EXCHANGE [BROADCAST]
 |  |
 |  03:SCAN HDFS [tpch.supplier s]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF004 <- c_custkey
+|  runtime filters: RF008 <- c_custkey
 |
 |--14:EXCHANGE [BROADCAST]
 |  |
 |  00:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
-|     runtime filters: RF001 -> tpch.customer.c_nationkey, RF002 -> c_nationkey
+|     runtime filters: RF002 -> tpch.customer.c_nationkey, RF004 -> c_nationkey
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF005 <- o_orderkey
+|  runtime filters: RF010 <- o_orderkey
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
 |  01:SCAN HDFS [tpch.orders o]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate < '1995-01-01', o_orderdate >= '1994-01-01'
-|     runtime filters: RF004 -> o_custkey
+|     runtime filters: RF008 -> o_custkey
 |
 02:SCAN HDFS [tpch.lineitem l]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF003 -> l_suppkey, RF005 -> l_orderkey
+   runtime filters: RF005 -> l_suppkey, RF010 -> l_orderkey
 ====
 # Q2 - Minimum Cost Supplier Query
 select
@@ -393,7 +393,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: s.s_nationkey = n.n_nationkey
-|  runtime filters: RF001 <- n.n_nationkey
+|  runtime filters: RF002 <- n.n_nationkey
 |
 |--03:SCAN HDFS [tpch.nation n]
 |     partitions=1/1 files=1 size=2.15KB
@@ -401,11 +401,11 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: s.s_suppkey = ps.ps_suppkey
-|  runtime filters: RF002 <- ps.ps_suppkey
+|  runtime filters: RF004 <- ps.ps_suppkey
 |
 |--05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: ps.ps_partkey = p.p_partkey
-|  |  runtime filters: RF003 <- p.p_partkey
+|  |  runtime filters: RF006 <- p.p_partkey
 |  |
 |  |--00:SCAN HDFS [tpch.part p]
 |  |     partitions=1/1 files=1 size=22.83MB
@@ -413,11 +413,11 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpch.partsupp ps]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF003 -> ps.ps_partkey
+|     runtime filters: RF006 -> ps.ps_partkey
 |
 01:SCAN HDFS [tpch.supplier s]
    partitions=1/1 files=1 size=1.33MB
-   runtime filters: RF001 -> s.s_nationkey, RF002 -> s.s_suppkey
+   runtime filters: RF002 -> s.s_nationkey, RF004 -> s.s_suppkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -435,7 +435,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s.s_nationkey = n.n_nationkey
-|  runtime filters: RF001 <- n.n_nationkey
+|  runtime filters: RF002 <- n.n_nationkey
 |
 |--11:EXCHANGE [BROADCAST]
 |  |
@@ -445,13 +445,13 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s.s_suppkey = ps.ps_suppkey
-|  runtime filters: RF002 <- ps.ps_suppkey
+|  runtime filters: RF004 <- ps.ps_suppkey
 |
 |--10:EXCHANGE [BROADCAST]
 |  |
 |  05:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: ps.ps_partkey = p.p_partkey
-|  |  runtime filters: RF003 <- p.p_partkey
+|  |  runtime filters: RF006 <- p.p_partkey
 |  |
 |  |--09:EXCHANGE [BROADCAST]
 |  |  |
@@ -461,11 +461,11 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpch.partsupp ps]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF003 -> ps.ps_partkey
+|     runtime filters: RF006 -> ps.ps_partkey
 |
 01:SCAN HDFS [tpch.supplier s]
    partitions=1/1 files=1 size=1.33MB
-   runtime filters: RF001 -> s.s_nationkey, RF002 -> s.s_suppkey
+   runtime filters: RF002 -> s.s_nationkey, RF004 -> s.s_suppkey
 ====
 # Q4 - Order Priority Checking Query
 # the largest input is prevented from becoming the leftmost input by the semi-join
@@ -684,7 +684,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [RIGHT OUTER JOIN]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002 <- c_custkey
 |
 |--00:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
@@ -692,7 +692,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpch.orders]
    partitions=1/1 files=1 size=162.56MB
-   runtime filters: RF001 -> o_custkey
+   runtime filters: RF002 -> o_custkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -716,7 +716,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002 <- c_custkey
 |
 |--07:EXCHANGE [HASH(c_custkey)]
 |  |
@@ -728,7 +728,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpch.orders]
    partitions=1/1 files=1 size=162.56MB
-   runtime filters: RF001 -> o_custkey
+   runtime filters: RF002 -> o_custkey
 ====
 # order does not become the leftmost input because of the cross join;
 # the join with nation is done first because it reduces the intermediate output
@@ -835,7 +835,7 @@ PLAN-ROOT SINK
 |
 |--09:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t4.id = t6.id
-|  |  runtime filters: RF001 <- t6.id
+|  |  runtime filters: RF002 <- t6.id
 |  |
 |  |--05:SCAN HDFS [functional.alltypestiny t6]
 |  |     partitions=4/4 files=4 size=460B
@@ -845,27 +845,27 @@ PLAN-ROOT SINK
 |  |
 |  |--07:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: t3.id = t2.id
-|  |  |  runtime filters: RF002 <- t2.id
+|  |  |  runtime filters: RF004 <- t2.id
 |  |  |
 |  |  |--06:HASH JOIN [INNER JOIN]
 |  |  |  |  hash predicates: t2.id = t1.id
-|  |  |  |  runtime filters: RF003 <- t1.id
+|  |  |  |  runtime filters: RF006 <- t1.id
 |  |  |  |
 |  |  |  |--00:SCAN HDFS [functional.alltypestiny t1]
 |  |  |  |     partitions=4/4 files=4 size=460B
-|  |  |  |     runtime filters: RF001 -> t1.id
+|  |  |  |     runtime filters: RF002 -> t1.id
 |  |  |  |
 |  |  |  01:SCAN HDFS [functional.alltypes t2]
 |  |  |     partitions=24/24 files=24 size=478.45KB
-|  |  |     runtime filters: RF001 -> t2.id, RF003 -> t2.id
+|  |  |     runtime filters: RF002 -> t2.id, RF006 -> t2.id
 |  |  |
 |  |  02:SCAN HDFS [functional.alltypessmall t3]
 |  |     partitions=4/4 files=4 size=6.32KB
-|  |     runtime filters: RF001 -> t3.id, RF002 -> t3.id
+|  |     runtime filters: RF002 -> t3.id, RF004 -> t3.id
 |  |
 |  03:SCAN HDFS [functional.alltypesagg t4]
 |     partitions=11/11 files=11 size=814.73KB
-|     runtime filters: RF001 -> t4.id
+|     runtime filters: RF002 -> t4.id
 |
 04:SCAN HDFS [functional.alltypes t5]
    partitions=24/24 files=24 size=478.45KB
@@ -895,30 +895,30 @@ PLAN-ROOT SINK
 |  |
 |  |--10:HASH JOIN [RIGHT SEMI JOIN]
 |  |  |  hash predicates: t4.id = t3.id
-|  |  |  runtime filters: RF001 <- t3.id
+|  |  |  runtime filters: RF002 <- t3.id
 |  |  |
 |  |  |--09:HASH JOIN [INNER JOIN]
 |  |  |  |  hash predicates: t3.id = t2.id
-|  |  |  |  runtime filters: RF002 <- t2.id
+|  |  |  |  runtime filters: RF004 <- t2.id
 |  |  |  |
 |  |  |  |--08:HASH JOIN [RIGHT OUTER JOIN]
 |  |  |  |  |  hash predicates: t2.id = t1.id
-|  |  |  |  |  runtime filters: RF003 <- t1.id
+|  |  |  |  |  runtime filters: RF006 <- t1.id
 |  |  |  |  |
 |  |  |  |  |--00:SCAN HDFS [functional.alltypestiny t1]
 |  |  |  |  |     partitions=4/4 files=4 size=460B
 |  |  |  |  |
 |  |  |  |  01:SCAN HDFS [functional.alltypes t2]
 |  |  |  |     partitions=24/24 files=24 size=478.45KB
-|  |  |  |     runtime filters: RF003 -> t2.id
+|  |  |  |     runtime filters: RF006 -> t2.id
 |  |  |  |
 |  |  |  02:SCAN HDFS [functional.alltypessmall t3]
 |  |  |     partitions=4/4 files=4 size=6.32KB
-|  |  |     runtime filters: RF002 -> t3.id
+|  |  |     runtime filters: RF004 -> t3.id
 |  |  |
 |  |  03:SCAN HDFS [functional.alltypesagg t4]
 |  |     partitions=11/11 files=11 size=814.73KB
-|  |     runtime filters: RF001 -> t4.id
+|  |     runtime filters: RF002 -> t4.id
 |  |
 |  04:SCAN HDFS [functional.alltypes t5]
 |     partitions=24/24 files=24 size=478.45KB
@@ -948,38 +948,38 @@ PLAN-ROOT SINK
 |
 |--11:HASH JOIN [RIGHT SEMI JOIN]
 |  |  hash predicates: t5.id = t4.id
-|  |  runtime filters: RF001 <- t4.id
+|  |  runtime filters: RF002 <- t4.id
 |  |
 |  |--10:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: t3.id = t2.id
-|  |  |  runtime filters: RF002 <- t2.id
+|  |  |  runtime filters: RF004 <- t2.id
 |  |  |
 |  |  |--09:HASH JOIN [INNER JOIN]
 |  |  |  |  hash predicates: t4.id = t2.id
-|  |  |  |  runtime filters: RF003 <- t2.id
+|  |  |  |  runtime filters: RF006 <- t2.id
 |  |  |  |
 |  |  |  |--08:HASH JOIN [RIGHT OUTER JOIN]
 |  |  |  |  |  hash predicates: t2.id = t1.id
-|  |  |  |  |  runtime filters: RF004 <- t1.id
+|  |  |  |  |  runtime filters: RF008 <- t1.id
 |  |  |  |  |
 |  |  |  |  |--00:SCAN HDFS [functional.alltypestiny t1]
 |  |  |  |  |     partitions=4/4 files=4 size=460B
 |  |  |  |  |
 |  |  |  |  01:SCAN HDFS [functional.alltypes t2]
 |  |  |  |     partitions=24/24 files=24 size=478.45KB
-|  |  |  |     runtime filters: RF004 -> t2.id
+|  |  |  |     runtime filters: RF008 -> t2.id
 |  |  |  |
 |  |  |  03:SCAN HDFS [functional.alltypessmall t4]
 |  |  |     partitions=4/4 files=4 size=6.32KB
-|  |  |     runtime filters: RF003 -> t4.id
+|  |  |     runtime filters: RF006 -> t4.id
 |  |  |
 |  |  02:SCAN HDFS [functional.alltypesagg t3]
 |  |     partitions=11/11 files=11 size=814.73KB
-|  |     runtime filters: RF002 -> t3.id
+|  |     runtime filters: RF004 -> t3.id
 |  |
 |  04:SCAN HDFS [functional.alltypes t5]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF001 -> t5.id
+|     runtime filters: RF002 -> t5.id
 |
 05:SCAN HDFS [functional.alltypestiny t6]
    partitions=4/4 files=4 size=460B
@@ -1008,30 +1008,30 @@ PLAN-ROOT SINK
 |  |
 |  |--10:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: t3.id = t2.id
-|  |  |  runtime filters: RF001 <- t2.id
+|  |  |  runtime filters: RF002 <- t2.id
 |  |  |
 |  |  |--09:HASH JOIN [INNER JOIN]
 |  |  |  |  hash predicates: t4.id = t2.id
-|  |  |  |  runtime filters: RF002 <- t2.id
+|  |  |  |  runtime filters: RF004 <- t2.id
 |  |  |  |
 |  |  |  |--08:HASH JOIN [RIGHT OUTER JOIN]
 |  |  |  |  |  hash predicates: t2.id = t1.id
-|  |  |  |  |  runtime filters: RF003 <- t1.id
+|  |  |  |  |  runtime filters: RF006 <- t1.id
 |  |  |  |  |
 |  |  |  |  |--00:SCAN HDFS [functional.alltypestiny t1]
 |  |  |  |  |     partitions=4/4 files=4 size=460B
 |  |  |  |  |
 |  |  |  |  01:SCAN HDFS [functional.alltypes t2]
 |  |  |  |     partitions=24/24 files=24 size=478.45KB
-|  |  |  |     runtime filters: RF003 -> t2.id
+|  |  |  |     runtime filters: RF006 -> t2.id
 |  |  |  |
 |  |  |  03:SCAN HDFS [functional.alltypessmall t4]
 |  |  |     partitions=4/4 files=4 size=6.32KB
-|  |  |     runtime filters: RF002 -> t4.id
+|  |  |     runtime filters: RF004 -> t4.id
 |  |  |
 |  |  02:SCAN HDFS [functional.alltypesagg t3]
 |  |     partitions=11/11 files=11 size=814.73KB
-|  |     runtime filters: RF001 -> t3.id
+|  |     runtime filters: RF002 -> t3.id
 |  |
 |  04:SCAN HDFS [functional.alltypes t5]
 |     partitions=24/24 files=24 size=478.45KB
@@ -1067,11 +1067,11 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [RIGHT OUTER JOIN]
 |  hash predicates: t3.id = a.id
-|  runtime filters: RF001 <- a.id
+|  runtime filters: RF002 <- a.id
 |
 |--10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.id = count(a.id)
-|  |  runtime filters: RF002 <- count(a.id)
+|  |  runtime filters: RF004 <- count(a.id)
 |  |
 |  |--04:AGGREGATE [FINALIZE]
 |  |  |  output: count(a.id)
@@ -1081,14 +1081,14 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: a.id = b.id
-|  |  |  runtime filters: RF003 <- b.id
+|  |  |  runtime filters: RF006 <- b.id
 |  |  |
 |  |  |--01:SCAN HDFS [functional.alltypestiny b]
 |  |  |     partitions=4/4 files=4 size=460B
 |  |  |
 |  |  00:SCAN HDFS [functional.alltypestiny a]
 |  |     partitions=4/4 files=4 size=460B
-|  |     runtime filters: RF003 -> a.id
+|  |     runtime filters: RF006 -> a.id
 |  |
 |  07:HASH JOIN [LEFT OUTER JOIN]
 |  |  hash predicates: b.id = a.id
@@ -1096,14 +1096,14 @@ PLAN-ROOT SINK
 |  |
 |  |--05:SCAN HDFS [functional.alltypes a]
 |  |     partitions=0/24 files=0 size=0B
-|  |     runtime filters: RF002 -> a.id
+|  |     runtime filters: RF004 -> a.id
 |  |
 |  06:SCAN HDFS [functional.alltypestiny b]
 |     partitions=4/4 files=4 size=460B
 |
 08:SCAN HDFS [functional.alltypes t3]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> t3.id, RF001 -> t3.id
+   runtime filters: RF000 -> t3.id, RF002 -> t3.id
 ====
 # Same as above but with full outer joins.
 select 1 from
@@ -1135,7 +1135,7 @@ PLAN-ROOT SINK
 |
 |--10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.id = count(a.id)
-|  |  runtime filters: RF001 <- count(a.id)
+|  |  runtime filters: RF002 <- count(a.id)
 |  |
 |  |--04:AGGREGATE [FINALIZE]
 |  |  |  output: count(a.id)
@@ -1145,14 +1145,14 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: a.id = b.id
-|  |  |  runtime filters: RF002 <- b.id
+|  |  |  runtime filters: RF004 <- b.id
 |  |  |
 |  |  |--01:SCAN HDFS [functional.alltypestiny b]
 |  |  |     partitions=4/4 files=4 size=460B
 |  |  |
 |  |  00:SCAN HDFS [functional.alltypestiny a]
 |  |     partitions=4/4 files=4 size=460B
-|  |     runtime filters: RF002 -> a.id
+|  |     runtime filters: RF004 -> a.id
 |  |
 |  07:HASH JOIN [FULL OUTER JOIN]
 |  |  hash predicates: b.id = a.id
@@ -1160,7 +1160,7 @@ PLAN-ROOT SINK
 |  |
 |  |--05:SCAN HDFS [functional.alltypes a]
 |  |     partitions=0/24 files=0 size=0B
-|  |     runtime filters: RF001 -> a.id
+|  |     runtime filters: RF002 -> a.id
 |  |
 |  06:SCAN HDFS [functional.alltypestiny b]
 |     partitions=4/4 files=4 size=460B
@@ -1196,11 +1196,11 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [RIGHT SEMI JOIN]
 |  hash predicates: t3.id = b.id
-|  runtime filters: RF001 <- b.id
+|  runtime filters: RF002 <- b.id
 |
 |--10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: b.id = count(a.id)
-|  |  runtime filters: RF002 <- count(a.id)
+|  |  runtime filters: RF004 <- count(a.id)
 |  |
 |  |--04:AGGREGATE [FINALIZE]
 |  |  |  output: count(a.id)
@@ -1210,30 +1210,30 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: a.id = b.id
-|  |  |  runtime filters: RF004 <- b.id
+|  |  |  runtime filters: RF008 <- b.id
 |  |  |
 |  |  |--01:SCAN HDFS [functional.alltypestiny b]
 |  |  |     partitions=4/4 files=4 size=460B
 |  |  |
 |  |  00:SCAN HDFS [functional.alltypestiny a]
 |  |     partitions=4/4 files=4 size=460B
-|  |     runtime filters: RF004 -> a.id
+|  |     runtime filters: RF008 -> a.id
 |  |
 |  07:HASH JOIN [LEFT SEMI JOIN]
 |  |  hash predicates: b.id = a.id
-|  |  runtime filters: RF003 <- a.id
+|  |  runtime filters: RF006 <- a.id
 |  |
 |  |--05:SCAN HDFS [functional.alltypestiny a]
 |  |     partitions=4/4 files=4 size=460B
-|  |     runtime filters: RF000 -> a.id, RF002 -> a.id
+|  |     runtime filters: RF000 -> a.id, RF004 -> a.id
 |  |
 |  06:SCAN HDFS [functional.alltypes b]
 |     partitions=2/24 files=2 size=40.32KB
-|     runtime filters: RF000 -> b.id, RF002 -> b.id, RF003 -> b.id
+|     runtime filters: RF000 -> b.id, RF004 -> b.id, RF006 -> b.id
 |
 08:SCAN HDFS [functional.alltypes t3]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> t3.id, RF001 -> t3.id
+   runtime filters: RF000 -> t3.id, RF002 -> t3.id
 ====
 # Same as above but with anti joins.
 select 1 from
@@ -1265,7 +1265,7 @@ PLAN-ROOT SINK
 |
 |--10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: b.id = count(a.id)
-|  |  runtime filters: RF001 <- count(a.id)
+|  |  runtime filters: RF002 <- count(a.id)
 |  |
 |  |--04:AGGREGATE [FINALIZE]
 |  |  |  output: count(a.id)
@@ -1275,25 +1275,25 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: a.id = b.id
-|  |  |  runtime filters: RF002 <- b.id
+|  |  |  runtime filters: RF004 <- b.id
 |  |  |
 |  |  |--01:SCAN HDFS [functional.alltypestiny b]
 |  |  |     partitions=4/4 files=4 size=460B
 |  |  |
 |  |  00:SCAN HDFS [functional.alltypestiny a]
 |  |     partitions=4/4 files=4 size=460B
-|  |     runtime filters: RF002 -> a.id
+|  |     runtime filters: RF004 -> a.id
 |  |
 |  07:HASH JOIN [LEFT ANTI JOIN]
 |  |  hash predicates: b.id = a.id
 |  |
 |  |--05:SCAN HDFS [functional.alltypestiny a]
 |  |     partitions=4/4 files=4 size=460B
-|  |     runtime filters: RF000 -> a.id, RF001 -> a.id
+|  |     runtime filters: RF000 -> a.id, RF002 -> a.id
 |  |
 |  06:SCAN HDFS [functional.alltypes b]
 |     partitions=2/24 files=2 size=40.32KB
-|     runtime filters: RF000 -> b.id, RF001 -> b.id
+|     runtime filters: RF000 -> b.id, RF002 -> b.id
 |
 08:SCAN HDFS [functional.alltypes t3]
    partitions=24/24 files=24 size=478.45KB
@@ -1342,7 +1342,7 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t1.bigint_col = t2.smallint_col
-|  |  runtime filters: RF001 <- t2.smallint_col
+|  |  runtime filters: RF002 <- t2.smallint_col
 |  |  limit: 1
 |  |
 |  |--02:SCAN HDFS [functional.alltypestiny t2]
@@ -1350,7 +1350,7 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [functional.alltypes t1]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF001 -> t1.bigint_col
+|     runtime filters: RF002 -> t1.bigint_col
 |
 00:SCAN HDFS [functional.alltypestiny t4]
    partitions=4/4 files=4 size=460B
@@ -1398,14 +1398,14 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [RIGHT OUTER JOIN]
 |  |  hash predicates: b.id = a.id
-|  |  runtime filters: RF001 <- a.id
+|  |  runtime filters: RF002 <- a.id
 |  |
 |  |--00:SCAN HDFS [functional.alltypestiny a]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  01:SCAN HDFS [functional.alltypessmall b]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF001 -> b.id
+|     runtime filters: RF002 -> b.id
 |
 02:SCAN HDFS [functional.alltypes c]
    partitions=24/24 files=24 size=478.45KB
@@ -1430,14 +1430,14 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: b.id = a.id
-|  |  runtime filters: RF001 <- a.id
+|  |  runtime filters: RF002 <- a.id
 |  |
 |  |--00:SCAN HDFS [functional.alltypestiny a]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  01:SCAN HDFS [functional.alltypessmall b]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF001 -> b.id
+|     runtime filters: RF002 -> b.id
 |
 02:SCAN HDFS [functional.alltypes c]
    partitions=24/24 files=24 size=478.45KB

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
index 6c42545..c85a5d1 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test
@@ -86,7 +86,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: functional.alltypestiny.id = functional.alltypestiny.id
-|  runtime filters: RF001 <- functional.alltypestiny.id
+|  runtime filters: RF002 <- functional.alltypestiny.id
 |
 |--01:SCAN HDFS [functional.alltypestiny]
 |     partitions=4/4 files=4 size=460B
@@ -94,7 +94,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> coalesce(functional.alltypestiny.id, functional.alltypestiny.id), RF001 -> functional.alltypestiny.id
+   runtime filters: RF000 -> coalesce(functional.alltypestiny.id, functional.alltypestiny.id), RF002 -> functional.alltypestiny.id
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -111,7 +111,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: functional.alltypestiny.id = functional.alltypestiny.id
-|  runtime filters: RF001 <- functional.alltypestiny.id
+|  runtime filters: RF002 <- functional.alltypestiny.id
 |
 |--05:EXCHANGE [BROADCAST]
 |  |
@@ -121,7 +121,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> coalesce(functional.alltypestiny.id, functional.alltypestiny.id), RF001 -> functional.alltypestiny.id
+   runtime filters: RF000 -> coalesce(functional.alltypestiny.id, functional.alltypestiny.id), RF002 -> functional.alltypestiny.id
 ====
 # multiple join predicates;
 # scan predicates get propagated correctly;
@@ -450,22 +450,22 @@ PLAN-ROOT SINK
 |
 |--05:HASH JOIN [RIGHT OUTER JOIN]
 |  |  hash predicates: b.id = c.id
-|  |  runtime filters: RF001 <- c.id
+|  |  runtime filters: RF002 <- c.id
 |  |
 |  |--02:SCAN HDFS [functional.alltypesnopart c]
 |  |     partitions=1/1 files=0 size=0B
 |  |
 |  04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.id = b.id, a.int_col = b.int_col
-|  |  runtime filters: RF002 <- b.id, RF003 <- b.int_col
+|  |  runtime filters: RF004 <- b.id, RF005 <- b.int_col
 |  |
 |  |--01:SCAN HDFS [functional.alltypessmall b]
 |  |     partitions=4/4 files=4 size=6.32KB
-|  |     runtime filters: RF001 -> b.id
+|  |     runtime filters: RF002 -> b.id
 |  |
 |  00:SCAN HDFS [functional.alltypesagg a]
 |     partitions=11/11 files=11 size=814.73KB
-|     runtime filters: RF001 -> a.id, RF002 -> a.id, RF003 -> a.int_col
+|     runtime filters: RF002 -> a.id, RF004 -> a.id, RF005 -> a.int_col
 |
 03:SCAN HDFS [functional.alltypesagg d]
    partitions=11/11 files=11 size=814.73KB
@@ -498,7 +498,7 @@ PLAN-ROOT SINK
 |  |
 |  05:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
 |  |  hash predicates: b.id = c.id
-|  |  runtime filters: RF001 <- c.id
+|  |  runtime filters: RF002 <- c.id
 |  |
 |  |--13:EXCHANGE [HASH(c.id)]
 |  |  |
@@ -509,17 +509,17 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: a.id = b.id, a.int_col = b.int_col
-|  |  runtime filters: RF002 <- b.id, RF003 <- b.int_col
+|  |  runtime filters: RF004 <- b.id, RF005 <- b.int_col
 |  |
 |  |--11:EXCHANGE [BROADCAST]
 |  |  |
 |  |  01:SCAN HDFS [functional.alltypessmall b]
 |  |     partitions=4/4 files=4 size=6.32KB
-|  |     runtime filters: RF001 -> b.id
+|  |     runtime filters: RF002 -> b.id
 |  |
 |  00:SCAN HDFS [functional.alltypesagg a]
 |     partitions=11/11 files=11 size=814.73KB
-|     runtime filters: RF001 -> a.id, RF002 -> a.id, RF003 -> a.int_col
+|     runtime filters: RF002 -> a.id, RF004 -> a.id, RF005 -> a.int_col
 |
 03:SCAN HDFS [functional.alltypesagg d]
    partitions=11/11 files=11 size=814.73KB
@@ -580,7 +580,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.id = t2.id
-|  runtime filters: RF001 <- t2.id
+|  runtime filters: RF002 <- t2.id
 |
 |--01:SCAN HDFS [functional.testtbl t2]
 |     partitions=1/1 files=0 size=0B
@@ -588,7 +588,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.testtbl t1]
    partitions=1/1 files=0 size=0B
-   runtime filters: RF000 -> t1.id, RF001 -> t1.id
+   runtime filters: RF000 -> t1.id, RF002 -> t1.id
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -605,7 +605,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: t1.id = t2.id
-|  runtime filters: RF001 <- t2.id
+|  runtime filters: RF002 <- t2.id
 |
 |--05:EXCHANGE [BROADCAST]
 |  |
@@ -615,7 +615,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.testtbl t1]
    partitions=1/1 files=0 size=0B
-   runtime filters: RF000 -> t1.id, RF001 -> t1.id
+   runtime filters: RF000 -> t1.id, RF002 -> t1.id
 ====
 # join involving a table with no table stats (functional.emptytable)
 # tests that the default join strategy is broadcast
@@ -729,7 +729,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = b.id, a.int_col = b.int_col
-|  runtime filters: RF002 <- b.id, RF003 <- b.int_col
+|  runtime filters: RF004 <- b.id, RF005 <- b.int_col
 |
 |--01:SCAN HDFS [functional.alltypes b]
 |     partitions=24/24 files=24 size=478.45KB
@@ -737,7 +737,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> a.id, RF001 -> a.int_col, RF002 -> a.id, RF003 -> a.int_col
+   runtime filters: RF000 -> a.id, RF001 -> a.int_col, RF004 -> a.id, RF005 -> a.int_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -754,7 +754,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: a.id = b.id, a.int_col = b.int_col
-|  runtime filters: RF002 <- b.id, RF003 <- b.int_col
+|  runtime filters: RF004 <- b.id, RF005 <- b.int_col
 |
 |--06:EXCHANGE [HASH(b.id,b.int_col)]
 |  |
@@ -766,7 +766,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> a.id, RF001 -> a.int_col, RF002 -> a.id, RF003 -> a.int_col
+   runtime filters: RF000 -> a.id, RF001 -> a.int_col, RF004 -> a.id, RF005 -> a.int_col
 ====
 # Tests that the partitioned join between a and b exploits the existing
 # data partition of its rhs input.
@@ -841,7 +841,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: a.bool_col = bool_col, a.int_col = int_col
-|  runtime filters: RF002 <- bool_col, RF003 <- int_col
+|  runtime filters: RF004 <- bool_col, RF005 <- int_col
 |
 |--03:AGGREGATE [FINALIZE]
 |  |  output: count(*)
@@ -853,7 +853,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> a.bool_col, RF001 -> a.int_col, RF002 -> a.bool_col, RF003 -> a.int_col
+   runtime filters: RF000 -> a.bool_col, RF001 -> a.int_col, RF004 -> a.bool_col, RF005 -> a.int_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -870,7 +870,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: a.bool_col = bool_col, a.int_col = int_col
-|  runtime filters: RF002 <- bool_col, RF003 <- int_col
+|  runtime filters: RF004 <- bool_col, RF005 <- int_col
 |
 |--07:AGGREGATE [FINALIZE]
 |  |  output: count:merge(*)
@@ -890,7 +890,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> a.bool_col, RF001 -> a.int_col, RF002 -> a.bool_col, RF003 -> a.int_col
+   runtime filters: RF000 -> a.bool_col, RF001 -> a.int_col, RF004 -> a.bool_col, RF005 -> a.int_col
 ====
 # Tests that all predicates from the On-clause are applied (IMPALA-805)
 # and that slot equivalences are enforced at lowest possible plan node.
@@ -964,14 +964,14 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.id = b.id, a.int_col = b.int_col
-|  |  runtime filters: RF002 <- b.id, RF003 <- b.int_col
+|  |  runtime filters: RF004 <- b.id, RF005 <- b.int_col
 |  |
 |  |--01:SCAN HDFS [functional.alltypestiny b]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  00:SCAN HDFS [functional.alltypes a]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF002 -> a.id, RF003 -> a.int_col
+|     runtime filters: RF004 -> a.id, RF005 -> a.int_col
 |
 02:SCAN HDFS [functional.alltypessmall c]
    partitions=4/4 files=4 size=6.32KB
@@ -999,7 +999,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: b.int_col = a.id
-|  runtime filters: RF001 <- a.id
+|  runtime filters: RF002 <- a.id
 |
 |--00:SCAN HDFS [functional.alltypestiny a]
 |     partitions=4/4 files=4 size=460B
@@ -1007,7 +1007,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [functional.alltypes b]
    partitions=24/24 files=24 size=478.45KB
-   runtime filters: RF000 -> b.int_col, RF001 -> b.int_col
+   runtime filters: RF000 -> b.int_col, RF002 -> b.int_col
 ====
 # Tests elimination of redundant join predicates (IMPALA-912)
 # and that slot equivalences are enforced at the lowest possible plan node.
@@ -1072,7 +1072,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [RIGHT OUTER JOIN]
 |  hash predicates: t2.smallint_col = t1.smallint_col
-|  runtime filters: RF002 <- t1.smallint_col
+|  runtime filters: RF004 <- t1.smallint_col
 |
 |--00:SCAN HDFS [functional.alltypes t1]
 |     partitions=24/24 files=24 size=478.45KB
@@ -1080,7 +1080,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [functional.alltypesagg t2]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> t2.smallint_col, RF001 -> t2.bigint_col, RF002 -> t2.smallint_col
+   runtime filters: RF000 -> t2.smallint_col, RF001 -> t2.bigint_col, RF004 -> t2.smallint_col
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1098,7 +1098,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
 |  hash predicates: t2.smallint_col = t1.smallint_col
-|  runtime filters: RF002 <- t1.smallint_col
+|  runtime filters: RF004 <- t1.smallint_col
 |
 |--06:EXCHANGE [HASH(t1.smallint_col)]
 |  |
@@ -1110,7 +1110,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [functional.alltypesagg t2]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> t2.smallint_col, RF001 -> t2.bigint_col, RF002 -> t2.smallint_col
+   runtime filters: RF000 -> t2.smallint_col, RF001 -> t2.bigint_col, RF004 -> t2.smallint_col
 ====
 # Test correct removal of redundant join predicates (IMPALA-1353):
 # Equivalences among inline-view slots are enforced. The predicates
@@ -1257,7 +1257,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.tinyint_col = t3.smallint_col, t1.string_col = t3.date_string_col
-|  runtime filters: RF001 <- t3.smallint_col, RF002 <- t3.date_string_col
+|  runtime filters: RF002 <- t3.smallint_col, RF003 <- t3.date_string_col
 |
 |--01:SCAN HDFS [functional.alltypestiny t3]
 |     partitions=4/4 files=4 size=460B
@@ -1265,7 +1265,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypesagg t1]
    partitions=11/11 files=11 size=814.73KB
-   runtime filters: RF000 -> t1.string_col, RF001 -> t1.tinyint_col, RF002 -> t1.string_col
+   runtime filters: RF000 -> t1.string_col, RF002 -> t1.tinyint_col, RF003 -> t1.string_col
 ====
 # Regression test for IMPALA-935.
 select 1 from
@@ -1589,7 +1589,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.id = b.id
-|  runtime filters: RF001 <- b.id
+|  runtime filters: RF002 <- b.id
 |
 |--05:EXCHANGE [BROADCAST]
 |  |
@@ -1599,7 +1599,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny a]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> a.id, RF001 -> a.id
+   runtime filters: RF000 -> a.id, RF002 -> a.id
 ====
 # Test traditional commented join hints.
 select /* +straight_join */ * from functional.alltypestiny a
@@ -1623,7 +1623,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.id = b.id
-|  runtime filters: RF001 <- b.id
+|  runtime filters: RF002 <- b.id
 |
 |--05:EXCHANGE [BROADCAST]
 |  |
@@ -1633,7 +1633,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny a]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> a.id, RF001 -> a.id
+   runtime filters: RF000 -> a.id, RF002 -> a.id
 ====
 # Test end-of-line commented join hints.
 select
@@ -1663,7 +1663,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.id = b.id
-|  runtime filters: RF001 <- b.id
+|  runtime filters: RF002 <- b.id
 |
 |--05:EXCHANGE [BROADCAST]
 |  |
@@ -1673,7 +1673,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypestiny a]
    partitions=4/4 files=4 size=460B
-   runtime filters: RF000 -> a.id, RF001 -> a.id
+   runtime filters: RF000 -> a.id, RF002 -> a.id
 ====
 # Regression test for IMPALA-1289. Predicates should be assigned correctly
 # to inverted joins.
@@ -1962,7 +1962,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: int_col = int_col, string_col = string_col
-|  runtime filters: RF002 <- int_col, RF003 <- string_col
+|  runtime filters: RF004 <- int_col, RF005 <- string_col
 |
 |--12:EXCHANGE [HASH(string_col,int_col,int_col)]
 |  |
@@ -1990,7 +1990,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypessmall]
    partitions=4/4 files=4 size=6.32KB
    predicates: functional.alltypessmall.smallint_col = functional.alltypessmall.int_col
-   runtime filters: RF000 -> functional.alltypessmall.int_col, RF001 -> functional.alltypessmall.string_col, RF002 -> functional.alltypessmall.int_col, RF003 -> functional.alltypessmall.string_col
+   runtime filters: RF000 -> functional.alltypessmall.int_col, RF001 -> functional.alltypessmall.string_col, RF004 -> functional.alltypessmall.int_col, RF005 -> functional.alltypessmall.string_col
 ====
 # Assignment of predicates from the On-clause of an
 # anti join; inner join followed by anti join (IMPALA-1387)
@@ -2057,7 +2057,7 @@ PLAN-ROOT SINK
 |  06:HASH JOIN [INNER JOIN]
 |  |  hash predicates: b.id = a.id
 |  |  other predicates: a.float_col < b.float_col
-|  |  runtime filters: RF001 <- a.id
+|  |  runtime filters: RF002 <- a.id
 |  |
 |  |--00:SCAN HDFS [functional.alltypes a]
 |  |     partitions=24/24 files=24 size=478.45KB
@@ -2065,7 +2065,7 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [functional.alltypes b]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF001 -> b.id
+|     runtime filters: RF002 -> b.id
 |
 03:SCAN HDFS [functional.alltypesagg d]
    partitions=11/11 files=11 size=814.73KB

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-delete.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-delete.test b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-delete.test
index 3a4b97a..2120c3e 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-delete.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-delete.test
@@ -31,13 +31,13 @@ DELETE FROM KUDU [functional_kudu.testtbl]
 |
 |--03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: b.id = a.id
-|  |  runtime filters: RF001 <- a.id
+|  |  runtime filters: RF002 <- a.id
 |  |
 |  |--00:SCAN KUDU [functional_kudu.testtbl a]
 |  |
 |  01:SCAN HDFS [functional.alltypes b]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF001 -> b.id
+|     runtime filters: RF002 -> b.id
 |
 02:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
@@ -53,7 +53,7 @@ DELETE FROM KUDU [functional_kudu.testtbl]
 |  |
 |  03:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: b.id = a.id
-|  |  runtime filters: RF001 <- a.id
+|  |  runtime filters: RF002 <- a.id
 |  |
 |  |--05:EXCHANGE [BROADCAST]
 |  |  |
@@ -61,7 +61,7 @@ DELETE FROM KUDU [functional_kudu.testtbl]
 |  |
 |  01:SCAN HDFS [functional.alltypes b]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF001 -> b.id
+|     runtime filters: RF002 -> b.id
 |
 06:EXCHANGE [HASH(id)]
 |

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-update.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-update.test b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-update.test
index b779ee3..5fe40ac 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/kudu-update.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/kudu-update.test
@@ -45,6 +45,7 @@ UPDATE KUDU [functional_kudu.testtbl]
 |
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = b.id
+|  runtime filters: RF001 <- b.id
 |
 |--01:SCAN HDFS [functional.testtbl b]
 |     partitions=1/1 files=0 size=0B
@@ -52,11 +53,13 @@ UPDATE KUDU [functional_kudu.testtbl]
 |
 00:SCAN KUDU [functional_kudu.testtbl a]
    kudu predicates: a.id = 10
+   runtime filters: RF001 -> a.id
 ---- DISTRIBUTEDPLAN
 UPDATE KUDU [functional_kudu.testtbl]
 |
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: a.id = b.id
+|  runtime filters: RF001 <- b.id
 |
 |--03:EXCHANGE [BROADCAST]
 |  |
@@ -66,6 +69,7 @@ UPDATE KUDU [functional_kudu.testtbl]
 |
 00:SCAN KUDU [functional_kudu.testtbl a]
    kudu predicates: a.id = 10
+   runtime filters: RF001 -> a.id
 ====
 update a
 set a.name = 'values'
@@ -85,6 +89,7 @@ UPDATE KUDU [functional_kudu.testtbl]
 |
 02:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: a.id = ids
+|  runtime filters: RF001 <- ids
 |
 |--04:EXCHANGE [HASH(ids)]
 |  |
@@ -94,6 +99,7 @@ UPDATE KUDU [functional_kudu.testtbl]
 03:EXCHANGE [HASH(a.id)]
 |
 00:SCAN KUDU [functional_kudu.testtbl a]
+   runtime filters: RF001 -> a.id
 ====
 update a
 set a.name = 'values'
@@ -104,17 +110,20 @@ UPDATE KUDU [functional_kudu.testtbl]
 |
 02:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: a.zip = zip
+|  runtime filters: RF001 <- zip
 |
 |--01:SCAN HDFS [functional.testtbl]
 |     partitions=1/1 files=0 size=0B
 |     limit: 10
 |
 00:SCAN KUDU [functional_kudu.testtbl a]
+   runtime filters: RF001 -> a.zip
 ---- DISTRIBUTEDPLAN
 UPDATE KUDU [functional_kudu.testtbl]
 |
 02:HASH JOIN [LEFT SEMI JOIN, BROADCAST]
 |  hash predicates: a.zip = zip
+|  runtime filters: RF001 <- zip
 |
 |--04:EXCHANGE [BROADCAST]
 |  |
@@ -126,6 +135,7 @@ UPDATE KUDU [functional_kudu.testtbl]
 |     limit: 10
 |
 00:SCAN KUDU [functional_kudu.testtbl a]
+   runtime filters: RF001 -> a.zip
 ====
 update functional_kudu.testtbl set zip = 94546 where false
 ---- PLAN
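
The kudu-update.test hunks above add runtime filters the old UPDATE plans
lacked entirely: each join now announces a filter (e.g. "RF001 <- b.id") and
the Kudu scan feeding the update consumes it ("RF001 -> a.id"). A hedged
reconstruction of the first statement's shape -- the full SQL sits outside
this diff's context, so the SET and WHERE clauses are assumed:

update a
set a.name = 'values'
from functional_kudu.testtbl a
  join functional.testtbl b on a.id = b.id
where a.id = 10;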

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test b/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
index e620ad6..053fe72 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
@@ -344,6 +344,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = b.id
+|  runtime filters: RF001 <- b.id
 |
 |--01:SCAN KUDU [functional_kudu.alltypessmall b]
 |     predicates: CAST(b.id AS STRING) > '123'
@@ -352,6 +353,7 @@ PLAN-ROOT SINK
 00:SCAN KUDU [functional_kudu.alltypes a]
    predicates: CAST(a.id AS STRING) > '123'
    kudu predicates: a.id > 10
+   runtime filters: RF001 -> a.id
 ====
 # IMPALA-4662: Kudu analysis failure for NULL literal in IN list
 # NULL literal in values list results in applying predicate at scan node
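
The kudu.test hunk above shows the same for a plain join between Kudu tables:
the new filter appears at the hash join ("RF001 <- b.id") and is applied at
00:SCAN KUDU ("RF001 -> a.id"). A minimal query of that shape, assuming the
functional_kudu schema used throughout these tests:

select count(*)
from functional_kudu.alltypes a
  join functional_kudu.alltypessmall b on a.id = b.id;
-- The odd ID is the min-max slot of the join's reserved pair; the even bloom
-- slot (RF000) is not created because these plans only send bloom filters to
-- HDFS scans.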

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/max-row-size.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/max-row-size.test b/testdata/workloads/functional-planner/queries/PlannerTest/max-row-size.test
index d563444..fe7e25f 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/max-row-size.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/max-row-size.test
@@ -21,7 +21,7 @@ Per-Host Resources: mem-estimate=40.94MB mem-reservation=16.94MB
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c_nationkey = n_nationkey
 |  fk/pk conjuncts: c_nationkey = n_nationkey
-|  runtime filters: RF000 <- n_nationkey
+|  runtime filters: RF000[bloom] <- n_nationkey
 |  mem-estimate=16.94MB mem-reservation=16.94MB spill-buffer=64.00KB
 |  tuple-ids=0,1 row-size=355B cardinality=150000
 |
@@ -41,7 +41,7 @@ Per-Host Resources: mem-estimate=40.94MB mem-reservation=16.94MB
 |
 00:SCAN HDFS [tpch_parquet.customer, RANDOM]
    partitions=1/1 files=1 size=12.34MB
-   runtime filters: RF000 -> c_nationkey
+   runtime filters: RF000[bloom] -> c_nationkey
    stats-rows=150000 extrapolated-rows=disabled
    table stats: rows=150000 size=12.34MB
    column stats: all
@@ -186,7 +186,7 @@ Per-Host Resources: mem-estimate=85.12MB mem-reservation=65.00MB
 02:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  runtime filters: RF000 <- o_orderkey
+|  runtime filters: RF000[bloom] <- o_orderkey
 |  mem-estimate=31.00MB mem-reservation=31.00MB spill-buffer=1.00MB
 |  tuple-ids=0,1 row-size=33B cardinality=5757710
 |
@@ -212,7 +212,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
 Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
    partitions=1/1 files=3 size=193.92MB
-   runtime filters: RF000 -> l_orderkey
+   runtime filters: RF000[bloom] -> l_orderkey
    stats-rows=6001215 extrapolated-rows=disabled
    table stats: rows=6001215 size=193.92MB
    column stats: all

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/min-max-runtime-filters.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/min-max-runtime-filters.test b/testdata/workloads/functional-planner/queries/PlannerTest/min-max-runtime-filters.test
new file mode 100644
index 0000000..13e3014
--- /dev/null
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/min-max-runtime-filters.test
@@ -0,0 +1,142 @@
+# basic filter
+select count(*) from functional_kudu.alltypes a, functional_kudu.alltypestiny b
+where a.int_col = b.tinyint_col + 1 and a.string_col = b.string_col
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=11.94MB mem-reservation=1.94MB
+PLAN-ROOT SINK
+|  mem-estimate=0B mem-reservation=0B
+|
+03:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
+|  tuple-ids=2 row-size=8B cardinality=1
+|
+02:HASH JOIN [INNER JOIN]
+|  hash predicates: a.string_col = b.string_col, a.int_col = b.tinyint_col + 1
+|  fk/pk conjuncts: none
+|  runtime filters: RF002[min_max] <- b.string_col, RF003[min_max] <- b.tinyint_col + 1
+|  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
+|  tuple-ids=0,1 row-size=39B cardinality=5840
+|
+|--01:SCAN KUDU [functional_kudu.alltypestiny b]
+|     mem-estimate=0B mem-reservation=0B
+|     tuple-ids=1 row-size=18B cardinality=8
+|
+00:SCAN KUDU [functional_kudu.alltypes a]
+   runtime filters: RF002[min_max] -> a.string_col, RF003[min_max] -> a.int_col
+   mem-estimate=0B mem-reservation=0B
+   tuple-ids=0 row-size=21B cardinality=7300
+====
+# Filters are not created if the target isn't a bare Kudu column or if 'is (not) distinct'
+# is used.
+select count(*) from functional_kudu.alltypes a, functional_kudu.alltypestiny b
+where a.int_col + 1 = b.int_col
+    and a.string_col is distinct from b.string_col
+    and a.tinyint_col is not distinct from b.tinyint_col
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=11.94MB mem-reservation=1.94MB
+PLAN-ROOT SINK
+|  mem-estimate=0B mem-reservation=0B
+|
+03:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
+|  tuple-ids=2 row-size=8B cardinality=1
+|
+02:HASH JOIN [INNER JOIN]
+|  hash predicates: a.tinyint_col IS NOT DISTINCT FROM b.tinyint_col, a.int_col + 1 = b.int_col
+|  fk/pk conjuncts: assumed fk/pk
+|  other predicates: a.string_col IS DISTINCT FROM b.string_col
+|  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
+|  tuple-ids=0,1 row-size=44B cardinality=7300
+|
+|--01:SCAN KUDU [functional_kudu.alltypestiny b]
+|     mem-estimate=0B mem-reservation=0B
+|     tuple-ids=1 row-size=22B cardinality=8
+|
+00:SCAN KUDU [functional_kudu.alltypes a]
+   mem-estimate=0B mem-reservation=0B
+   tuple-ids=0 row-size=22B cardinality=7300
+====
+# Filters are only assigned to a cast target expr if it's an implicit integer cast.
+select count(*) from functional_kudu.alltypes a, functional_kudu.alltypestiny b
+where a.tinyint_col = b.bigint_col
+    and cast(a.int_col as smallint) = b.smallint_col
+    and a.string_col = b.timestamp_col
+    and cast(a.float_col as double) = b.double_col
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=11.94MB mem-reservation=1.94MB
+PLAN-ROOT SINK
+|  mem-estimate=0B mem-reservation=0B
+|
+03:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
+|  tuple-ids=2 row-size=8B cardinality=1
+|
+02:HASH JOIN [INNER JOIN]
+|  hash predicates: CAST(a.float_col AS DOUBLE) = b.double_col, CAST(a.int_col AS SMALLINT) = b.smallint_col, a.string_col = b.timestamp_col, a.tinyint_col = b.bigint_col
+|  fk/pk conjuncts: a.string_col = b.timestamp_col, a.tinyint_col = b.bigint_col
+|  runtime filters: RF007[min_max] <- b.bigint_col
+|  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
+|  tuple-ids=0,1 row-size=60B cardinality=1460
+|
+|--01:SCAN KUDU [functional_kudu.alltypestiny b]
+|     mem-estimate=0B mem-reservation=0B
+|     tuple-ids=1 row-size=34B cardinality=8
+|
+00:SCAN KUDU [functional_kudu.alltypes a]
+   runtime filters: RF007[min_max] -> a.tinyint_col
+   mem-estimate=0B mem-reservation=0B
+   tuple-ids=0 row-size=26B cardinality=7300
+====
+# Query with both Kudu and HDFS filter targets.
+select count(*) from functional_kudu.alltypes a, functional_parquet.alltypes b,
+    functional_kudu.alltypes c
+where a.int_col = b.int_col and a.int_col = c.int_col
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=2.02GB mem-reservation=35.94MB
+PLAN-ROOT SINK
+|  mem-estimate=0B mem-reservation=0B
+|
+05:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
+|  tuple-ids=3 row-size=8B cardinality=1
+|
+04:HASH JOIN [INNER JOIN]
+|  hash predicates: a.int_col = c.int_col
+|  fk/pk conjuncts: none
+|  runtime filters: RF000[bloom] <- c.int_col, RF001[min_max] <- c.int_col
+|  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
+|  tuple-ids=0,1,2 row-size=12B cardinality=5329000
+|
+|--02:SCAN KUDU [functional_kudu.alltypes c]
+|     mem-estimate=0B mem-reservation=0B
+|     tuple-ids=2 row-size=4B cardinality=7300
+|
+03:HASH JOIN [INNER JOIN]
+|  hash predicates: a.int_col = b.int_col
+|  fk/pk conjuncts: assumed fk/pk
+|  runtime filters: RF003[min_max] <- b.int_col
+|  mem-estimate=2.00GB mem-reservation=34.00MB spill-buffer=2.00MB
+|  tuple-ids=0,1 row-size=8B cardinality=7300
+|
+|--01:SCAN HDFS [functional_parquet.alltypes b]
+|     partitions=24/24 files=24 size=179.70KB
+|     runtime filters: RF000[bloom] -> b.int_col
+|     stats-rows=unavailable extrapolated-rows=disabled
+|     table stats: rows=unavailable size=unavailable
+|     column stats: unavailable
+|     mem-estimate=16.00MB mem-reservation=0B
+|     tuple-ids=1 row-size=4B cardinality=unavailable
+|
+00:SCAN KUDU [functional_kudu.alltypes a]
+   runtime filters: RF001[min_max] -> a.int_col, RF003[min_max] -> a.int_col
+   mem-estimate=0B mem-reservation=0B
+   tuple-ids=0 row-size=4B cardinality=7300
+====
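
The new min-max-runtime-filters.test above pins down when min-max filters are
generated: only for bare Kudu columns (at most under an implicit integer
cast), not for 'is (not) distinct' predicates, and side by side with a bloom
filter when the same join also feeds an HDFS scan. Splitting the cast test's
combined predicates into standalone queries -- these single-predicate
variants are not themselves in the file -- makes the cast rule concrete:

-- Filter kept: comparing tinyint to bigint only implicitly widens the Kudu
-- column, so a.tinyint_col remains a valid min-max filter target.
select count(*) from functional_kudu.alltypes a, functional_kudu.alltypestiny b
where a.tinyint_col = b.bigint_col;

-- No filter: the explicit cast means the target is no longer a bare column.
select count(*) from functional_kudu.alltypes a, functional_kudu.alltypestiny b
where cast(a.int_col as smallint) = b.smallint_col;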

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test b/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test
index da2e17f..3646146 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/nested-collections.test
@@ -86,7 +86,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: c.c_nationkey = s.s_nationkey, c_comment = s_comment
-|  runtime filters: RF002 <- s.s_nationkey, RF003 <- s_comment
+|  runtime filters: RF004 <- s.s_nationkey, RF005 <- s_comment
 |
 |--06:SCAN HDFS [tpch_nested_parquet.supplier s]
 |     partitions=1/1 files=1 size=111.08MB
@@ -94,7 +94,7 @@ PLAN-ROOT SINK
 |
 05:SCAN HDFS [tpch_nested_parquet.customer c]
    partitions=1/1 files=4 size=577.87MB
-   runtime filters: RF000 -> c_nationkey, RF001 -> c.c_comment, RF002 -> c.c_nationkey, RF003 -> c_comment
+   runtime filters: RF000 -> c_nationkey, RF001 -> c.c_comment, RF004 -> c.c_nationkey, RF005 -> c_comment
 ====
 # Test subplans: Cross join of parent and relative ref.
 select a.id, b.item from functional.allcomplextypes a cross join a.int_array_col b
@@ -695,7 +695,7 @@ PLAN-ROOT SINK
 |  |
 |  |--05:HASH JOIN [RIGHT OUTER JOIN]
 |  |  |  hash predicates: b.id = a.id
-|  |  |  runtime filters: RF001 <- a.id
+|  |  |  runtime filters: RF002 <- a.id
 |  |  |
 |  |  |--00:SCAN HDFS [functional.allcomplextypes a]
 |  |  |     partitions=0/0 files=0 size=0B
@@ -704,7 +704,7 @@ PLAN-ROOT SINK
 |  |  |
 |  |  01:SCAN HDFS [functional.alltypestiny b]
 |  |     partitions=4/4 files=4 size=460B
-|  |     runtime filters: RF001 -> b.id
+|  |     runtime filters: RF002 -> b.id
 |  |
 |  02:SCAN HDFS [functional.alltypessmall c]
 |     partitions=4/4 files=4 size=6.32KB
@@ -782,7 +782,7 @@ PLAN-ROOT SINK
 |  10:HASH JOIN [RIGHT OUTER JOIN]
 |  |  hash predicates: c.id = b.item
 |  |  other predicates: c.int_col > 30
-|  |  runtime filters: RF001 <- b.item
+|  |  runtime filters: RF002 <- b.item
 |  |
 |  |--01:SUBPLAN
 |  |  |
@@ -800,7 +800,7 @@ PLAN-ROOT SINK
 |  05:SCAN HDFS [functional.alltypessmall c]
 |     partitions=4/4 files=4 size=6.32KB
 |     predicates: c.id < 10, c.int_col > 30
-|     runtime filters: RF001 -> c.id
+|     runtime filters: RF002 -> c.id
 |
 06:SCAN HDFS [functional.alltypes e]
    partitions=24/24 files=24 size=478.45KB
@@ -1966,22 +1966,22 @@ PLAN-ROOT SINK
 |
 |--05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t3.r_comment = t2.c_address
-|  |  runtime filters: RF001 <- t2.c_address
+|  |  runtime filters: RF002 <- t2.c_address
 |  |
 |  |--04:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: t2.c_custkey = t5.r_regionkey
-|  |  |  runtime filters: RF002 <- t5.r_regionkey
+|  |  |  runtime filters: RF004 <- t5.r_regionkey
 |  |  |
 |  |  |--03:SCAN HDFS [tpch_nested_parquet.region t5]
 |  |  |     partitions=1/1 files=1 size=4.18KB
 |  |  |
 |  |  01:SCAN HDFS [tpch_nested_parquet.customer t2]
 |  |     partitions=1/1 files=4 size=577.87MB
-|  |     runtime filters: RF002 -> t2.c_custkey
+|  |     runtime filters: RF004 -> t2.c_custkey
 |  |
 |  02:SCAN HDFS [tpch_nested_parquet.region t3]
 |     partitions=1/1 files=1 size=4.18KB
-|     runtime filters: RF001 -> t3.r_comment
+|     runtime filters: RF002 -> t3.r_comment
 |
 00:SCAN HDFS [tpch_nested_parquet.region.r_nations t1]
    partitions=1/1 files=1 size=4.18KB

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/order.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/order.test b/testdata/workloads/functional-planner/queries/PlannerTest/order.test
index 4f51a2a..c64be15 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/order.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/order.test
@@ -359,7 +359,7 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.smallint_col = b.id
-|  |  runtime filters: RF001 <- b.id
+|  |  runtime filters: RF002 <- b.id
 |  |
 |  |--01:SCAN HDFS [functional.alltypessmall b]
 |  |     partitions=4/4 files=4 size=6.32KB
@@ -368,7 +368,7 @@ PLAN-ROOT SINK
 |  00:SCAN HDFS [functional.alltypesagg a]
 |     partitions=1/11 files=1 size=73.39KB
 |     predicates: a.int_col > 899
-|     runtime filters: RF001 -> a.smallint_col
+|     runtime filters: RF002 -> a.smallint_col
 |
 02:SCAN HDFS [functional.alltypessmall c]
    partitions=4/4 files=4 size=6.32KB
@@ -392,7 +392,7 @@ PLAN-ROOT SINK
 |  |
 |  03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  hash predicates: b.id = a.smallint_col
-|  |  runtime filters: RF001 <- a.smallint_col
+|  |  runtime filters: RF002 <- a.smallint_col
 |  |
 |  |--07:EXCHANGE [HASH(a.smallint_col)]
 |  |  |
@@ -405,7 +405,7 @@ PLAN-ROOT SINK
 |  01:SCAN HDFS [functional.alltypessmall b]
 |     partitions=4/4 files=4 size=6.32KB
 |     predicates: b.float_col > 4.5
-|     runtime filters: RF001 -> b.id
+|     runtime filters: RF002 -> b.id
 |
 08:EXCHANGE [HASH(c.id)]
 |

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/outer-joins.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/outer-joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/outer-joins.test
index 51989cd..4f7b809 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/outer-joins.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/outer-joins.test
@@ -220,7 +220,7 @@ PLAN-ROOT SINK
 03:HASH JOIN [RIGHT OUTER JOIN]
 |  hash predicates: t1.id - 1 = t2.id + 1
 |  other join predicates: t1.zip = 94611, t2.zip = 94104
-|  runtime filters: RF001 <- t2.id + 1
+|  runtime filters: RF002 <- t2.id + 1
 |
 |--01:SCAN HDFS [functional.testtbl t2]
 |     partitions=1/1 files=0 size=0B
@@ -229,7 +229,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.testtbl t1]
    partitions=1/1 files=0 size=0B
    predicates: t1.id IS NOT NULL, t1.id > 0
-   runtime filters: RF000 -> t1.id, RF001 -> t1.id - 1
+   runtime filters: RF000 -> t1.id, RF002 -> t1.id - 1
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -252,7 +252,7 @@ PLAN-ROOT SINK
 03:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
 |  hash predicates: t1.id - 1 = t2.id + 1
 |  other join predicates: t1.zip = 94611, t2.zip = 94104
-|  runtime filters: RF001 <- t2.id + 1
+|  runtime filters: RF002 <- t2.id + 1
 |
 |--06:EXCHANGE [HASH(t2.id + 1)]
 |  |
@@ -265,7 +265,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.testtbl t1]
    partitions=1/1 files=0 size=0B
    predicates: t1.id IS NOT NULL, t1.id > 0
-   runtime filters: RF000 -> t1.id, RF001 -> t1.id - 1
+   runtime filters: RF000 -> t1.id, RF002 -> t1.id - 1
 ====
 # the same thing with subqueries; should produce the same result
 select *
@@ -291,7 +291,7 @@ PLAN-ROOT SINK
 03:HASH JOIN [RIGHT OUTER JOIN]
 |  hash predicates: a1.id - 1 = a2.id + 1
 |  other join predicates: a1.zip = 94611, a2.zip = 94104
-|  runtime filters: RF001 <- a2.id + 1
+|  runtime filters: RF002 <- a2.id + 1
 |
 |--01:SCAN HDFS [functional.testtbl a2]
 |     partitions=1/1 files=0 size=0B
@@ -300,7 +300,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.testtbl a1]
    partitions=1/1 files=0 size=0B
    predicates: a1.id IS NOT NULL, a1.id > 0
-   runtime filters: RF000 -> a1.id, RF001 -> a1.id - 1
+   runtime filters: RF000 -> a1.id, RF002 -> a1.id - 1
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -323,7 +323,7 @@ PLAN-ROOT SINK
 03:HASH JOIN [RIGHT OUTER JOIN, PARTITIONED]
 |  hash predicates: a1.id - 1 = a2.id + 1
 |  other join predicates: a1.zip = 94611, a2.zip = 94104
-|  runtime filters: RF001 <- a2.id + 1
+|  runtime filters: RF002 <- a2.id + 1
 |
 |--06:EXCHANGE [HASH(a2.id + 1)]
 |  |
@@ -336,7 +336,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.testtbl a1]
    partitions=1/1 files=0 size=0B
    predicates: a1.id IS NOT NULL, a1.id > 0
-   runtime filters: RF000 -> a1.id, RF001 -> a1.id - 1
+   runtime filters: RF000 -> a1.id, RF002 -> a1.id - 1
 ====
 # right outer join requires the join op to be partitioned, otherwise non-matches cause
 # duplicates
@@ -661,7 +661,7 @@ PLAN-ROOT SINK
 |
 |--04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.id = b.id
-|  |  runtime filters: RF001 <- b.id
+|  |  runtime filters: RF002 <- b.id
 |  |
 |  |--01:SCAN HDFS [functional.alltypes b]
 |  |     partitions=24/24 files=24 size=478.45KB
@@ -669,7 +669,7 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [functional.alltypes a]
 |     partitions=24/24 files=24 size=478.45KB
-|     runtime filters: RF001 -> a.id
+|     runtime filters: RF002 -> a.id
 |
 02:SCAN HDFS [functional.alltypes c]
    partitions=24/24 files=24 size=478.45KB
@@ -800,7 +800,7 @@ PLAN-ROOT SINK
 |
 |--04:HASH JOIN [RIGHT OUTER JOIN]
 |  |  hash predicates: a.id = b.id
-|  |  runtime filters: RF001 <- b.id
+|  |  runtime filters: RF002 <- b.id
 |  |
 |  |--01:SCAN HDFS [functional.alltypestiny b]
 |  |     partitions=4/4 files=4 size=460B
@@ -809,7 +809,7 @@ PLAN-ROOT SINK
 |  00:SCAN HDFS [functional.alltypestiny a]
 |     partitions=4/4 files=4 size=460B
 |     predicates: a.int_col > 10
-|     runtime filters: RF001 -> a.id
+|     runtime filters: RF002 -> a.id
 |
 02:SCAN HDFS [functional.alltypestiny c]
    partitions=4/4 files=4 size=460B

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test b/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
index bd2d705..cc29c8c 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
@@ -219,7 +219,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: a.id = b.id, a.month = b.month, a.year = b.year, a.tinyint_col = b.smallint_col
-|  runtime filters: RF004 <- b.id, RF005 <- b.month, RF006 <- b.year, RF007 <- b.smallint_col
+|  runtime filters: RF008 <- b.id, RF009 <- b.month, RF010 <- b.year, RF011 <- b.smallint_col
 |
 |--01:SCAN HDFS [functional.alltypessmall b]
 |     partitions=2/4 files=2 size=3.16KB
@@ -229,7 +229,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypes a]
    partitions=2/24 files=2 size=38.07KB
    predicates: a.id = 17, CAST(sin(a.tinyint_col) AS BOOLEAN) = TRUE
-   runtime filters: RF000 -> a.id, RF001 -> a.month, RF002 -> a.year, RF003 -> a.tinyint_col, RF004 -> a.id, RF005 -> a.month, RF006 -> a.year, RF007 -> a.tinyint_col
+   runtime filters: RF000 -> a.id, RF001 -> a.month, RF002 -> a.year, RF003 -> a.tinyint_col, RF008 -> a.id, RF009 -> a.month, RF010 -> a.year, RF011 -> a.tinyint_col
 ---- SCANRANGELOCATIONS
 NODE 0:
   HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=1/090101.txt 0:20433
@@ -257,7 +257,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: a.id = b.id, a.month = b.month, a.year = b.year, a.tinyint_col = b.smallint_col
-|  runtime filters: RF004 <- b.id, RF005 <- b.month, RF006 <- b.year, RF007 <- b.smallint_col
+|  runtime filters: RF008 <- b.id, RF009 <- b.month, RF010 <- b.year, RF011 <- b.smallint_col
 |
 |--06:EXCHANGE [HASH(b.id,b.month,b.year,b.smallint_col)]
 |  |
@@ -271,7 +271,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypes a]
    partitions=2/24 files=2 size=38.07KB
    predicates: a.id = 17, CAST(sin(a.tinyint_col) AS BOOLEAN) = TRUE
-   runtime filters: RF000 -> a.id, RF001 -> a.month, RF002 -> a.year, RF003 -> a.tinyint_col, RF004 -> a.id, RF005 -> a.month, RF006 -> a.year, RF007 -> a.tinyint_col
+   runtime filters: RF000 -> a.id, RF001 -> a.month, RF002 -> a.year, RF003 -> a.tinyint_col, RF008 -> a.id, RF009 -> a.month, RF010 -> a.year, RF011 -> a.tinyint_col
 ====
 # basic propagation between equivalence classes, with partition pruning;
 # variation with inline views
@@ -296,7 +296,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: functional.alltypes.id = functional.alltypessmall.id, functional.alltypes.month = functional.alltypessmall.month, functional.alltypes.year = functional.alltypessmall.year, functional.alltypes.tinyint_col = functional.alltypessmall.smallint_col
-|  runtime filters: RF004 <- functional.alltypessmall.id, RF005 <- functional.alltypessmall.month, RF006 <- functional.alltypessmall.year, RF007 <- functional.alltypessmall.smallint_col
+|  runtime filters: RF008 <- functional.alltypessmall.id, RF009 <- functional.alltypessmall.month, RF010 <- functional.alltypessmall.year, RF011 <- functional.alltypessmall.smallint_col
 |
 |--01:SCAN HDFS [functional.alltypessmall]
 |     partitions=2/4 files=2 size=3.16KB
@@ -306,7 +306,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypes]
    partitions=2/24 files=2 size=38.07KB
    predicates: functional.alltypes.id = 17, CAST(sin(functional.alltypes.tinyint_col) AS BOOLEAN) = TRUE
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.month, RF002 -> functional.alltypes.year, RF003 -> functional.alltypes.tinyint_col, RF004 -> functional.alltypes.id, RF005 -> functional.alltypes.month, RF006 -> functional.alltypes.year, RF007 -> functional.alltypes.tinyint_col
+   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.month, RF002 -> functional.alltypes.year, RF003 -> functional.alltypes.tinyint_col, RF008 -> functional.alltypes.id, RF009 -> functional.alltypes.month, RF010 -> functional.alltypes.year, RF011 -> functional.alltypes.tinyint_col
 ---- SCANRANGELOCATIONS
 NODE 0:
   HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=1/090101.txt 0:20433
@@ -334,7 +334,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: functional.alltypes.id = functional.alltypessmall.id, functional.alltypes.month = functional.alltypessmall.month, functional.alltypes.year = functional.alltypessmall.year, functional.alltypes.tinyint_col = functional.alltypessmall.smallint_col
-|  runtime filters: RF004 <- functional.alltypessmall.id, RF005 <- functional.alltypessmall.month, RF006 <- functional.alltypessmall.year, RF007 <- functional.alltypessmall.smallint_col
+|  runtime filters: RF008 <- functional.alltypessmall.id, RF009 <- functional.alltypessmall.month, RF010 <- functional.alltypessmall.year, RF011 <- functional.alltypessmall.smallint_col
 |
 |--06:EXCHANGE [HASH(functional.alltypessmall.id,functional.alltypessmall.month,functional.alltypessmall.year,functional.alltypessmall.smallint_col)]
 |  |
@@ -348,7 +348,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypes]
    partitions=2/24 files=2 size=38.07KB
    predicates: functional.alltypes.id = 17, CAST(sin(functional.alltypes.tinyint_col) AS BOOLEAN) = TRUE
-   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.month, RF002 -> functional.alltypes.year, RF003 -> functional.alltypes.tinyint_col, RF004 -> functional.alltypes.id, RF005 -> functional.alltypes.month, RF006 -> functional.alltypes.year, RF007 -> functional.alltypes.tinyint_col
+   runtime filters: RF000 -> functional.alltypes.id, RF001 -> functional.alltypes.month, RF002 -> functional.alltypes.year, RF003 -> functional.alltypes.tinyint_col, RF008 -> functional.alltypes.id, RF009 -> functional.alltypes.month, RF010 -> functional.alltypes.year, RF011 -> functional.alltypes.tinyint_col
 ====
 # propagation between outer-joined tables only goes in one direction:
 # - predicates on a.year and a.tinyint_col are propagated to b
@@ -824,14 +824,14 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: a.int_col = b.int_col, a.year = b.year
 |  other predicates: a.id + b.id = 17
-|  runtime filters: RF001 <- b.int_col, RF002 <- b.year
+|  runtime filters: RF002 <- b.int_col, RF003 <- b.year
 |
 |--01:SCAN HDFS [functional.alltypessmall b]
 |     partitions=4/4 files=4 size=6.32KB
 |
 00:SCAN HDFS [functional.alltypes a]
    partitions=12/24 files=12 size=238.68KB
-   runtime filters: RF000 -> a.id, RF001 -> a.int_col, RF002 -> a.year
+   runtime filters: RF000 -> a.id, RF002 -> a.int_col, RF003 -> a.year
 ====
 # correct placement of predicates in the presence of aggregation in an inline view
 select straight_join a.id, b.id
@@ -1051,7 +1051,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.id = t2.id, t1.tinyint_col = t2.smallint_col
-|  runtime filters: RF003 <- t2.id, RF004 <- t2.smallint_col
+|  runtime filters: RF006 <- t2.id, RF007 <- t2.smallint_col
 |
 |--01:SCAN HDFS [functional.alltypessmall t2]
 |     partitions=4/4 files=4 size=6.32KB
@@ -1061,7 +1061,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypes t1]
    partitions=24/24 files=24 size=478.45KB
    predicates: t1.id + t1.tinyint_col > 10
-   runtime filters: RF001 -> t1.id, RF002 -> t1.tinyint_col, RF003 -> t1.id, RF004 -> t1.tinyint_col
+   runtime filters: RF001 -> t1.id, RF002 -> t1.tinyint_col, RF006 -> t1.id, RF007 -> t1.tinyint_col
 ====
 # basic propagation of multi-slot, single-tuple predicates with aggregates
 select straight_join 1 from
@@ -1096,7 +1096,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: t1.id = count(tinyint_col), t1.tinyint_col = max(smallint_col)
-|  runtime filters: RF001 <- count(tinyint_col), RF002 <- max(smallint_col)
+|  runtime filters: RF002 <- count(tinyint_col), RF003 <- max(smallint_col)
 |
 |--02:AGGREGATE [FINALIZE]
 |  |  output: count(tinyint_col), max(smallint_col), min(int_col)
@@ -1109,7 +1109,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [functional.alltypes t1]
    partitions=24/24 files=24 size=478.45KB
    predicates: t1.id + t1.tinyint_col > 10, t1.id + t1.tinyint_col > 20
-   runtime filters: RF000 -> t1.tinyint_col, RF001 -> t1.id, RF002 -> t1.tinyint_col
+   runtime filters: RF000 -> t1.tinyint_col, RF002 -> t1.id, RF003 -> t1.tinyint_col
 ====
 # assignment of multi-slot, single-tuple predicates with outer-joined tuple (IMPALA-824)
 select straight_join 1


[03/16] incubator-impala git commit: IMPALA-4252: Min-max runtime filters for Kudu

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/tpcds-all.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpcds-all.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpcds-all.test
index a18f4f5..31f0738 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tpcds-all.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpcds-all.test
@@ -42,7 +42,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_item_sk = item.i_item_sk
-|  runtime filters: RF001 <- item.i_item_sk
+|  runtime filters: RF002 <- item.i_item_sk
 |
 |--02:SCAN HDFS [tpcds.item]
 |     partitions=1/1 files=1 size=4.82MB
@@ -50,7 +50,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF001 -> store_sales.ss_item_sk
+   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_item_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -85,7 +85,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_item_sk = item.i_item_sk
-|  runtime filters: RF001 <- item.i_item_sk
+|  runtime filters: RF002 <- item.i_item_sk
 |
 |--07:EXCHANGE [BROADCAST]
 |  |
@@ -95,7 +95,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF001 -> store_sales.ss_item_sk
+   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_item_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -134,7 +134,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_item_sk = item.i_item_sk
-|  runtime filters: RF001 <- item.i_item_sk
+|  runtime filters: RF002 <- item.i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -148,7 +148,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF001 -> store_sales.ss_item_sk
+   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_item_sk
 ====
 # TPCDS-Q7
 select
@@ -198,7 +198,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_promo_sk = p_promo_sk
-|  runtime filters: RF001 <- p_promo_sk
+|  runtime filters: RF002 <- p_promo_sk
 |
 |--04:SCAN HDFS [tpcds.promotion]
 |     partitions=1/1 files=1 size=36.36KB
@@ -206,7 +206,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_cdemo_sk = cd_demo_sk
-|  runtime filters: RF002 <- cd_demo_sk
+|  runtime filters: RF004 <- cd_demo_sk
 |
 |--01:SCAN HDFS [tpcds.customer_demographics]
 |     partitions=1/1 files=1 size=76.92MB
@@ -214,7 +214,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--02:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
@@ -222,7 +222,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_item_sk, RF001 -> ss_promo_sk, RF002 -> ss_cdemo_sk, RF003 -> ss_sold_date_sk
+   runtime filters: RF000 -> ss_item_sk, RF002 -> ss_promo_sk, RF004 -> ss_cdemo_sk, RF006 -> ss_sold_date_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -254,7 +254,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_promo_sk = p_promo_sk
-|  runtime filters: RF001 <- p_promo_sk
+|  runtime filters: RF002 <- p_promo_sk
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
@@ -264,7 +264,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_cdemo_sk = cd_demo_sk
-|  runtime filters: RF002 <- cd_demo_sk
+|  runtime filters: RF004 <- cd_demo_sk
 |
 |--12:EXCHANGE [BROADCAST]
 |  |
@@ -274,7 +274,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--11:EXCHANGE [BROADCAST]
 |  |
@@ -284,7 +284,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_item_sk, RF001 -> ss_promo_sk, RF002 -> ss_cdemo_sk, RF003 -> ss_sold_date_sk
+   runtime filters: RF000 -> ss_item_sk, RF002 -> ss_promo_sk, RF004 -> ss_cdemo_sk, RF006 -> ss_sold_date_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -320,7 +320,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_promo_sk = p_promo_sk
-|  runtime filters: RF001 <- p_promo_sk
+|  runtime filters: RF002 <- p_promo_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -334,7 +334,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_cdemo_sk = cd_demo_sk
-|  runtime filters: RF002 <- cd_demo_sk
+|  runtime filters: RF004 <- cd_demo_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -348,7 +348,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -362,7 +362,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_item_sk, RF001 -> ss_promo_sk, RF002 -> ss_cdemo_sk, RF003 -> ss_sold_date_sk
+   runtime filters: RF000 -> ss_item_sk, RF002 -> ss_promo_sk, RF004 -> ss_cdemo_sk, RF006 -> ss_sold_date_sk
 ====
 # TPCDS-Q8
 select
@@ -449,7 +449,7 @@ PLAN-ROOT SINK
 |
 |--07:HASH JOIN [RIGHT SEMI JOIN]
 |  |  hash predicates: substr(ca_zip, 1, 5) = substr(ca_zip, 1, 5)
-|  |  runtime filters: RF002 <- substr(ca_zip, 1, 5)
+|  |  runtime filters: RF004 <- substr(ca_zip, 1, 5)
 |  |
 |  |--05:AGGREGATE [FINALIZE]
 |  |  |  output: count(*)
@@ -458,7 +458,7 @@ PLAN-ROOT SINK
 |  |  |
 |  |  04:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: customer_address.ca_address_sk = customer.c_current_addr_sk
-|  |  |  runtime filters: RF003 <- customer.c_current_addr_sk
+|  |  |  runtime filters: RF006 <- customer.c_current_addr_sk
 |  |  |
 |  |  |--03:SCAN HDFS [tpcds.customer]
 |  |  |     partitions=1/1 files=1 size=12.60MB
@@ -466,16 +466,16 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpcds.customer_address]
 |  |     partitions=1/1 files=1 size=5.25MB
-|  |     runtime filters: RF003 -> customer_address.ca_address_sk
+|  |     runtime filters: RF006 -> customer_address.ca_address_sk
 |  |
 |  06:SCAN HDFS [tpcds.customer_address]
 |     partitions=1/1 files=1 size=5.25MB
 |     predicates: substr(ca_zip, 1, 5) IN ('89436', '30868', '65085', '22977', '83927', '77557', '58429', '40697', '80614', '10502', '32779', '91137', '61265', '98294', '17921', '18427', '21203', '59362', '87291', '84093', '21505', '17184', '10866', '67898', '25797', '28055', '18377', '80332', '74535', '21757', '29742', '90885', '29898', '17819', '40811', '25990', '47513', '89531', '91068', '10391', '18846', '99223', '82637', '41368', '83658', '86199', '81625', '26696', '89338', '88425', '32200', '81427', '19053', '77471', '36610', '99823', '43276', '41249', '48584', '83550', '82276', '18842', '78890', '14090', '38123', '40936', '34425', '19850', '43286', '80072', '79188', '54191', '11395', '50497', '84861', '90733', '21068', '57666', '37119', '25004', '57835', '70067', '62878', '95806', '19303', '18840', '19124', '29785', '16737', '16022', '49613', '89977', '68310', '60069', '98360', '48649', '39050', '41793', '25002', '27413', '39736', '47208', '16515', '94808', '57648', '15009', '80015', '42961', '63982', '21744', '71853', '81087', '67468', '34175', '64008', '20261', '11201', '51799', '48043', '45645', '61163', '48375', '36447', '57042', '21218', '41100', '89951', '22745', '35851', '83326', '61125', '78298', '80752', '49858', '52940', '96976', '63792', '11376', '53582', '18717', '90226', '50530', '94203', '99447', '27670', '96577', '57856', '56372', '16165', '23427', '54561', '28806', '44439', '22926', '30123', '61451', '92397', '56979', '92309', '70873', '13355', '21801', '46346', '37562', '56458', '28286', '47306', '99555', '69399', '26234', '47546', '49661', '88601', '35943', '39936', '25632', '24611', '44166', '56648', '30379', '59785', '11110', '14329', '93815', '52226', '71381', '13842', '25612', '63294', '14664', '21077', '82626', '18799', '60915', '81020', '56447', '76619', '11433', '13414', '42548', '92713', '70467', '30884', '47484', '16072', '38936', '13036', '88376', '45539', '35901', '19506', '65690', '73957', '71850', '49231', '14276', '20005', '18384', '76615', '11635', '38177', '55607', '41369', '95447', '58581', '58149', '91946', '33790', '76232', '75692', '95464', '22246', '51061', '56692', '53121', '77209', '15482', '10688', '14868', '45907', '73520', '72666', '25734', '17959', '24677', '66446', '94627', '53535', '15560', '41967', '69297', '11929', '59403', '33283', '52232', '57350', '43933', '40921', '36635', '10827', '71286', '19736', '80619', '25251', '95042', '15526', '36496', '55854', '49124', '81980', '35375', '49157', '63512', '28944', '14946', '36503', '54010', '18767', '23969', '43905', '66979', '33113', '21286', '58471', '59080', '13395', '79144', '70373', '67031', '38360', '26705', '50906', '52406', '26066', '73146', '15884', '31897', '30045', '61068', '45550', '92454', '13376', '14354', '19770', '22928', '97790', '50723', '46081', '30202', '14410', '20223', '88500', '67298', '13261', '14172', '81410', '93578', '83583', '46047', '94167', '82564', '21156', '15799', '86709', '37931', '74703', '83103', '23054', '70470', '72008', '49247', '91911', '69998', '20961', '70070', '63197', '54853', '88191', '91830', '49521', '19454', '81450', '89091', '62378', '25683', '61869', '51744', '36580', '85778', '36871', '48121', '28810', '83712', '45486', '67393', '26935', '42393', '20132', '55349', '86057', '21309', '80218', '10094', '11357', '48819', '39734', '40758', '30432', '21204', '29467', '30214', '61024', '55307', '74621', '11622', '68908', '33032', '52868', '99194', '99900', '84936', '69036', '99149', '45013', '32895', '59004', '32322', '14933', '32936', '33562', '72550', '27385', '58049', '58200', '16808', '21360', '32961', '18586', '79307', '15492')
-|     runtime filters: RF002 -> substr(ca_zip, 1, 5)
+|     runtime filters: RF004 -> substr(ca_zip, 1, 5)
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF001 <- store.s_store_sk
+|  runtime filters: RF002 <- store.s_store_sk
 |
 |--01:SCAN HDFS [tpcds.store]
 |     partitions=1/1 files=1 size=3.08KB
@@ -483,7 +483,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=90/1824 files=90 size=9.71MB
-   runtime filters: RF001 -> store_sales.ss_store_sk
+   runtime filters: RF002 -> store_sales.ss_store_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -512,7 +512,7 @@ PLAN-ROOT SINK
 |  |
 |  07:HASH JOIN [RIGHT SEMI JOIN, PARTITIONED]
 |  |  hash predicates: substr(ca_zip, 1, 5) = substr(ca_zip, 1, 5)
-|  |  runtime filters: RF002 <- substr(ca_zip, 1, 5)
+|  |  runtime filters: RF004 <- substr(ca_zip, 1, 5)
 |  |
 |  |--15:AGGREGATE [FINALIZE]
 |  |  |  output: count:merge(*)
@@ -527,7 +527,7 @@ PLAN-ROOT SINK
 |  |  |
 |  |  04:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  |  hash predicates: customer_address.ca_address_sk = customer.c_current_addr_sk
-|  |  |  runtime filters: RF003 <- customer.c_current_addr_sk
+|  |  |  runtime filters: RF006 <- customer.c_current_addr_sk
 |  |  |
 |  |  |--13:EXCHANGE [BROADCAST]
 |  |  |  |
@@ -537,18 +537,18 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpcds.customer_address]
 |  |     partitions=1/1 files=1 size=5.25MB
-|  |     runtime filters: RF003 -> customer_address.ca_address_sk
+|  |     runtime filters: RF006 -> customer_address.ca_address_sk
 |  |
 |  16:EXCHANGE [HASH(substr(ca_zip, 1, 5))]
 |  |
 |  06:SCAN HDFS [tpcds.customer_address]
 |     partitions=1/1 files=1 size=5.25MB
 |     predicates: substr(ca_zip, 1, 5) IN ('89436', '30868', '65085', '22977', '83927', '77557', '58429', '40697', '80614', '10502', '32779', '91137', '61265', '98294', '17921', '18427', '21203', '59362', '87291', '84093', '21505', '17184', '10866', '67898', '25797', '28055', '18377', '80332', '74535', '21757', '29742', '90885', '29898', '17819', '40811', '25990', '47513', '89531', '91068', '10391', '18846', '99223', '82637', '41368', '83658', '86199', '81625', '26696', '89338', '88425', '32200', '81427', '19053', '77471', '36610', '99823', '43276', '41249', '48584', '83550', '82276', '18842', '78890', '14090', '38123', '40936', '34425', '19850', '43286', '80072', '79188', '54191', '11395', '50497', '84861', '90733', '21068', '57666', '37119', '25004', '57835', '70067', '62878', '95806', '19303', '18840', '19124', '29785', '16737', '16022', '49613', '89977', '68310', '60069', '98360', '48649', '39050', '41793', '25002', '27413', '39736', '47208', '16515', '94808', '57648', '15009', '80015', '42961', '63982', '21744', '71853', '81087', '67468', '34175', '64008', '20261', '11201', '51799', '48043', '45645', '61163', '48375', '36447', '57042', '21218', '41100', '89951', '22745', '35851', '83326', '61125', '78298', '80752', '49858', '52940', '96976', '63792', '11376', '53582', '18717', '90226', '50530', '94203', '99447', '27670', '96577', '57856', '56372', '16165', '23427', '54561', '28806', '44439', '22926', '30123', '61451', '92397', '56979', '92309', '70873', '13355', '21801', '46346', '37562', '56458', '28286', '47306', '99555', '69399', '26234', '47546', '49661', '88601', '35943', '39936', '25632', '24611', '44166', '56648', '30379', '59785', '11110', '14329', '93815', '52226', '71381', '13842', '25612', '63294', '14664', '21077', '82626', '18799', '60915', '81020', '56447', '76619', '11433', '13414', '42548', '92713', '70467', '30884', '47484', '16072', '38936', '13036', '88376', '45539', '35901', '19506', '65690', '73957', '71850', '49231', '14276', '20005', '18384', '76615', '11635', '38177', '55607', '41369', '95447', '58581', '58149', '91946', '33790', '76232', '75692', '95464', '22246', '51061', '56692', '53121', '77209', '15482', '10688', '14868', '45907', '73520', '72666', '25734', '17959', '24677', '66446', '94627', '53535', '15560', '41967', '69297', '11929', '59403', '33283', '52232', '57350', '43933', '40921', '36635', '10827', '71286', '19736', '80619', '25251', '95042', '15526', '36496', '55854', '49124', '81980', '35375', '49157', '63512', '28944', '14946', '36503', '54010', '18767', '23969', '43905', '66979', '33113', '21286', '58471', '59080', '13395', '79144', '70373', '67031', '38360', '26705', '50906', '52406', '26066', '73146', '15884', '31897', '30045', '61068', '45550', '92454', '13376', '14354', '19770', '22928', '97790', '50723', '46081', '30202', '14410', '20223', '88500', '67298', '13261', '14172', '81410', '93578', '83583', '46047', '94167', '82564', '21156', '15799', '86709', '37931', '74703', '83103', '23054', '70470', '72008', '49247', '91911', '69998', '20961', '70070', '63197', '54853', '88191', '91830', '49521', '19454', '81450', '89091', '62378', '25683', '61869', '51744', '36580', '85778', '36871', '48121', '28810', '83712', '45486', '67393', '26935', '42393', '20132', '55349', '86057', '21309', '80218', '10094', '11357', '48819', '39734', '40758', '30432', '21204', '29467', '30214', '61024', '55307', '74621', '11622', '68908', '33032', '52868', '99194', '99900', '84936', '69036', '99149', '45013', '32895', '59004', '32322', '14933', '32936', '33562', '72550', '27385', '58049', '58200', '16808', '21360', '32961', '18586', '79307', '15492')
-|     runtime filters: RF002 -> substr(ca_zip, 1, 5)
+|     runtime filters: RF004 -> substr(ca_zip, 1, 5)
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF001 <- store.s_store_sk
+|  runtime filters: RF002 <- store.s_store_sk
 |
 |--12:EXCHANGE [BROADCAST]
 |  |
@@ -558,7 +558,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=90/1824 files=90 size=9.71MB
-   runtime filters: RF001 -> store_sales.ss_store_sk
+   runtime filters: RF002 -> store_sales.ss_store_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -591,7 +591,7 @@ PLAN-ROOT SINK
 |  |
 |  07:HASH JOIN [RIGHT SEMI JOIN, PARTITIONED]
 |  |  hash predicates: substr(ca_zip, 1, 5) = substr(ca_zip, 1, 5)
-|  |  runtime filters: RF002 <- substr(ca_zip, 1, 5)
+|  |  runtime filters: RF004 <- substr(ca_zip, 1, 5)
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=01 plan-id=02 cohort-id=02
@@ -610,7 +610,7 @@ PLAN-ROOT SINK
 |  |  |
 |  |  04:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  |  hash predicates: customer_address.ca_address_sk = customer.c_current_addr_sk
-|  |  |  runtime filters: RF003 <- customer.c_current_addr_sk
+|  |  |  runtime filters: RF006 <- customer.c_current_addr_sk
 |  |  |
 |  |  |--JOIN BUILD
 |  |  |  |  join-table-id=02 plan-id=03 cohort-id=03
@@ -624,18 +624,18 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpcds.customer_address]
 |  |     partitions=1/1 files=1 size=5.25MB
-|  |     runtime filters: RF003 -> customer_address.ca_address_sk
+|  |     runtime filters: RF006 -> customer_address.ca_address_sk
 |  |
 |  16:EXCHANGE [HASH(substr(ca_zip, 1, 5))]
 |  |
 |  06:SCAN HDFS [tpcds.customer_address]
 |     partitions=1/1 files=1 size=5.25MB
 |     predicates: substr(ca_zip, 1, 5) IN ('89436', '30868', '65085', '22977', '83927', '77557', '58429', '40697', '80614', '10502', '32779', '91137', '61265', '98294', '17921', '18427', '21203', '59362', '87291', '84093', '21505', '17184', '10866', '67898', '25797', '28055', '18377', '80332', '74535', '21757', '29742', '90885', '29898', '17819', '40811', '25990', '47513', '89531', '91068', '10391', '18846', '99223', '82637', '41368', '83658', '86199', '81625', '26696', '89338', '88425', '32200', '81427', '19053', '77471', '36610', '99823', '43276', '41249', '48584', '83550', '82276', '18842', '78890', '14090', '38123', '40936', '34425', '19850', '43286', '80072', '79188', '54191', '11395', '50497', '84861', '90733', '21068', '57666', '37119', '25004', '57835', '70067', '62878', '95806', '19303', '18840', '19124', '29785', '16737', '16022', '49613', '89977', '68310', '60069', '98360', '48649', '39050', '41793', '25002', '27413', '39736', '47208', '16515', '94808', '57648', '15009', '80015', '42961', '63982', '21744', '71853', '81087', '67468', '34175', '64008', '20261', '11201', '51799', '48043', '45645', '61163', '48375', '36447', '57042', '21218', '41100', '89951', '22745', '35851', '83326', '61125', '78298', '80752', '49858', '52940', '96976', '63792', '11376', '53582', '18717', '90226', '50530', '94203', '99447', '27670', '96577', '57856', '56372', '16165', '23427', '54561', '28806', '44439', '22926', '30123', '61451', '92397', '56979', '92309', '70873', '13355', '21801', '46346', '37562', '56458', '28286', '47306', '99555', '69399', '26234', '47546', '49661', '88601', '35943', '39936', '25632', '24611', '44166', '56648', '30379', '59785', '11110', '14329', '93815', '52226', '71381', '13842', '25612', '63294', '14664', '21077', '82626', '18799', '60915', '81020', '56447', '76619', '11433', '13414', '42548', '92713', '70467', '30884', '47484', '16072', '38936', '13036', '88376', '45539', '35901', '19506', '65690', '73957', '71850', '49231', '14276', '20005', '18384', '76615', '11635', '38177', '55607', '41369', '95447', '58581', '58149', '91946', '33790', '76232', '75692', '95464', '22246', '51061', '56692', '53121', '77209', '15482', '10688', '14868', '45907', '73520', '72666', '25734', '17959', '24677', '66446', '94627', '53535', '15560', '41967', '69297', '11929', '59403', '33283', '52232', '57350', '43933', '40921', '36635', '10827', '71286', '19736', '80619', '25251', '95042', '15526', '36496', '55854', '49124', '81980', '35375', '49157', '63512', '28944', '14946', '36503', '54010', '18767', '23969', '43905', '66979', '33113', '21286', '58471', '59080', '13395', '79144', '70373', '67031', '38360', '26705', '50906', '52406', '26066', '73146', '15884', '31897', '30045', '61068', '45550', '92454', '13376', '14354', '19770', '22928', '97790', '50723', '46081', '30202', '14410', '20223', '88500', '67298', '13261', '14172', '81410', '93578', '83583', '46047', '94167', '82564', '21156', '15799', '86709', '37931', '74703', '83103', '23054', '70470', '72008', '49247', '91911', '69998', '20961', '70070', '63197', '54853', '88191', '91830', '49521', '19454', '81450', '89091', '62378', '25683', '61869', '51744', '36580', '85778', '36871', '48121', '28810', '83712', '45486', '67393', '26935', '42393', '20132', '55349', '86057', '21309', '80218', '10094', '11357', '48819', '39734', '40758', '30432', '21204', '29467', '30214', '61024', '55307', '74621', '11622', '68908', '33032', '52868', '99194', '99900', '84936', '69036', '99149', '45013', '32895', '59004', '32322', '14933', '32936', '33562', '72550', '27385', '58049', '58200', '16808', '21360', '32961', '18586', '79307', '15492')
-|     runtime filters: RF002 -> substr(ca_zip, 1, 5)
+|     runtime filters: RF004 -> substr(ca_zip, 1, 5)
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF001 <- store.s_store_sk
+|  runtime filters: RF002 <- store.s_store_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -649,7 +649,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=90/1824 files=90 size=9.71MB
-   runtime filters: RF001 -> store_sales.ss_store_sk
+   runtime filters: RF002 -> store_sales.ss_store_sk
 ====
 # TPCDS-Q19
 select
@@ -708,15 +708,15 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: ca_address_sk = c_current_addr_sk
-|  runtime filters: RF001 <- c_current_addr_sk
+|  runtime filters: RF002 <- c_current_addr_sk
 |
 |--08:HASH JOIN [INNER JOIN]
 |  |  hash predicates: c_customer_sk = ss_customer_sk
-|  |  runtime filters: RF002 <- ss_customer_sk
+|  |  runtime filters: RF004 <- ss_customer_sk
 |  |
 |  |--07:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: ss_sold_date_sk = d_date_sk
-|  |  |  runtime filters: RF003 <- d_date_sk
+|  |  |  runtime filters: RF006 <- d_date_sk
 |  |  |
 |  |  |--00:SCAN HDFS [tpcds.date_dim]
 |  |  |     partitions=1/1 files=1 size=9.84MB
@@ -724,7 +724,7 @@ PLAN-ROOT SINK
 |  |  |
 |  |  06:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: ss_item_sk = i_item_sk
-|  |  |  runtime filters: RF004 <- i_item_sk
+|  |  |  runtime filters: RF008 <- i_item_sk
 |  |  |
 |  |  |--02:SCAN HDFS [tpcds.item]
 |  |  |     partitions=1/1 files=1 size=4.82MB
@@ -732,15 +732,15 @@ PLAN-ROOT SINK
 |  |  |
 |  |  01:SCAN HDFS [tpcds.store_sales]
 |  |     partitions=30/1824 files=30 size=9.93MB
-|  |     runtime filters: RF000 -> ss_store_sk, RF003 -> ss_sold_date_sk, RF004 -> ss_item_sk
+|  |     runtime filters: RF000 -> ss_store_sk, RF006 -> ss_sold_date_sk, RF008 -> ss_item_sk
 |  |
 |  03:SCAN HDFS [tpcds.customer]
 |     partitions=1/1 files=1 size=12.60MB
-|     runtime filters: RF002 -> c_customer_sk
+|     runtime filters: RF004 -> c_customer_sk
 |
 04:SCAN HDFS [tpcds.customer_address]
    partitions=1/1 files=1 size=5.25MB
-   runtime filters: RF001 -> ca_address_sk
+   runtime filters: RF002 -> ca_address_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -773,7 +773,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: c_current_addr_sk = ca_address_sk
-|  runtime filters: RF001 <- ca_address_sk
+|  runtime filters: RF002 <- ca_address_sk
 |
 |--18:EXCHANGE [HASH(ca_address_sk)]
 |  |
@@ -784,19 +784,19 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: ss_customer_sk = c_customer_sk
-|  runtime filters: RF002 <- c_customer_sk
+|  runtime filters: RF004 <- c_customer_sk
 |
 |--16:EXCHANGE [HASH(c_customer_sk)]
 |  |
 |  03:SCAN HDFS [tpcds.customer]
 |     partitions=1/1 files=1 size=12.60MB
-|     runtime filters: RF001 -> c_current_addr_sk
+|     runtime filters: RF002 -> c_current_addr_sk
 |
 15:EXCHANGE [HASH(ss_customer_sk)]
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--14:EXCHANGE [BROADCAST]
 |  |
@@ -806,7 +806,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF004 <- i_item_sk
+|  runtime filters: RF008 <- i_item_sk
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
@@ -816,7 +816,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=30/1824 files=30 size=9.93MB
-   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_customer_sk, RF003 -> ss_sold_date_sk, RF004 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF004 -> ss_customer_sk, RF006 -> ss_sold_date_sk, RF008 -> ss_item_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -853,7 +853,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: c_current_addr_sk = ca_address_sk
-|  runtime filters: RF001 <- ca_address_sk
+|  runtime filters: RF002 <- ca_address_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -868,7 +868,7 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: ss_customer_sk = c_customer_sk
-|  runtime filters: RF002 <- c_customer_sk
+|  runtime filters: RF004 <- c_customer_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -878,13 +878,13 @@ PLAN-ROOT SINK
 |  |
 |  03:SCAN HDFS [tpcds.customer]
 |     partitions=1/1 files=1 size=12.60MB
-|     runtime filters: RF001 -> c_current_addr_sk
+|     runtime filters: RF002 -> c_current_addr_sk
 |
 15:EXCHANGE [HASH(ss_customer_sk)]
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -898,7 +898,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF004 <- i_item_sk
+|  runtime filters: RF008 <- i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -912,7 +912,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=30/1824 files=30 size=9.93MB
-   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_customer_sk, RF003 -> ss_sold_date_sk, RF004 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF004 -> ss_customer_sk, RF006 -> ss_sold_date_sk, RF008 -> ss_item_sk
 ====
 # TPCDS-Q27
 select
@@ -964,7 +964,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_store_sk = s_store_sk
-|  runtime filters: RF001 <- s_store_sk
+|  runtime filters: RF002 <- s_store_sk
 |
 |--03:SCAN HDFS [tpcds.store]
 |     partitions=1/1 files=1 size=3.08KB
@@ -972,7 +972,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_cdemo_sk = cd_demo_sk
-|  runtime filters: RF002 <- cd_demo_sk
+|  runtime filters: RF004 <- cd_demo_sk
 |
 |--01:SCAN HDFS [tpcds.customer_demographics]
 |     partitions=1/1 files=1 size=76.92MB
@@ -980,7 +980,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--02:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
@@ -988,7 +988,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_item_sk, RF001 -> ss_store_sk, RF002 -> ss_cdemo_sk, RF003 -> ss_sold_date_sk
+   runtime filters: RF000 -> ss_item_sk, RF002 -> ss_store_sk, RF004 -> ss_cdemo_sk, RF006 -> ss_sold_date_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1020,7 +1020,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_store_sk = s_store_sk
-|  runtime filters: RF001 <- s_store_sk
+|  runtime filters: RF002 <- s_store_sk
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
@@ -1030,7 +1030,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_cdemo_sk = cd_demo_sk
-|  runtime filters: RF002 <- cd_demo_sk
+|  runtime filters: RF004 <- cd_demo_sk
 |
 |--12:EXCHANGE [BROADCAST]
 |  |
@@ -1040,7 +1040,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--11:EXCHANGE [BROADCAST]
 |  |
@@ -1050,7 +1050,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_item_sk, RF001 -> ss_store_sk, RF002 -> ss_cdemo_sk, RF003 -> ss_sold_date_sk
+   runtime filters: RF000 -> ss_item_sk, RF002 -> ss_store_sk, RF004 -> ss_cdemo_sk, RF006 -> ss_sold_date_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -1086,7 +1086,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_store_sk = s_store_sk
-|  runtime filters: RF001 <- s_store_sk
+|  runtime filters: RF002 <- s_store_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -1100,7 +1100,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_cdemo_sk = cd_demo_sk
-|  runtime filters: RF002 <- cd_demo_sk
+|  runtime filters: RF004 <- cd_demo_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -1114,7 +1114,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -1128,7 +1128,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_item_sk, RF001 -> ss_store_sk, RF002 -> ss_cdemo_sk, RF003 -> ss_sold_date_sk
+   runtime filters: RF000 -> ss_item_sk, RF002 -> ss_store_sk, RF004 -> ss_cdemo_sk, RF006 -> ss_sold_date_sk
 ====
 # TPCDS-Q34
 select
@@ -1191,7 +1191,7 @@ PLAN-ROOT SINK
 |  |
 |  06:HASH JOIN [INNER JOIN]
 |  |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  |  runtime filters: RF001 <- store.s_store_sk
+|  |  runtime filters: RF002 <- store.s_store_sk
 |  |
 |  |--02:SCAN HDFS [tpcds.store]
 |  |     partitions=1/1 files=1 size=3.08KB
@@ -1199,7 +1199,7 @@ PLAN-ROOT SINK
 |  |
 |  05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  |  runtime filters: RF002 <- date_dim.d_date_sk
+|  |  runtime filters: RF004 <- date_dim.d_date_sk
 |  |
 |  |--01:SCAN HDFS [tpcds.date_dim]
 |  |     partitions=1/1 files=1 size=9.84MB
@@ -1207,7 +1207,7 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  |  runtime filters: RF003 <- household_demographics.hd_demo_sk
+|  |  runtime filters: RF006 <- household_demographics.hd_demo_sk
 |  |
 |  |--03:SCAN HDFS [tpcds.household_demographics]
 |  |     partitions=1/1 files=1 size=148.10KB
@@ -1215,7 +1215,7 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF001 -> store_sales.ss_store_sk, RF002 -> store_sales.ss_sold_date_sk, RF003 -> store_sales.ss_hdemo_sk
+|     runtime filters: RF002 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF006 -> store_sales.ss_hdemo_sk
 |
 08:SCAN HDFS [tpcds.customer]
    partitions=1/1 files=1 size=12.60MB
@@ -1254,7 +1254,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF001 <- store.s_store_sk
+|  runtime filters: RF002 <- store.s_store_sk
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
@@ -1264,7 +1264,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF002 <- date_dim.d_date_sk
+|  runtime filters: RF004 <- date_dim.d_date_sk
 |
 |--12:EXCHANGE [BROADCAST]
 |  |
@@ -1274,7 +1274,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF003 <- household_demographics.hd_demo_sk
+|  runtime filters: RF006 <- household_demographics.hd_demo_sk
 |
 |--11:EXCHANGE [BROADCAST]
 |  |
@@ -1284,7 +1284,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF001 -> store_sales.ss_store_sk, RF002 -> store_sales.ss_sold_date_sk, RF003 -> store_sales.ss_hdemo_sk
+   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF006 -> store_sales.ss_hdemo_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -1323,7 +1323,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF001 <- store.s_store_sk
+|  runtime filters: RF002 <- store.s_store_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -1337,7 +1337,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF002 <- date_dim.d_date_sk
+|  runtime filters: RF004 <- date_dim.d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -1351,7 +1351,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF003 <- household_demographics.hd_demo_sk
+|  runtime filters: RF006 <- household_demographics.hd_demo_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -1365,7 +1365,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF001 -> store_sales.ss_store_sk, RF002 -> store_sales.ss_sold_date_sk, RF003 -> store_sales.ss_hdemo_sk
+   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF006 -> store_sales.ss_hdemo_sk
 ====
 # TPCDS-Q42
 select
@@ -1413,7 +1413,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_item_sk = item.i_item_sk
-|  runtime filters: RF001 <- item.i_item_sk
+|  runtime filters: RF002 <- item.i_item_sk
 |
 |--02:SCAN HDFS [tpcds.item]
 |     partitions=1/1 files=1 size=4.82MB
@@ -1421,7 +1421,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF001 -> store_sales.ss_item_sk
+   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_item_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1454,7 +1454,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_item_sk = item.i_item_sk
-|  runtime filters: RF001 <- item.i_item_sk
+|  runtime filters: RF002 <- item.i_item_sk
 |
 |--07:EXCHANGE [BROADCAST]
 |  |
@@ -1464,7 +1464,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF001 -> store_sales.ss_item_sk
+   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_item_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -1501,7 +1501,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_item_sk = item.i_item_sk
-|  runtime filters: RF001 <- item.i_item_sk
+|  runtime filters: RF002 <- item.i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -1515,7 +1515,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF001 -> store_sales.ss_item_sk
+   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_item_sk
 ====
 # TPCDS-Q43
 select
@@ -1571,7 +1571,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--00:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
@@ -1579,7 +1579,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1612,7 +1612,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--07:EXCHANGE [BROADCAST]
 |  |
@@ -1622,7 +1622,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -1659,7 +1659,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -1673,7 +1673,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk
 ====
 # TPCDS-Q46
 select
@@ -1742,7 +1742,7 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_customer_sk = c_customer_sk
-|  runtime filters: RF001 <- c_customer_sk
+|  runtime filters: RF002 <- c_customer_sk
 |
 |--10:SCAN HDFS [tpcds.customer]
 |     partitions=1/1 files=1 size=12.60MB
@@ -1754,14 +1754,14 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_addr_sk = customer_address.ca_address_sk
-|  runtime filters: RF002 <- customer_address.ca_address_sk
+|  runtime filters: RF004 <- customer_address.ca_address_sk
 |
 |--04:SCAN HDFS [tpcds.customer_address]
 |     partitions=1/1 files=1 size=5.25MB
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF003 <- store.s_store_sk
+|  runtime filters: RF006 <- store.s_store_sk
 |
 |--02:SCAN HDFS [tpcds.store]
 |     partitions=1/1 files=1 size=3.08KB
@@ -1769,7 +1769,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF004 <- date_dim.d_date_sk
+|  runtime filters: RF008 <- date_dim.d_date_sk
 |
 |--01:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
@@ -1777,7 +1777,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF005 <- household_demographics.hd_demo_sk
+|  runtime filters: RF010 <- household_demographics.hd_demo_sk
 |
 |--03:SCAN HDFS [tpcds.household_demographics]
 |     partitions=1/1 files=1 size=148.10KB
@@ -1785,7 +1785,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF001 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_addr_sk, RF003 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF005 -> store_sales.ss_hdemo_sk
+   runtime filters: RF002 -> tpcds.store_sales.ss_customer_sk, RF004 -> store_sales.ss_addr_sk, RF006 -> store_sales.ss_store_sk, RF008 -> store_sales.ss_sold_date_sk, RF010 -> store_sales.ss_hdemo_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1808,7 +1808,7 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: ss_customer_sk = c_customer_sk
-|  runtime filters: RF001 <- c_customer_sk
+|  runtime filters: RF002 <- c_customer_sk
 |
 |--22:EXCHANGE [HASH(c_customer_sk)]
 |  |
@@ -1830,7 +1830,7 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_addr_sk = customer_address.ca_address_sk
-|  runtime filters: RF002 <- customer_address.ca_address_sk
+|  runtime filters: RF004 <- customer_address.ca_address_sk
 |
 |--18:EXCHANGE [BROADCAST]
 |  |
@@ -1839,7 +1839,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF003 <- store.s_store_sk
+|  runtime filters: RF006 <- store.s_store_sk
 |
 |--17:EXCHANGE [BROADCAST]
 |  |
@@ -1849,7 +1849,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF004 <- date_dim.d_date_sk
+|  runtime filters: RF008 <- date_dim.d_date_sk
 |
 |--16:EXCHANGE [BROADCAST]
 |  |
@@ -1859,7 +1859,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF005 <- household_demographics.hd_demo_sk
+|  runtime filters: RF010 <- household_demographics.hd_demo_sk
 |
 |--15:EXCHANGE [BROADCAST]
 |  |
@@ -1869,7 +1869,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF001 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_addr_sk, RF003 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF005 -> store_sales.ss_hdemo_sk
+   runtime filters: RF002 -> tpcds.store_sales.ss_customer_sk, RF004 -> store_sales.ss_addr_sk, RF006 -> store_sales.ss_store_sk, RF008 -> store_sales.ss_sold_date_sk, RF010 -> store_sales.ss_hdemo_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -1896,7 +1896,7 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: ss_customer_sk = c_customer_sk
-|  runtime filters: RF001 <- c_customer_sk
+|  runtime filters: RF002 <- c_customer_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -1922,7 +1922,7 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_addr_sk = customer_address.ca_address_sk
-|  runtime filters: RF002 <- customer_address.ca_address_sk
+|  runtime filters: RF004 <- customer_address.ca_address_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -1935,7 +1935,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF003 <- store.s_store_sk
+|  runtime filters: RF006 <- store.s_store_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -1949,7 +1949,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF004 <- date_dim.d_date_sk
+|  runtime filters: RF008 <- date_dim.d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -1963,7 +1963,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF005 <- household_demographics.hd_demo_sk
+|  runtime filters: RF010 <- household_demographics.hd_demo_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=05 plan-id=06 cohort-id=01
@@ -1977,7 +1977,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF001 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_addr_sk, RF003 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF005 -> store_sales.ss_hdemo_sk
+   runtime filters: RF002 -> tpcds.store_sales.ss_customer_sk, RF004 -> store_sales.ss_addr_sk, RF006 -> store_sales.ss_store_sk, RF008 -> store_sales.ss_sold_date_sk, RF010 -> store_sales.ss_hdemo_sk
 ====
 # TPCDS-Q52
 select
@@ -2024,7 +2024,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_item_sk = item.i_item_sk
-|  runtime filters: RF001 <- item.i_item_sk
+|  runtime filters: RF002 <- item.i_item_sk
 |
 |--02:SCAN HDFS [tpcds.item]
 |     partitions=1/1 files=1 size=4.82MB
@@ -2032,7 +2032,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF001 -> store_sales.ss_item_sk
+   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_item_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -2065,7 +2065,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_item_sk = item.i_item_sk
-|  runtime filters: RF001 <- item.i_item_sk
+|  runtime filters: RF002 <- item.i_item_sk
 |
 |--07:EXCHANGE [BROADCAST]
 |  |
@@ -2075,7 +2075,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF001 -> store_sales.ss_item_sk
+   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_item_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -2112,7 +2112,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_item_sk = item.i_item_sk
-|  runtime filters: RF001 <- item.i_item_sk
+|  runtime filters: RF002 <- item.i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -2126,7 +2126,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF001 -> store_sales.ss_item_sk
+   runtime filters: RF000 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_item_sk
 ====
 # TPCDS-Q53
 select
@@ -2178,7 +2178,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--02:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
@@ -2186,7 +2186,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF002 <- i_item_sk
+|  runtime filters: RF004 <- i_item_sk
 |
 |--00:SCAN HDFS [tpcds.item]
 |     partitions=1/1 files=1 size=4.82MB
@@ -2194,7 +2194,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk, RF002 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk, RF004 -> ss_item_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -2226,7 +2226,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--10:EXCHANGE [BROADCAST]
 |  |
@@ -2236,7 +2236,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF002 <- i_item_sk
+|  runtime filters: RF004 <- i_item_sk
 |
 |--09:EXCHANGE [BROADCAST]
 |  |
@@ -2246,7 +2246,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk, RF002 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk, RF004 -> ss_item_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -2282,7 +2282,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -2296,7 +2296,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF002 <- i_item_sk
+|  runtime filters: RF004 <- i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -2310,7 +2310,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk, RF002 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk, RF004 -> ss_item_sk
 ====
 # TPCDS-Q55
 select
@@ -2354,7 +2354,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF001 <- i_item_sk
+|  runtime filters: RF002 <- i_item_sk
 |
 |--02:SCAN HDFS [tpcds.item]
 |     partitions=1/1 files=1 size=4.82MB
@@ -2362,7 +2362,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_sold_date_sk, RF001 -> ss_item_sk
+   runtime filters: RF000 -> ss_sold_date_sk, RF002 -> ss_item_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -2395,7 +2395,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF001 <- i_item_sk
+|  runtime filters: RF002 <- i_item_sk
 |
 |--07:EXCHANGE [BROADCAST]
 |  |
@@ -2405,7 +2405,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_sold_date_sk, RF001 -> ss_item_sk
+   runtime filters: RF000 -> ss_sold_date_sk, RF002 -> ss_item_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -2442,7 +2442,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF001 <- i_item_sk
+|  runtime filters: RF002 <- i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -2456,7 +2456,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_sold_date_sk, RF001 -> ss_item_sk
+   runtime filters: RF000 -> ss_sold_date_sk, RF002 -> ss_item_sk
 ====
 # TPCDS-Q59
 with
@@ -2552,14 +2552,14 @@ PLAN-ROOT SINK
 |
 |--15:HASH JOIN [INNER JOIN]
 |  |  hash predicates: ss_store_sk = s_store_sk
-|  |  runtime filters: RF005 <- s_store_sk
+|  |  runtime filters: RF010 <- s_store_sk
 |  |
 |  |--12:SCAN HDFS [tpcds.store]
 |  |     partitions=1/1 files=1 size=3.08KB
 |  |
 |  14:HASH JOIN [INNER JOIN]
 |  |  hash predicates: d_week_seq = d.d_week_seq
-|  |  runtime filters: RF006 <- d.d_week_seq
+|  |  runtime filters: RF012 <- d.d_week_seq
 |  |
 |  |--13:SCAN HDFS [tpcds.date_dim d]
 |  |     partitions=1/1 files=1 size=9.84MB
@@ -2571,19 +2571,19 @@ PLAN-ROOT SINK
 |  |
 |  10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: ss_sold_date_sk = d_date_sk
-|  |  runtime filters: RF007 <- d_date_sk
+|  |  runtime filters: RF014 <- d_date_sk
 |  |
 |  |--09:SCAN HDFS [tpcds.date_dim]
 |  |     partitions=1/1 files=1 size=9.84MB
-|  |     runtime filters: RF006 -> tpcds.date_dim.d_week_seq
+|  |     runtime filters: RF012 -> tpcds.date_dim.d_week_seq
 |  |
 |  08:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF005 -> tpcds.store_sales.ss_store_sk, RF007 -> ss_sold_date_sk
+|     runtime filters: RF010 -> tpcds.store_sales.ss_store_sk, RF014 -> ss_sold_date_sk
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_store_sk = s_store_sk
-|  runtime filters: RF002 <- s_store_sk
+|  runtime filters: RF004 <- s_store_sk
 |
 |--04:SCAN HDFS [tpcds.store]
 |     partitions=1/1 files=1 size=3.08KB
@@ -2591,7 +2591,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: d_week_seq = d.d_week_seq
-|  runtime filters: RF003 <- d.d_week_seq
+|  runtime filters: RF006 <- d.d_week_seq
 |
 |--05:SCAN HDFS [tpcds.date_dim d]
 |     partitions=1/1 files=1 size=9.84MB
@@ -2604,15 +2604,15 @@ PLAN-ROOT SINK
 |
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF004 <- d_date_sk
+|  runtime filters: RF008 <- d_date_sk
 |
 |--01:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
-|     runtime filters: RF000 -> tpcds.date_dim.d_week_seq, RF003 -> tpcds.date_dim.d_week_seq
+|     runtime filters: RF000 -> tpcds.date_dim.d_week_seq, RF006 -> tpcds.date_dim.d_week_seq
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF002 -> tpcds.store_sales.ss_store_sk, RF004 -> ss_sold_date_sk
+   runtime filters: RF004 -> tpcds.store_sales.ss_store_sk, RF008 -> ss_sold_date_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -2631,7 +2631,7 @@ PLAN-ROOT SINK
 |  |
 |  15:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: ss_store_sk = s_store_sk
-|  |  runtime filters: RF005 <- s_store_sk
+|  |  runtime filters: RF010 <- s_store_sk
 |  |
 |  |--27:EXCHANGE [BROADCAST]
 |  |  |
@@ -2640,7 +2640,7 @@ PLAN-ROOT SINK
 |  |
 |  14:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: d_week_seq = d.d_week_seq
-|  |  runtime filters: RF006 <- d.d_week_seq
+|  |  runtime filters: RF012 <- d.d_week_seq
 |  |
 |  |--26:EXCHANGE [BROADCAST]
 |  |  |
@@ -2660,23 +2660,23 @@ PLAN-ROOT SINK
 |  |
 |  10:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: ss_sold_date_sk = d_date_sk
-|  |  runtime filters: RF007 <- d_date_sk
+|  |  runtime filters: RF014 <- d_date_sk
 |  |
 |  |--23:EXCHANGE [BROADCAST]
 |  |  |
 |  |  09:SCAN HDFS [tpcds.date_dim]
 |  |     partitions=1/1 files=1 size=9.84MB
-|  |     runtime filters: RF006 -> tpcds.date_dim.d_week_seq
+|  |     runtime filters: RF012 -> tpcds.date_dim.d_week_seq
 |  |
 |  08:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF005 -> tpcds.store_sales.ss_store_sk, RF007 -> ss_sold_date_sk
+|     runtime filters: RF010 -> tpcds.store_sales.ss_store_sk, RF014 -> ss_sold_date_sk
 |
 28:EXCHANGE [HASH(d_week_seq,s_store_id)]
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_store_sk = s_store_sk
-|  runtime filters: RF002 <- s_store_sk
+|  runtime filters: RF004 <- s_store_sk
 |
 |--22:EXCHANGE [BROADCAST]
 |  |
@@ -2686,7 +2686,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: d_week_seq = d.d_week_seq
-|  runtime filters: RF003 <- d.d_week_seq
+|  runtime filters: RF006 <- d.d_week_seq
 |
 |--21:EXCHANGE [BROADCAST]
 |  |
@@ -2707,17 +2707,17 @@ PLAN-ROOT SINK
 |
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF004 <- d_date_sk
+|  runtime filters: RF008 <- d_date_sk
 |
 |--18:EXCHANGE [BROADCAST]
 |  |
 |  01:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
-|     runtime filters: RF000 -> tpcds.date_dim.d_week_seq, RF003 -> tpcds.date_dim.d_week_seq
+|     runtime filters: RF000 -> tpcds.date_dim.d_week_seq, RF006 -> tpcds.date_dim.d_week_seq
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF002 -> tpcds.store_sales.ss_store_sk, RF004 -> ss_sold_date_sk
+   runtime filters: RF004 -> tpcds.store_sales.ss_store_sk, RF008 -> ss_sold_date_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -2740,7 +2740,7 @@ PLAN-ROOT SINK
 |  |
 |  15:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: ss_store_sk = s_store_sk
-|  |  runtime filters: RF005 <- s_store_sk
+|  |  runtime filters: RF010 <- s_store_sk
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=01 plan-id=02 cohort-id=02
@@ -2753,7 +2753,7 @@ PLAN-ROOT SINK
 |  |
 |  14:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: d_week_seq = d.d_week_seq
-|  |  runtime filters: RF006 <- d.d_week_seq
+|  |  runtime filters: RF012 <- d.d_week_seq
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=02 plan-id=03 cohort-id=02
@@ -2777,7 +2777,7 @@ PLAN-ROOT SINK
 |  |
 |  10:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: ss_sold_date_sk = d_date_sk
-|  |  runtime filters: RF007 <- d_date_sk
+|  |  runtime filters: RF014 <- d_date_sk
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=03 plan-id=04 cohort-id=02
@@ -2787,17 +2787,17 @@ PLAN-ROOT SINK
 |  |  |
 |  |  09:SCAN HDFS [tpcds.date_dim]
 |  |     partitions=1/1 files=1 size=9.84MB
-|  |     runtime filters: RF006 -> tpcds.date_dim.d_week_seq
+|  |     runtime filters: RF012 -> tpcds.date_dim.d_week_seq
 |  |
 |  08:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF005 -> tpcds.store_sales.ss_store_sk, RF007 -> ss_sold_date_sk
+|     runtime filters: RF010 -> tpcds.store_sales.ss_store_sk, RF014 -> ss_sold_date_sk
 |
 28:EXCHANGE [HASH(d_week_seq,s_store_id)]
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_store_sk = s_store_sk
-|  runtime filters: RF002 <- s_store_sk
+|  runtime filters: RF004 <- s_store_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -2811,7 +2811,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: d_week_seq = d.d_week_seq
-|  runtime filters: RF003 <- d.d_week_seq
+|  runtime filters: RF006 <- d.d_week_seq
 |
 |--JOIN BUILD
 |  |  join-table-id=05 plan-id=06 cohort-id=01
@@ -2836,7 +2836,7 @@ PLAN-ROOT SINK
 |
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF004 <- d_date_sk
+|  runtime filters: RF008 <- d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=06 plan-id=07 cohort-id=01
@@ -2846,11 +2846,11 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
-|     runtime filters: RF000 -> tpcds.date_dim.d_week_seq, RF003 -> tpcds.date_dim.d_week_seq
+|     runtime filters: RF000 -> tpcds.date_dim.d_week_seq, RF006 -> tpcds.date_dim.d_week_seq
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF002 -> tpcds.store_sales.ss_store_sk, RF004 -> ss_sold_date_sk
+   runtime filters: RF004 -> tpcds.store_sales.ss_store_sk, RF008 -> ss_sold_date_sk
 ====
 # TPCDS-Q63
 select
@@ -2917,7 +2917,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--02:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
@@ -2925,7 +2925,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF002 <- i_item_sk
+|  runtime filters: RF004 <- i_item_sk
 |
 |--00:SCAN HDFS [tpcds.item]
 |     partitions=1/1 files=1 size=4.82MB
@@ -2933,7 +2933,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=364/1824 files=364 size=62.92MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk, RF002 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk, RF004 -> ss_item_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -2975,7 +2975,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
@@ -2985,7 +2985,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF002 <- i_item_sk
+|  runtime filters: RF004 <- i_item_sk
 |
 |--12:EXCHANGE [BROADCAST]
 |  |
@@ -2995,7 +2995,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=364/1824 files=364 size=62.92MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk, RF002 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk, RF004 -> ss_item_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -3041,7 +3041,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -3055,7 +3055,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF002 <- i_item_sk
+|  runtime filters: RF004 <- i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -3069,7 +3069,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=364/1824 files=364 size=62.92MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk, RF002 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk, RF004 -> ss_item_sk
 ====
 # TPCDS-Q65
 select
@@ -3147,7 +3147,7 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: ss_sold_date_sk = d_date_sk
-|  |  runtime filters: RF004 <- d_date_sk
+|  |  runtime filters: RF008 <- d_date_sk
 |  |
 |  |--03:SCAN HDFS [tpcds.date_dim]
 |  |     partitions=1/1 files=1 size=9.84MB
@@ -3155,18 +3155,18 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF004 -> ss_sold_date_sk
+|     runtime filters: RF008 -> ss_sold_date_sk
 |
 12:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF001 <- i_item_sk
+|  runtime filters: RF002 <- i_item_sk
 |
 |--01:SCAN HDFS [tpcds.item]
 |     partitions=1/1 files=1 size=4.82MB
 |
 11:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_store_sk = s_store_sk
-|  runtime filters: RF002 <- s_store_sk
+|  runtime filters: RF004 <- s_store_sk
 |
 |--00:SCAN HDFS [tpcds.store]
 |     partitions=1/1 files=1 size=3.08KB
@@ -3178,7 +3178,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--08:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
@@ -3186,7 +3186,7 @@ PLAN-ROOT SINK
 |
 07:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> tpcds.store_sales.ss_store_sk, RF001 -> tpcds.store_sales.ss_item_sk, RF002 -> tpcds.store_sales.ss_store_sk, RF003 -> ss_sold_date_sk
+   runtime filters: RF000 -> tpcds.store_sales.ss_store_sk, RF002 -> tpcds.store_sales.ss_item_sk, RF004 -> tpcds.store_sales.ss_store_sk, RF006 -> ss_sold_date_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -3226,7 +3226,7 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: ss_sold_date_sk = d_date_sk
-|  |  runtime filters: RF004 <- d_date_sk
+|  |  runtime filters: RF008 <- d_date_sk
 |  |
 |  |--21:EXCHANGE [BROADCAST]
 |  |  |
@@ -3236,11 +3236,11 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF004 -> ss_sold_date_sk
+|     runtime filters: RF008 -> ss_sold_date_sk
 |
 12:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF001 <- i_item_sk
+|  runtime filters: RF002 <- i_item_sk
 |
 |--20:EXCHANGE [HASH(i_item_sk)]
 |  |
@@ -3251,7 +3251,7 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_store_sk = s_store_sk
-|  runtime filters: RF002 <- s_store_sk
+|  runtime filters: RF004 <- s_store_sk
 |
 |--18:EXCHANGE [BROADCAST]
 |  |
@@ -3271,7 +3271,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--15:EXCHANGE [BROADCAST]
 |  |
@@ -3281,7 +3281,7 @@ PLAN-ROOT SINK
 |
 07:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> tpcds.store_sales.ss_store_sk, RF001 -> tpcds.store_sales.ss_item_sk, RF002 -> tpcds.store_sales.ss_store_sk, RF003 -> ss_sold_date_sk
+   runtime filters: RF000 -> tpcds.store_sales.ss_store_sk, RF002 -> tpcds.store_sales.ss_item_sk, RF004 -> tpcds.store_sales.ss_store_sk, RF006 -> ss_sold_date_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -3325,7 +3325,7 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: ss_sold_date_sk = d_date_sk
-|  |  runtime filters: RF004 <- d_date_sk
+|  |  runtime filters: RF008 <- d_date_sk
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=01 plan-id=02 cohort-id=02
@@ -3339,11 +3339,11 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF004 -> ss_sold_date_sk
+|     runtime filters: RF008 -> ss_sold_date_sk
 |
 12:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF001 <- i_item_sk
+|  runtime filters: RF002 <- i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -3358,7 +3358,7 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_store_sk = s_store_sk
-|  runtime filters: RF002 <- s_store_sk
+|  runtime filters: RF004 <- s_store_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -3382,7 +3382,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF003 <- d_date_sk
+|  runtime filters: RF006 <- d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -3396,7 +3396,7 @@ PLAN-ROOT SINK
 |
 07:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> tpcds.store_sales.ss_store_sk, RF001 -> tpcds.store_sales.ss_item_sk, RF002 -> tpcds.store_sales.ss_store_sk, RF003 -> ss_sold_date_sk
+   runtime filters: RF000 -> tpcds.store_sales.ss_store_sk, RF002 -> tpcds.store_sales.ss_item_sk, RF004 -> tpcds.store_sales.ss_store_sk, RF006 -> ss_sold_date_sk
 ====
 # TPCDS-Q68
 select
@@ -3463,7 +3463,7 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_customer_sk = c_customer_sk
-|  runtime filters: RF001 <- c_customer_sk
+|  runtime filters: RF002 <- c_customer_sk
 |
 |--10:SCAN HDFS [tpcds.customer]
 |     partitions=1/1 files=1 size=12.60MB
@@ -3475,14 +3475,14 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_addr_sk = customer_address.ca_address_sk
-|  runtime filters: RF002 <- customer_address.ca_address_sk
+|  runtime filters: RF004 <- customer_address.ca_address_sk
 |
 |--04:SCAN HDFS [tpcds.customer_address]
 |     partitions=1/1 files=1 size=5.25MB
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF003 <- store.s_store_sk
+|  runtime filters: RF006 <- store.s_store_sk
 |
 |--02:SCAN HDFS [tpcds.store]
 |     partitions=1/1 files=1 size=3.08KB
@@ -3490,7 +3490,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF004 <- date_dim.d_date_sk
+|  runtime filters: RF008 <- date_dim.d_date_sk
 |
 |--01:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
@@ -3498,7 +3498,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF005 <- household_demographics.hd_demo_sk
+|  runtime filters: RF010 <- household_demographics.hd_demo_sk
 |
 |--03:SCAN HDFS [tpcds.household_demographics]
 |     partitions=1/1 files=1 size=148.10KB
@@ -3506,7 +3506,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF001 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_addr_sk, RF003 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF005 -> store_sales.ss_hdemo_sk
+   runtime filters: RF002 -> tpcds.store_sales.ss_customer_sk, RF004 -> store_sales.ss_addr_sk, RF006 -> store_sales.ss_store_sk, RF008 -> store_sales.ss_sold_date_sk, RF010 -> store_sales.ss_hdemo_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -3529,7 +3529,7 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_customer_sk = c_customer_sk
-|  runtime filters: RF001 <- c_customer_sk
+|  runtime filters: RF002 <- c_customer_sk
 |
 |--21:EXCHANGE [BROADCAST]
 |  |
@@ -3549,7 +3549,7 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_addr_sk = customer_address.ca_address_sk
-|  runtime filters: RF002 <- customer_address.ca_address_sk
+|  runtime filters: RF004 <- customer_address.ca_address_sk
 |
 |--18:EXCHANGE [BROADCAST]
 |  |
@@ -3558,7 +3558,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF003 <- store.s_store_sk
+|  runtime filters: RF006 <- store.s_store_sk
 |
 |--17:EXCHANGE [BROADCAST]
 |  |
@@ -3568,7 +3568,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF004 <- date_dim.d_date_sk
+|  runtime filters: RF008 <- date_dim.d_date_sk
 |
 |--16:EXCHANGE [BROADCAST]
 |  |
@@ -3578,7 +3578,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF005 <- household_demographics.hd_demo_sk
+|  runtime filters: RF010 <- household_demographics.hd_demo_sk
 |
 |--15:EXCHANGE [BROADCAST]
 |  |
@@ -3588,7 +3588,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF001 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_addr_sk, RF003 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF005 -> store_sales.ss_hdemo_sk
+   runtime filters: RF002 -> tpcds.store_sales.ss_customer_sk, RF004 -> store_sales.ss_addr_sk, RF006 -> store_sales.ss_store_sk, RF008 -> store_sales.ss_sold_date_sk, RF010 -> store_sales.ss_hdemo_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -3615,7 +3615,7 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_customer_sk = c_customer_sk
-|  runtime filters: RF001 <- c_customer_sk
+|  runtime filters: RF002 <- c_customer_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -3639,7 +3639,7 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_addr_sk = customer_address.ca_address_sk
-|  runtime filters: RF002 <- customer_address.ca_address_sk
+|  runtime filters: RF004 <- customer_address.ca_address_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -3652,7 +3652,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF003 <- store.s_store_sk
+|  runtime filters: RF006 <- store.s_store_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -3666,7 +3666,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF004 <- date_dim.d_date_sk
+|  runtime filters: RF008 <- date_dim.d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -3680,7 +3680,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF005 <- household_demographics.hd_demo_sk
+|  runtime filters: RF010 <- household_demographics.hd_demo_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=05 plan-id=06 cohort-id=01
@@ -3694,7 +3694,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF001 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_addr_sk, RF003 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF005 -> store_sales.ss_hdemo_sk
+   runtime filters: RF002 -> tpcds.store_sales.ss_customer_sk, RF004 -> store_sales.ss_addr_sk, RF006 -> store_sales.ss_store_sk, RF008 -> store_sales.ss_sold_date_sk, RF010 -> store_sales.ss_hdemo_sk
 ====
 # TPCDS-Q73
 select
@@ -3751,7 +3751,7 @@ PLAN-ROOT SINK
 |  |
 |  06:HASH JOIN [INNER JOIN]
 |  |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  |  runtime filters: RF001 <- store.s_store_sk
+|  |  runtime filters: RF002 <- store.s_store_sk
 |  |
 |  |--02:SCAN HDFS [tpcds.store]
 |  |     partitions=1/1 files=1 size=3.08KB
@@ -3759,14 +3759,14 @@ PLAN-ROOT SINK
 |  |
 |  05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  |  runtime filters: RF002 <- date_dim.d_date_sk
+|  |  runtime filters: RF004 <- date_dim.d_date_sk
 |  |
 |  |--01:SCAN HDFS [tpcds.date_dim]
 |  |     partitions=1/1 files=1 size=9.84MB
 |  |
 |  04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  |  runtime filters: RF003 <- household_demographics.hd_demo_sk
+|  |  runtime filters: RF006 <- household_demographics.hd_demo_sk
 |  |
 |  |--03:SCAN HDFS [tpcds.household_demographics]
 |  |     partitions=1/1 files=1 size=148.10KB
@@ -3774,7 +3774,7 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF001 -> store_sales.ss_store_sk, RF002 -> store_sales.ss_sold_date_sk, RF003 -> store_sales.ss_hdemo_sk
+|     runtime filters: RF002 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF006 -> store_sales.ss_hdemo_sk
 |
 08:SCAN HDFS [tpcds.customer]
    partitions=1/1 files=1 size=12.60MB
@@ -3813,7 +3813,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF001 <- store.s_store_sk
+|  runtime filters: RF002 <- store.s_store_sk
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
@@ -3823,7 +3823,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF002 <- date_dim.d_date_sk
+|  runtime filters: RF004 <- date_dim.d_date_sk
 |
 |--12:EXCHANGE [BROADCAST]
 |  |
@@ -3832,7 +3832,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF003 <- household_demographics.hd_demo_sk
+|  runtime filters: RF006 <- household_demographics.hd_demo_sk
 |
 |--11:EXCHANGE [BROADCAST]
 |  |
@@ -3842,7 +3842,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF001 -> store_sales.ss_store_sk, RF002 -> store_sales.ss_sold_date_sk, RF003 -> store_sales.ss_hdemo_sk
+   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF006 -> store_sales.ss_hdemo_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -3881,7 +3881,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF001 <- store.s_store_sk
+|  runtime filters: RF002 <- store.s_store_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -3895,7 +3895,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF002 <- date_dim.d_date_sk
+|  runtime filters: RF004 <- date_dim.d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -3908,7 +3908,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF003 <- household_demographics.hd_demo_sk
+|  runtime filters: RF006 <- household_demographics.hd_demo_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -3922,7 +3922,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF001 -> store_sales.ss_store_sk, RF002 -> store_sales.ss_sold_date_sk, RF003 -> store_sales.ss_hdemo_sk
+   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_store_sk, RF004 -> store_sales.ss_sold_date_sk, RF006 -> store_sales.ss_hdemo_sk
 ====
 # TPCDS-Q79
 select
@@ -3983,7 +3983,7 @@ PLAN-ROOT SINK
 |  |
 |  06:HASH JOIN [INNER JOIN]
 |  |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  |  runtime filters: RF001 <- date_dim.d_date_sk
+|  |  runtime filters: RF002 <- date_dim.d_date_sk
 |  |
 |  |--01:SCAN HDFS [tpcds.date_dim]
 |  |     partitions=1/1 files=1 size=9.84MB
@@ -3991,7 +3991,7 @@ PLAN-ROOT SINK
 |  |
 |  05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  |  runtime filters: RF002 <- store.s_store_sk
+|  |  runtime filters: RF004 <- store.s_store_sk
 |  |
 |  |--02:SCAN HDFS [tpcds.store]
 |  |     partitions=1/1 files=1 size=3.08KB
@@ -3999,7 +3999,7 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  |  runtime filters: RF003 <- household_demographics.hd_demo_sk
+|  |  runtime filters: RF006 <- household_demographics.hd_demo_sk
 |  |
 |  |--03:SCAN HDFS [tpcds.household_demographics]
 |  |     partitions=1/1 files=1 size=148.10KB
@@ -4007,7 +4007,7 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF001 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_store_sk, RF003 -> store_sales.ss_hdemo_sk
+|     runtime filters: RF002 -> store_sales.ss_sold_date_sk, RF004 -> store_sales.ss_store_sk, RF006 -> store_sales.ss_hdemo_sk
 |
 08:SCAN HDFS [tpcds.customer]
    partitions=1/1 files=1 size=12.60MB
@@ -4045,7 +4045,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF001 <- date_dim.d_date_sk
+|  runtime filters: RF002 <- date_dim.d_date_sk
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
@@ -4055,7 +4055,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF002 <- store.s_store_sk
+|  runtime filters: RF004 <- store.s_store_sk
 |
 |--12:EXCHANGE [BROADCAST]
 |  |
@@ -4065,7 +4065,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF003 <- household_demographics.hd_demo_sk
+|  runtime filters: RF006 <- household_demographics.hd_demo_sk
 |
 |--11:EXCHANGE [BROADCAST]
 |  |
@@ -4075,7 +4075,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF001 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_store_sk, RF003 -> store_sales.ss_hdemo_sk
+   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_sold_date_sk, RF004 -> store_sales.ss_store_sk, RF006 -> store_sales.ss_hdemo_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -4113,7 +4113,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_sold_date_sk = date_dim.d_date_sk
-|  runtime filters: RF001 <- date_dim.d_date_sk
+|  runtime filters: RF002 <- date_dim.d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -4127,7 +4127,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_store_sk = store.s_store_sk
-|  runtime filters: RF002 <- store.s_store_sk
+|  runtime filters: RF004 <- store.s_store_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -4141,7 +4141,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk
-|  runtime filters: RF003 <- household_demographics.hd_demo_sk
+|  runtime filters: RF006 <- household_demographics.hd_demo_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -4155,7 +4155,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF001 -> store_sales.ss_sold_date_sk, RF002 -> store_sales.ss_store_sk, RF003 -> store_sales.ss_hdemo_sk
+   runtime filters: RF000 -> tpcds.store_sales.ss_customer_sk, RF002 -> store_sales.ss_sold_date_sk, RF004 -> store_sales.ss_store_sk, RF006 -> store_sales.ss_hdemo_sk
 ====
 # TPCDS-Q89
 select * from (select  *
@@ -4212,7 +4212,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--02:SCAN HDFS [tpcds.date_dim]
 |     partitions=1/1 files=1 size=9.84MB
@@ -4220,7 +4220,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF002 <- i_item_sk
+|  runtime filters: RF004 <- i_item_sk
 |
 |--00:SCAN HDFS [tpcds.item]
 |     partitions=1/1 files=1 size=4.82MB
@@ -4228,7 +4228,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk, RF002 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk, RF004 -> ss_item_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -4270,7 +4270,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
@@ -4280,7 +4280,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF002 <- i_item_sk
+|  runtime filters: RF004 <- i_item_sk
 |
 |--12:EXCHANGE [BROADCAST]
 |  |
@@ -4290,7 +4290,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk, RF002 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk, RF004 -> ss_item_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -4336,7 +4336,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_sold_date_sk = d_date_sk
-|  runtime filters: RF001 <- d_date_sk
+|  runtime filters: RF002 <- d_date_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -4350,7 +4350,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF002 <- i_item_sk
+|  runtime filters: RF004 <- i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -4364,7 +4364,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_store_sk, RF001 -> ss_sold_date_sk, RF002 -> ss_item_sk
+   runtime filters: RF000 -> ss_store_sk, RF002 -> ss_sold_date_sk, RF004 -> ss_item_sk
 ====
 # TPCDS-Q96
 SELECT
@@ -4397,7 +4397,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: ss.ss_hdemo_sk = hd.hd_demo_sk
-|  runtime filters: RF001 <- hd.hd_demo_sk
+|  runtime filters: RF002 <- hd.hd_demo_sk
 |
 |--02:SCAN HDFS [tpcds.household_demographics hd]
 |     partitions=1/1 files=1 size=148.10KB
@@ -4405,7 +4405,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: ss.ss_sold_time_sk = td.t_time_sk
-|  runtime filters: RF002 <- td.t_time_sk
+|  runtime filters: RF004 <- td.t_time_sk
 |
 |--01:SCAN HDFS [tpcds.time_dim td]
 |     partitions=1/1 files=1 size=4.87MB
@@ -4413,7 +4413,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales ss]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss.ss_store_sk, RF001 -> ss.ss_hdemo_sk, RF002 -> ss.ss_sold_time_sk
+   runtime filters: RF000 -> ss.ss_store_sk, RF002 -> ss.ss_hdemo_sk, RF004 -> ss.ss_sold_time_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -4437,7 +4437,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss.ss_hdemo_sk = hd.hd_demo_sk
-|  runtime filters: RF001 <- hd.hd_demo_sk
+|  runtime filters: RF002 <- hd.hd_demo_sk
 |
 |--09:EXCHANGE [BROADCAST]
 |  |
@@ -4447,7 +4447,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss.ss_sold_time_sk = td.t_time_sk
-|  runtime filters: RF002 <- td.t_time_sk
+|  runtime filters: RF004 <- td.t_time_sk
 |
 |--08:EXCHANGE [BROADCAST]
 |  |
@@ -4457,7 +4457,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales ss]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss.ss_store_sk, RF001 -> ss.ss_hdemo_sk, RF002 -> ss.ss_sold_time_sk
+   runtime filters: RF000 -> ss.ss_store_sk, RF002 -> ss.ss_hdemo_sk, RF004 -> ss.ss_sold_time_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -4485,7 +4485,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss.ss_hdemo_sk = hd.hd_demo_sk
-|  runtime filters: RF001 <- hd.hd_demo_sk
+|  runtime filters: RF002 <- hd.hd_demo_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -4499,7 +4499,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ss.ss_sold_time_sk = td.t_time_sk
-|  runtime filters: RF002 <- td.t_time_sk
+|  runtime filters: RF004 <- td.t_time_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -4513,7 +4513,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales ss]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss.ss_store_sk, RF001 -> ss.ss_hdemo_sk, RF002 -> ss.ss_sold_time_sk
+   runtime filters: RF000 -> ss.ss_store_sk, RF002 -> ss.ss_hdemo_sk, RF004 -> ss.ss_sold_time_sk
 ====
 # TPCDS-Q98
 select
@@ -4573,7 +4573,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: i_item_sk = ss_item_sk
-|  runtime filters: RF001 <- ss_item_sk
+|  runtime filters: RF002 <- ss_item_sk
 |
 |--00:SCAN HDFS [tpcds.store_sales]
 |     partitions=31/1824 files=31 size=3.43MB
@@ -4582,7 +4582,7 @@ PLAN-ROOT SINK
 01:SCAN HDFS [tpcds.item]
    partitions=1/1 files=1 size=4.82MB
    predicates: i_category IN ('Jewelry', 'Sports', 'Books')
-   runtime filters: RF001 -> i_item_sk
+   runtime filters: RF002 -> i_item_sk
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -4622,7 +4622,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF001 <- i_item_sk
+|  runtime filters: RF002 <- i_item_sk
 |
 |--10:EXCHANGE [HASH(i_item_sk)]
 |  |
@@ -4634,7 +4634,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=31/1824 files=31 size=3.43MB
-   runtime filters: RF000 -> ss_sold_date_sk, RF001 -> ss_item_sk
+   runtime filters: RF000 -> ss_sold_date_sk, RF002 -> ss_item_sk
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -4678,7 +4678,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: ss_item_sk = i_item_sk
-|  runtime filters: RF001 <- i_item_sk
+|  runtime filters: RF002 <- i_item_sk
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -4694,7 +4694,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=31/1824 files=31 size=3.43MB
-   runtime filters: RF000 -> ss_sold_date_sk, RF001 -> ss_item_sk
+   runtime filters: RF000 -> ss_sold_date_sk, RF002 -> ss_item_sk
 ====
 # TPCD-Q6
 select * from (
@@ -4747,7 +4747,7 @@ PLAN-ROOT SINK
 |
 13:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: d.d_month_seq = (d_month_seq)
-|  runtime filters: RF001 <- (d_month_seq)
+|  runtime filters: RF002 <- (d_month_seq)
 |
 |--06:AGGREGATE [FINALIZE]
 |  |  group by: (d_month_seq)
@@ -4759,7 +4759,7 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN]
 |  hash predicates: s.ss_item_sk = i.i_item_sk
-|  runtime filters: RF002 <- i.i_item_sk
+|  runtime filters: RF004 <- i.i_item_sk
 |
 |--04:SCAN HDFS [tpcds.item i]
 |     partitions=1/1 files=1 size=4.82MB
@@ -4767,30 +4767,30 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [INNER JOIN]
 |  hash predicates: s.ss_sold_date_sk = d.d_date_sk
-|  runtime filters: RF003 <- d.d_date_sk
+|  runtime filters: RF006 <- d.d_date_sk
 

<TRUNCATED>
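
A note on the renumbering that runs through all of the plans above: each old
filter id RFnnn maps to RF(2*nnn), so RF001 becomes RF002 and RF005 becomes
RF010, while the producer (<-) and consumer (->) pairings are unchanged. One
consistent reading is that the planner now reserves two consecutive ids per
join build so that a min-max filter can sit beside each bloom filter; the
sketch below shows such an allocator. This is an assumption inferred from the
pattern, not something stated in this diff.

  // Hypothetical id allocator (assumption): hand out even ids for bloom
  // filters and keep id + 1 in reserve for a paired min-max filter, which
  // yields the RF000, RF002, RF004, ... sequence seen in the plans above.
  class RuntimeFilterIdAllocator {
   public:
    int NextBloomFilterId() {
      int id = next_id_;
      next_id_ += 2;  // id + 1 stays free for the paired min-max filter
      return id;
    }

   private:
    int next_id_ = 0;
  };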


[08/16] incubator-impala git commit: IMPALA-4835 (prep only): create io subfolder and namespace

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/tmp-file-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index f1e243c..24217de 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -26,7 +26,8 @@
 #include <gutil/strings/join.h>
 #include <gutil/strings/substitute.h>
 
-#include "runtime/disk-io-mgr-reader-context.h"
+#include "runtime/io/disk-io-mgr.h"
+#include "runtime/io/request-context.h"
 #include "runtime/runtime-state.h"
 #include "runtime/tmp-file-mgr-internal.h"
 #include "util/bit-util.h"
@@ -52,6 +53,7 @@ using boost::algorithm::token_compress_on;
 using boost::filesystem::absolute;
 using boost::filesystem::path;
 using boost::uuids::random_generator;
+using namespace impala::io;
 using namespace strings;
 
 namespace impala {
@@ -358,7 +360,7 @@ Status TmpFileMgr::FileGroup::Write(
 
   unique_ptr<WriteHandle> tmp_handle(new WriteHandle(encryption_timer_, cb));
   WriteHandle* tmp_handle_ptr = tmp_handle.get(); // Pass ptr by value into lambda.
-  DiskIoMgr::WriteRange::WriteDoneCallback callback = [this, tmp_handle_ptr](
+  WriteRange::WriteDoneCallback callback = [this, tmp_handle_ptr](
       const Status& write_status) { WriteComplete(tmp_handle_ptr, write_status); };
   RETURN_IF_ERROR(
       tmp_handle->Write(io_mgr_, io_ctx_.get(), tmp_file, file_offset, buffer, callback));
@@ -387,11 +389,11 @@ Status TmpFileMgr::FileGroup::ReadAsync(WriteHandle* handle, MemRange buffer) {
   DCHECK(handle->write_range_ != nullptr);
   // Don't grab handle->write_state_lock_, it is safe to touch all of handle's state
   // since the write is not in flight.
-  handle->read_range_ = scan_range_pool_.Add(new DiskIoMgr::ScanRange);
+  handle->read_range_ = scan_range_pool_.Add(new ScanRange);
   handle->read_range_->Reset(nullptr, handle->write_range_->file(),
       handle->write_range_->len(), handle->write_range_->offset(),
       handle->write_range_->disk_id(), false,
-      DiskIoMgr::BufferOpts::ReadInto(buffer.data(), buffer.len()));
+      BufferOpts::ReadInto(buffer.data(), buffer.len()));
   read_counter_->Add(1);
   bytes_read_counter_->Add(buffer.len());
   RETURN_IF_ERROR(io_mgr_->AddScanRange(io_ctx_.get(), handle->read_range_, true));
@@ -403,7 +405,7 @@ Status TmpFileMgr::FileGroup::WaitForAsyncRead(WriteHandle* handle, MemRange buf
   // Don't grab handle->write_state_lock_, it is safe to touch all of handle's state
   // since the write is not in flight.
   SCOPED_TIMER(disk_read_timer_);
-  unique_ptr<DiskIoMgr::BufferDescriptor> io_mgr_buffer;
+  unique_ptr<BufferDescriptor> io_mgr_buffer;
   Status status = handle->read_range_->GetNext(&io_mgr_buffer);
   if (!status.ok()) goto exit;
   DCHECK(io_mgr_buffer != NULL);
@@ -525,9 +527,9 @@ string TmpFileMgr::WriteHandle::TmpFilePath() const {
   return file_->path();
 }
 
-Status TmpFileMgr::WriteHandle::Write(DiskIoMgr* io_mgr, DiskIoRequestContext* io_ctx,
+Status TmpFileMgr::WriteHandle::Write(DiskIoMgr* io_mgr, RequestContext* io_ctx,
     File* file, int64_t offset, MemRange buffer,
-    DiskIoMgr::WriteRange::WriteDoneCallback callback) {
+    WriteRange::WriteDoneCallback callback) {
   DCHECK(!write_in_flight_);
 
   if (FLAGS_disk_spill_encryption) RETURN_IF_ERROR(EncryptAndHash(buffer));
@@ -536,7 +538,7 @@ Status TmpFileMgr::WriteHandle::Write(DiskIoMgr* io_mgr, DiskIoRequestContext* i
   // WriteComplete() may be called concurrently with the remainder of this function.
   file_ = file;
   write_range_.reset(
-      new DiskIoMgr::WriteRange(file->path(), offset, file->AssignDiskQueue(), callback));
+      new WriteRange(file->path(), offset, file->AssignDiskQueue(), callback));
   write_range_->SetData(buffer.data(), buffer.len());
   write_in_flight_ = true;
   Status status = io_mgr->AddWriteRange(io_ctx, write_range_.get());
@@ -553,7 +555,7 @@ Status TmpFileMgr::WriteHandle::Write(DiskIoMgr* io_mgr, DiskIoRequestContext* i
 }
 
 Status TmpFileMgr::WriteHandle::RetryWrite(
-    DiskIoMgr* io_mgr, DiskIoRequestContext* io_ctx, File* file, int64_t offset) {
+    DiskIoMgr* io_mgr, RequestContext* io_ctx, File* file, int64_t offset) {
   DCHECK(write_in_flight_);
   file_ = file;
   write_range_->SetRange(file->path(), offset, file->AssignDiskQueue());

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/tmp-file-mgr.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr.h b/be/src/runtime/tmp-file-mgr.h
index f550af2..95072ae 100644
--- a/be/src/runtime/tmp-file-mgr.h
+++ b/be/src/runtime/tmp-file-mgr.h
@@ -28,10 +28,10 @@
 #include "common/object-pool.h"
 #include "common/status.h"
 #include "gen-cpp/Types_types.h" // for TUniqueId
-#include "runtime/disk-io-mgr.h"
-#include "util/mem-range.h"
+#include "runtime/io/request-ranges.h"
 #include "util/collection-metrics.h"
 #include "util/condition-variable.h"
+#include "util/mem-range.h"
 #include "util/openssl-util.h"
 #include "util/runtime-profile.h"
 #include "util/spinlock.h"
@@ -100,7 +100,7 @@ class TmpFileMgr {
     /// space used. 'unique_id' is a unique ID that is used to prefix any scratch file
     /// names. It is an error to create multiple FileGroups with the same 'unique_id'.
     /// 'bytes_limit' is the limit on the total file space to allocate.
-    FileGroup(TmpFileMgr* tmp_file_mgr, DiskIoMgr* io_mgr, RuntimeProfile* profile,
+    FileGroup(TmpFileMgr* tmp_file_mgr, io::DiskIoMgr* io_mgr, RuntimeProfile* profile,
         const TUniqueId& unique_id, int64_t bytes_limit = -1);
 
     ~FileGroup();
@@ -198,10 +198,10 @@ class TmpFileMgr {
     TmpFileMgr* const tmp_file_mgr_;
 
     /// DiskIoMgr used for all I/O to temporary files.
-    DiskIoMgr* const io_mgr_;
+    io::DiskIoMgr* const io_mgr_;
 
     /// I/O context used for all reads and writes. Registered in constructor.
-    std::unique_ptr<DiskIoRequestContext> io_ctx_;
+    std::unique_ptr<io::RequestContext> io_ctx_;
 
     /// Stores scan ranges allocated in Read(). Needed because ScanRange objects may be
     /// touched by DiskIoMgr even after the scan is finished.
@@ -303,14 +303,14 @@ class TmpFileMgr {
     /// Starts a write of 'buffer' to 'offset' of 'file'. 'write_in_flight_' must be false
     /// before calling. After returning, 'write_in_flight_' is true on success or false on
     /// failure and 'is_cancelled_' is set to true on failure.
-    Status Write(DiskIoMgr* io_mgr, DiskIoRequestContext* io_ctx, File* file,
+    Status Write(io::DiskIoMgr* io_mgr, io::RequestContext* io_ctx, File* file,
         int64_t offset, MemRange buffer,
-        DiskIoMgr::WriteRange::WriteDoneCallback callback) WARN_UNUSED_RESULT;
+        io::WriteRange::WriteDoneCallback callback) WARN_UNUSED_RESULT;
 
     /// Retry the write after the initial write failed with an error, instead writing to
     /// 'offset' of 'file'. 'write_in_flight_' must be true before calling.
     /// After returning, 'write_in_flight_' is true on success or false on failure.
-    Status RetryWrite(DiskIoMgr* io_mgr, DiskIoRequestContext* io_ctx, File* file,
+    Status RetryWrite(io::DiskIoMgr* io_mgr, io::RequestContext* io_ctx, File* file,
         int64_t offset) WARN_UNUSED_RESULT;
 
     /// Called when the write has completed successfully or not. Sets 'write_in_flight_'
@@ -340,7 +340,7 @@ class TmpFileMgr {
     RuntimeProfile::Counter* encryption_timer_;
 
     /// The DiskIoMgr write range for this write.
-    boost::scoped_ptr<DiskIoMgr::WriteRange> write_range_;
+    boost::scoped_ptr<io::WriteRange> write_range_;
 
     /// The temporary file being written to.
     File* file_;
@@ -355,7 +355,7 @@ class TmpFileMgr {
 
     /// The scan range for the read that is currently in flight. NULL when no read is in
     /// flight.
-    DiskIoMgr::ScanRange* read_range_;
+    io::ScanRange* read_range_;
 
     /// Protects all fields below while 'write_in_flight_' is true. At other times, it is
     /// invalid to call WriteRange/FileGroup methods concurrently from multiple threads,


[10/16] incubator-impala git commit: IMPALA-4835 (prep only): create io subfolder and namespace

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr.cc b/be/src/runtime/io/disk-io-mgr.cc
new file mode 100644
index 0000000..f9aed92
--- /dev/null
+++ b/be/src/runtime/io/disk-io-mgr.cc
@@ -0,0 +1,1191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "common/global-flags.h"
+#include "runtime/io/disk-io-mgr.h"
+#include "runtime/io/disk-io-mgr-internal.h"
+#include "runtime/io/handle-cache.inline.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include "gutil/strings/substitute.h"
+#include "util/bit-util.h"
+#include "util/hdfs-util.h"
+#include "util/time.h"
+
+DECLARE_bool(disable_mem_pools);
+#ifndef NDEBUG
+DECLARE_int32(stress_scratch_write_delay_ms);
+#endif
+
+#include "common/names.h"
+
+using namespace impala;
+using namespace impala::io;
+using namespace strings;
+
+// Control the number of disks on the machine.  If 0, this comes from the system
+// settings.
+DEFINE_int32(num_disks, 0, "Number of disks on data node.");
+// Default IoMgr configs:
+// The maximum number of the threads per disk is also the max queue depth per disk.
+DEFINE_int32(num_threads_per_disk, 0, "Number of I/O threads per disk");
+
+// Rotational disks should have 1 thread per disk to minimize seeks.  Non-rotational
+// disks don't have this penalty and benefit from multiple concurrent IO requests.
+static const int THREADS_PER_ROTATIONAL_DISK = 1;
+static const int THREADS_PER_SOLID_STATE_DISK = 8;
+
+// The maximum number of threads per rotational disk is also the max queue depth per
+// rotational disk.
+static const string num_io_threads_per_rotational_disk_help_msg = Substitute("Number of "
+    "I/O threads per rotational disk. Has priority over num_threads_per_disk. If neither"
+    " is set, defaults to $0 thread(s) per rotational disk", THREADS_PER_ROTATIONAL_DISK);
+DEFINE_int32(num_io_threads_per_rotational_disk, 0,
+    num_io_threads_per_rotational_disk_help_msg.c_str());
+// The maximum number of threads per solid state disk is also the max queue depth per
+// solid state disk.
+static const string num_io_threads_per_solid_state_disk_help_msg = Substitute("Number of"
+    " I/O threads per solid state disk. Has priority over num_threads_per_disk. If "
+    "neither is set, defaults to $0 thread(s) per solid state disk",
+    THREADS_PER_SOLID_STATE_DISK);
+DEFINE_int32(num_io_threads_per_solid_state_disk, 0,
+    num_io_threads_per_solid_state_disk_help_msg.c_str());
+// The maximum number of remote HDFS I/O threads.  HDFS accesses that are expected to be
+// remote are placed on a separate remote disk queue.  This is the queue depth for that
+// queue.  If 0, then the remote queue is not used and instead ranges are round-robined
+// across the local disk queues.
+DEFINE_int32(num_remote_hdfs_io_threads, 8, "Number of remote HDFS I/O threads");
+// The maximum number of S3 I/O threads. The default value of 16 was chosen empirically
+// to maximize S3 throughput. Maximum throughput is achieved with multiple connections
+// open to S3 and the use of multiple CPU cores, since S3 reads are relatively
+// compute-expensive (SSL and JNI buffer overheads).
+DEFINE_int32(num_s3_io_threads, 16, "Number of S3 I/O threads");
+// The maximum number of ADLS I/O threads. This is a good default for clusters that may
+// vary widely in size, because ADLS enforces an undocumented per-cluster concurrency
+// limit that ranges between roughly 500 and 700. For smaller clusters (~10 nodes),
+// 64 threads would be closer to ideal.
+DEFINE_int32(num_adls_io_threads, 16, "Number of ADLS I/O threads");
+
+DECLARE_int64(min_buffer_size);
+
+// With 1024B through 8MB buffers, this is up to ~2GB of buffers.
+DEFINE_int32(max_free_io_buffers, 128,
+    "For each io buffer size, the maximum number of buffers the IoMgr will hold onto");
+
+// The number of cached file handles defines how much memory can be used per backend for
+// caching frequently used file handles. Measurements indicate that a single file handle
+// uses about 6kB of memory. 20k file handles will thus reserve ~120MB of memory.
+// The actual amount of memory that is associated with a file handle can be larger
+// or smaller, depending on the replication factor for this file or the path name.
+DEFINE_uint64(max_cached_file_handles, 20000, "Maximum number of HDFS file handles "
+    "that will be cached. Disabled if set to 0.");
+
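As a quick check on the sizing estimate in the comment above (a
back-of-the-envelope sketch, using its ~6kB-per-handle figure):

  // ~6kB per cached handle times the 20k-handle default is ~117MiB,
  // i.e. the "~120MB" quoted above.
  constexpr int64_t kBytesPerHandle = 6 * 1024;
  constexpr int64_t kMaxHandles = 20000;  // FLAGS_max_cached_file_handles default
  constexpr int64_t kCacheFootprint = kBytesPerHandle * kMaxHandles;  // 122,880,000 bytes
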
+// The unused file handle timeout specifies how long a file handle will remain in the
+// cache if it is not being used. Aging out unused handles ensures that the cache is not
+// wasting memory on handles that aren't useful. This allows users to specify a larger
+// cache size, as the system will only use the memory on useful file handles.
+// Additionally, cached file handles keep an open file descriptor for local files.
+// If a file is deleted through HDFS, this open file descriptor can keep the disk space
+// from being freed. Once the metadata reflects that a file has been deleted, the file handle
+// will no longer be used by future queries. Aging out this file handle allows the
+// disk space to be freed in an appropriate period of time.
+DEFINE_uint64(unused_file_handle_timeout_sec, 21600, "Maximum time, in seconds, that an "
+    "unused HDFS file handle will remain in the file handle cache. Disabled if set "
+    "to 0.");
+
+// The IoMgr is able to run with a wide range of memory usage. If a query has memory
+// remaining less than this value, the IoMgr will stop all buffering regardless of the
+// current queue size.
+static const int LOW_MEMORY = 64 * 1024 * 1024;
+
+const int DiskIoMgr::SCAN_RANGE_READY_BUFFER_LIMIT;
+
+AtomicInt32 DiskIoMgr::next_disk_id_;
+
+namespace detail {
+// Indicates if file handle caching should be used
+static inline bool is_file_handle_caching_enabled() {
+  return FLAGS_max_cached_file_handles > 0;
+}
+}
+
+string DiskIoMgr::DebugString() {
+  stringstream ss;
+  ss << "Disks: " << endl;
+  for (int i = 0; i < disk_queues_.size(); ++i) {
+    unique_lock<mutex> lock(disk_queues_[i]->lock);
+    ss << "  " << (void*) disk_queues_[i] << ":" ;
+    if (!disk_queues_[i]->request_contexts.empty()) {
+      ss << " Readers: ";
+      for (RequestContext* req_context: disk_queues_[i]->request_contexts) {
+        ss << (void*)req_context;
+      }
+    }
+    ss << endl;
+  }
+  return ss.str();
+}
+
+BufferDescriptor::BufferDescriptor(DiskIoMgr* io_mgr,
+    RequestContext* reader, ScanRange* scan_range, uint8_t* buffer,
+    int64_t buffer_len, MemTracker* mem_tracker)
+  : io_mgr_(io_mgr),
+    reader_(reader),
+    mem_tracker_(mem_tracker),
+    scan_range_(scan_range),
+    buffer_(buffer),
+    buffer_len_(buffer_len) {
+  DCHECK(io_mgr != nullptr);
+  DCHECK(scan_range != nullptr);
+  DCHECK(buffer != nullptr);
+  DCHECK_GE(buffer_len, 0);
+  DCHECK_NE(scan_range->external_buffer_tag_ == ScanRange::ExternalBufferTag::NO_BUFFER,
+      mem_tracker == nullptr);
+}
+
+void BufferDescriptor::TransferOwnership(MemTracker* dst) {
+  DCHECK(dst != nullptr);
+  DCHECK(!is_client_buffer());
+  // Memory of cached buffers is not tracked against a tracker.
+  if (is_cached()) return;
+  DCHECK(mem_tracker_ != nullptr);
+  dst->Consume(buffer_len_);
+  mem_tracker_->Release(buffer_len_);
+  mem_tracker_ = dst;
+}
+
+WriteRange::WriteRange(
+    const string& file, int64_t file_offset, int disk_id, WriteDoneCallback callback)
+  : RequestRange(RequestType::WRITE), callback_(callback) {
+  SetRange(file, file_offset, disk_id);
+}
+
+void WriteRange::SetRange(
+    const std::string& file, int64_t file_offset, int disk_id) {
+  file_ = file;
+  offset_ = file_offset;
+  disk_id_ = disk_id;
+}
+
+void WriteRange::SetData(const uint8_t* buffer, int64_t len) {
+  data_ = buffer;
+  len_ = len;
+}
+
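For context, the intended call sequence for WriteRange matches its use in
tmp-file-mgr.cc earlier in this series; a minimal sketch, where the path,
offset, disk id, 'buffer'/'len' and the callback body are placeholders and
error handling is elided:

  // Sketch: build a write range, attach caller-owned data, queue it.
  WriteRange::WriteDoneCallback callback =
      [](const Status& write_status) { /* react to write completion */ };
  WriteRange write_range("/tmp/scratch-file", /*file_offset=*/0, /*disk_id=*/0, callback);
  write_range.SetData(buffer, len);  // must stay valid until the callback fires
  RETURN_IF_ERROR(io_mgr->AddWriteRange(io_ctx, &write_range));
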
+static void CheckSseSupport() {
+  if (!CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
+    LOG(WARNING) << "This machine does not support sse4_2.  The default IO system "
+                    "configurations are suboptimal for this hardware.  Consider "
+                    "increasing the number of threads per disk by restarting impalad "
+                    "using the --num_threads_per_disk flag with a higher value";
+  }
+}
+
+// Utility function to select the first flag that is set (has a positive value), in precedence order.
+static inline int GetFirstPositiveVal(const int first_val, const int second_val,
+    const int default_val) {
+  return first_val > 0 ? first_val : (second_val > 0 ? second_val : default_val);
+}
+
+DiskIoMgr::DiskIoMgr() :
+    num_io_threads_per_rotational_disk_(GetFirstPositiveVal(
+        FLAGS_num_io_threads_per_rotational_disk, FLAGS_num_threads_per_disk,
+        THREADS_PER_ROTATIONAL_DISK)),
+    num_io_threads_per_solid_state_disk_(GetFirstPositiveVal(
+        FLAGS_num_io_threads_per_solid_state_disk, FLAGS_num_threads_per_disk,
+        THREADS_PER_SOLID_STATE_DISK)),
+    max_buffer_size_(FLAGS_read_size),
+    min_buffer_size_(FLAGS_min_buffer_size),
+    shut_down_(false),
+    total_bytes_read_counter_(TUnit::BYTES),
+    read_timer_(TUnit::TIME_NS),
+    file_handle_cache_(min(FLAGS_max_cached_file_handles,
+        FileSystemUtil::MaxNumFileHandles()),
+        FLAGS_unused_file_handle_timeout_sec) {
+  DCHECK_LE(READ_SIZE_MIN_VALUE, FLAGS_read_size);
+  int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
+  free_buffers_.resize(BitUtil::Log2Ceiling64(max_buffer_size_scaled) + 1);
+  int num_local_disks = DiskInfo::num_disks();
+  if (FLAGS_num_disks < 0 || FLAGS_num_disks > DiskInfo::num_disks()) {
+    LOG(WARNING) << "Number of disks specified should be between 0 and the number of "
+        "logical disks on the system. Defaulting to system setting of " <<
+        DiskInfo::num_disks() << " disks";
+  } else if (FLAGS_num_disks > 0) {
+    num_local_disks = FLAGS_num_disks;
+  }
+  disk_queues_.resize(num_local_disks + REMOTE_NUM_DISKS);
+  CheckSseSupport();
+}
+
+DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_rotational_disk,
+    int threads_per_solid_state_disk, int min_buffer_size, int max_buffer_size) :
+    num_io_threads_per_rotational_disk_(threads_per_rotational_disk),
+    num_io_threads_per_solid_state_disk_(threads_per_solid_state_disk),
+    max_buffer_size_(max_buffer_size),
+    min_buffer_size_(min_buffer_size),
+    shut_down_(false),
+    total_bytes_read_counter_(TUnit::BYTES),
+    read_timer_(TUnit::TIME_NS),
+    file_handle_cache_(min(FLAGS_max_cached_file_handles,
+        FileSystemUtil::MaxNumFileHandles()),
+        FLAGS_unused_file_handle_timeout_sec) {
+  int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
+  free_buffers_.resize(BitUtil::Log2Ceiling64(max_buffer_size_scaled) + 1);
+  if (num_local_disks == 0) num_local_disks = DiskInfo::num_disks();
+  disk_queues_.resize(num_local_disks + REMOTE_NUM_DISKS);
+  CheckSseSupport();
+}
+
+DiskIoMgr::~DiskIoMgr() {
+  shut_down_ = true;
+  // Notify all worker threads and shut them down.
+  for (int i = 0; i < disk_queues_.size(); ++i) {
+    if (disk_queues_[i] == nullptr) continue;
+    {
+      // This lock is necessary to properly use the condition var to notify
+      // the disk worker threads.  The readers also grab this lock so updates
+      // to shut_down_ are protected.
+      unique_lock<mutex> disk_lock(disk_queues_[i]->lock);
+    }
+    disk_queues_[i]->work_available.NotifyAll();
+  }
+  disk_thread_group_.JoinAll();
+
+  for (int i = 0; i < disk_queues_.size(); ++i) {
+    if (disk_queues_[i] == nullptr) continue;
+    int disk_id = disk_queues_[i]->disk_id;
+    for (list<RequestContext*>::iterator it = disk_queues_[i]->request_contexts.begin();
+        it != disk_queues_[i]->request_contexts.end(); ++it) {
+      DCHECK_EQ((*it)->disk_states_[disk_id].num_threads_in_op(), 0);
+      DCHECK((*it)->disk_states_[disk_id].done());
+      (*it)->DecrementDiskRefCount();
+    }
+  }
+
+  DCHECK_EQ(num_buffers_in_readers_.Load(), 0);
+
+  // Delete all allocated buffers
+  int num_free_buffers = 0;
+  for (int idx = 0; idx < free_buffers_.size(); ++idx) {
+    num_free_buffers += free_buffers_[idx].size();
+  }
+  DCHECK_EQ(num_allocated_buffers_.Load(), num_free_buffers);
+  GcIoBuffers();
+
+  for (int i = 0; i < disk_queues_.size(); ++i) {
+    delete disk_queues_[i];
+  }
+
+  if (free_buffer_mem_tracker_ != nullptr) free_buffer_mem_tracker_->Close();
+  if (cached_read_options_ != nullptr) hadoopRzOptionsFree(cached_read_options_);
+}
+
+Status DiskIoMgr::Init(MemTracker* process_mem_tracker) {
+  DCHECK(process_mem_tracker != nullptr);
+  free_buffer_mem_tracker_.reset(
+      new MemTracker(-1, "Free Disk IO Buffers", process_mem_tracker, false));
+
+  for (int i = 0; i < disk_queues_.size(); ++i) {
+    disk_queues_[i] = new DiskQueue(i);
+    int num_threads_per_disk;
+    if (i == RemoteDfsDiskId()) {
+      num_threads_per_disk = FLAGS_num_remote_hdfs_io_threads;
+    } else if (i == RemoteS3DiskId()) {
+      num_threads_per_disk = FLAGS_num_s3_io_threads;
+    } else if (i == RemoteAdlsDiskId()) {
+      num_threads_per_disk = FLAGS_num_adls_io_threads;
+    } else if (DiskInfo::is_rotational(i)) {
+      num_threads_per_disk = num_io_threads_per_rotational_disk_;
+    } else {
+      num_threads_per_disk = num_io_threads_per_solid_state_disk_;
+    }
+    for (int j = 0; j < num_threads_per_disk; ++j) {
+      stringstream ss;
+      ss << "work-loop(Disk: " << i << ", Thread: " << j << ")";
+      std::unique_ptr<Thread> t;
+      RETURN_IF_ERROR(Thread::Create("disk-io-mgr", ss.str(), &DiskIoMgr::WorkLoop,
+          this, disk_queues_[i], &t));
+      disk_thread_group_.AddThread(move(t));
+    }
+  }
+  RETURN_IF_ERROR(file_handle_cache_.Init());
+
+  cached_read_options_ = hadoopRzOptionsAlloc();
+  DCHECK(cached_read_options_ != nullptr);
+  // Disable checksumming for cached reads.
+  int ret = hadoopRzOptionsSetSkipChecksum(cached_read_options_, true);
+  DCHECK_EQ(ret, 0);
+  // Disable automatic fallback for cached reads.
+  ret = hadoopRzOptionsSetByteBufferPool(cached_read_options_, nullptr);
+  DCHECK_EQ(ret, 0);
+
+  return Status::OK();
+}
+
+unique_ptr<RequestContext> DiskIoMgr::RegisterContext(MemTracker* mem_tracker) {
+  return unique_ptr<RequestContext>(
+      new RequestContext(this, num_total_disks(), mem_tracker));
+}
+
+void DiskIoMgr::UnregisterContext(RequestContext* reader) {
+  reader->CancelAndMarkInactive();
+}
+
+// Cancellation requires coordination from multiple threads.  Each thread that currently
+// has a reference to the request context must notice the cancel and remove it from its
+// tracking structures.  The last thread to touch the context should deallocate (aka
+// recycle) the request context object.  Potential threads are:
+//  1. Disk threads that are currently reading for this reader.
+//  2. Caller threads that are waiting in GetNext.
+//
+// The steps are:
+// 1. Cancel will immediately set the context in the Cancelled state.  This prevents any
+// other thread from adding more ready buffers to the context (they all take a lock and
+// check the state before doing so), or any write ranges to the context.
+// 2. Cancel will call cancel on each ScanRange that is not yet complete, unblocking
+// any threads in GetNext(). The reader will see the cancelled Status returned. Cancel
+// also invokes the callback for the WriteRanges with the cancelled state.
+// 3. Disk threads notice the context is cancelled either when picking the next context
+// to process or when they try to enqueue a ready buffer. Upon noticing the cancelled
+// state, a disk thread removes the context from its disk queue. The last thread per
+// disk with an outstanding reference to the context decrements the number of disk
+// queues the context is on.
+void DiskIoMgr::CancelContext(RequestContext* context) {
+  context->Cancel(Status::CANCELLED);
+}
+
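Expressed in terms of the public API defined in this file, the lifecycle the
comment above describes might be driven as follows (a sketch only; scan-range
setup and the consumer threads are elided):

  unique_ptr<RequestContext> ctx = io_mgr->RegisterContext(mem_tracker);
  RETURN_IF_ERROR(io_mgr->AddScanRanges(ctx.get(), ranges, false));
  // ... consumers drive GetNextRange()/GetNext() until done or an error ...
  io_mgr->CancelContext(ctx.get());      // steps 1-2: set Cancelled, unblock waiters
  io_mgr->UnregisterContext(ctx.get());  // step 3 finishes before the context is recycled
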
+void DiskIoMgr::set_read_timer(RequestContext* r, RuntimeProfile::Counter* c) {
+  r->read_timer_ = c;
+}
+
+void DiskIoMgr::set_bytes_read_counter(RequestContext* r, RuntimeProfile::Counter* c) {
+  r->bytes_read_counter_ = c;
+}
+
+void DiskIoMgr::set_active_read_thread_counter(RequestContext* r,
+    RuntimeProfile::Counter* c) {
+  r->active_read_thread_counter_ = c;
+}
+
+void DiskIoMgr::set_disks_access_bitmap(RequestContext* r,
+    RuntimeProfile::Counter* c) {
+  r->disks_accessed_bitmap_ = c;
+}
+
+int64_t DiskIoMgr::queue_size(RequestContext* reader) const {
+  return reader->num_ready_buffers_.Load();
+}
+
+Status DiskIoMgr::context_status(RequestContext* context) const {
+  unique_lock<mutex> lock(context->lock_);
+  return context->status_;
+}
+
+int64_t DiskIoMgr::bytes_read_local(RequestContext* reader) const {
+  return reader->bytes_read_local_.Load();
+}
+
+int64_t DiskIoMgr::bytes_read_short_circuit(RequestContext* reader) const {
+  return reader->bytes_read_short_circuit_.Load();
+}
+
+int64_t DiskIoMgr::bytes_read_dn_cache(RequestContext* reader) const {
+  return reader->bytes_read_dn_cache_.Load();
+}
+
+int DiskIoMgr::num_remote_ranges(RequestContext* reader) const {
+  return reader->num_remote_ranges_.Load();
+}
+
+int64_t DiskIoMgr::unexpected_remote_bytes(RequestContext* reader) const {
+  return reader->unexpected_remote_bytes_.Load();
+}
+
+int DiskIoMgr::cached_file_handles_hit_count(RequestContext* reader) const {
+  return reader->cached_file_handles_hit_count_.Load();
+}
+
+int DiskIoMgr::cached_file_handles_miss_count(RequestContext* reader) const {
+  return reader->cached_file_handles_miss_count_.Load();
+}
+
+int64_t DiskIoMgr::GetReadThroughput() {
+  return RuntimeProfile::UnitsPerSecond(&total_bytes_read_counter_, &read_timer_);
+}
+
+Status DiskIoMgr::ValidateScanRange(ScanRange* range) {
+  int disk_id = range->disk_id_;
+  if (disk_id < 0 || disk_id >= disk_queues_.size()) {
+    return Status(TErrorCode::DISK_IO_ERROR,
+        Substitute("Invalid scan range.  Bad disk id: $0", disk_id));
+  }
+  if (range->offset_ < 0) {
+    return Status(TErrorCode::DISK_IO_ERROR,
+        Substitute("Invalid scan range. Negative offset $0", range->offset_));
+  }
+  if (range->len_ < 0) {
+    return Status(TErrorCode::DISK_IO_ERROR,
+        Substitute("Invalid scan range. Negative length $0", range->len_));
+  }
+  return Status::OK();
+}
+
+Status DiskIoMgr::AddScanRanges(RequestContext* reader,
+    const vector<ScanRange*>& ranges, bool schedule_immediately) {
+  if (ranges.empty()) return Status::OK();
+
+  // Validate and initialize all ranges
+  for (int i = 0; i < ranges.size(); ++i) {
+    RETURN_IF_ERROR(ValidateScanRange(ranges[i]));
+    ranges[i]->InitInternal(this, reader);
+  }
+
+  // Determine the disks that this reader needs to be scheduled on.
+  unique_lock<mutex> reader_lock(reader->lock_);
+  DCHECK(reader->Validate()) << endl << reader->DebugString();
+
+  if (reader->state_ == RequestContext::Cancelled) {
+    DCHECK(!reader->status_.ok());
+    return reader->status_;
+  }
+
+  // Add each range to the queue of the disk the range is on
+  for (int i = 0; i < ranges.size(); ++i) {
+    // Don't add empty ranges.
+    DCHECK_NE(ranges[i]->len(), 0);
+    ScanRange* range = ranges[i];
+
+    if (range->try_cache_) {
+      if (schedule_immediately) {
+        bool cached_read_succeeded;
+        RETURN_IF_ERROR(range->ReadFromCache(reader_lock, &cached_read_succeeded));
+        if (cached_read_succeeded) continue;
+        // Cached read failed, fall back to AddRequestRange() below.
+      } else {
+        reader->cached_ranges_.Enqueue(range);
+        continue;
+      }
+    }
+    reader->AddRequestRange(range, schedule_immediately);
+  }
+  DCHECK(reader->Validate()) << endl << reader->DebugString();
+
+  return Status::OK();
+}
+
+Status DiskIoMgr::AddScanRange(
+    RequestContext* reader, ScanRange* range, bool schedule_immediately) {
+  return AddScanRanges(reader, vector<ScanRange*>({range}), schedule_immediately);
+}
+
+// This function returns the next scan range the reader should work on, checking
+// for eos and error cases. If there isn't already a cached scan range or a scan
+// range prepared by the disk threads, the caller waits on the disk threads.
+Status DiskIoMgr::GetNextRange(RequestContext* reader, ScanRange** range) {
+  DCHECK(reader != nullptr);
+  DCHECK(range != nullptr);
+  *range = nullptr;
+  Status status = Status::OK();
+
+  unique_lock<mutex> reader_lock(reader->lock_);
+  DCHECK(reader->Validate()) << endl << reader->DebugString();
+
+  while (true) {
+    if (reader->state_ == RequestContext::Cancelled) {
+      DCHECK(!reader->status_.ok());
+      status = reader->status_;
+      break;
+    }
+
+    if (reader->num_unstarted_scan_ranges_.Load() == 0 &&
+        reader->ready_to_start_ranges_.empty() && reader->cached_ranges_.empty()) {
+      // All ranges are done, just return.
+      break;
+    }
+
+    if (!reader->cached_ranges_.empty()) {
+      // We have a cached range.
+      *range = reader->cached_ranges_.Dequeue();
+      DCHECK((*range)->try_cache_);
+      bool cached_read_succeeded;
+      RETURN_IF_ERROR((*range)->ReadFromCache(reader_lock, &cached_read_succeeded));
+      if (cached_read_succeeded) return Status::OK();
+
+      // This range ended up not being cached. Loop again and pick up a new range.
+      reader->AddRequestRange(*range, false);
+      DCHECK(reader->Validate()) << endl << reader->DebugString();
+      *range = nullptr;
+      continue;
+    }
+
+    if (reader->ready_to_start_ranges_.empty()) {
+      reader->ready_to_start_ranges_cv_.Wait(reader_lock);
+    } else {
+      *range = reader->ready_to_start_ranges_.Dequeue();
+      DCHECK(*range != nullptr);
+      int disk_id = (*range)->disk_id();
+      DCHECK_EQ(*range, reader->disk_states_[disk_id].next_scan_range_to_start());
+      // Set this to nullptr, the next time this disk runs for this reader, it will
+      // get another range ready.
+      reader->disk_states_[disk_id].set_next_scan_range_to_start(nullptr);
+      reader->ScheduleScanRange(*range);
+      break;
+    }
+  }
+  return status;
+}
+
+Status DiskIoMgr::Read(RequestContext* reader,
+    ScanRange* range, std::unique_ptr<BufferDescriptor>* buffer) {
+  DCHECK(range != nullptr);
+  DCHECK(buffer != nullptr);
+  *buffer = nullptr;
+
+  if (range->len() > max_buffer_size_
+      && range->external_buffer_tag_ != ScanRange::ExternalBufferTag::CLIENT_BUFFER) {
+    return Status(TErrorCode::DISK_IO_ERROR, Substitute("Internal error: cannot "
+        "perform sync read of '$0' bytes that is larger than the max read buffer size "
+        "'$1'.", range->len(), max_buffer_size_));
+  }
+
+  vector<ScanRange*> ranges;
+  ranges.push_back(range);
+  RETURN_IF_ERROR(AddScanRanges(reader, ranges, true));
+  RETURN_IF_ERROR(range->GetNext(buffer));
+  DCHECK((*buffer) != nullptr);
+  DCHECK((*buffer)->eosr());
+  return Status::OK();
+}
+
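Read() above is the single-range convenience path; for multiple ranges the
same pieces compose into a pull loop. A sketch using only calls defined in
this file, with buffer consumption elided:

  ScanRange* range;
  while (true) {
    RETURN_IF_ERROR(io_mgr->GetNextRange(reader, &range));
    if (range == nullptr) break;  // all ranges finished
    bool eosr = false;
    while (!eosr) {
      unique_ptr<BufferDescriptor> buffer;
      RETURN_IF_ERROR(range->GetNext(&buffer));
      eosr = buffer->eosr();
      // ... consume the buffer's bytes here ...
      io_mgr->ReturnBuffer(move(buffer));
    }
  }
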
+void DiskIoMgr::ReturnBuffer(unique_ptr<BufferDescriptor> buffer_desc) {
+  DCHECK(buffer_desc != nullptr);
+  if (!buffer_desc->status_.ok()) DCHECK(buffer_desc->buffer_ == nullptr);
+
+  RequestContext* reader = buffer_desc->reader_;
+  if (buffer_desc->buffer_ != nullptr) {
+    if (!buffer_desc->is_cached() && !buffer_desc->is_client_buffer()) {
+      // Buffers that were not allocated by DiskIoMgr don't need to be freed.
+      FreeBufferMemory(buffer_desc.get());
+    }
+    buffer_desc->buffer_ = nullptr;
+    num_buffers_in_readers_.Add(-1);
+    reader->num_buffers_in_reader_.Add(-1);
+  } else {
+    // A nullptr buffer means there was an error, in which case there is no buffer
+    // to return.
+  }
+
+  if (buffer_desc->eosr_ || buffer_desc->scan_range_->is_cancelled_) {
+    // Need to close the scan range if returning the last buffer or the scan range
+    // has been cancelled (and the caller might never get the last buffer).
+    // Close() is idempotent, so returning multiple cancelled buffers is okay.
+    buffer_desc->scan_range_->Close();
+  }
+}
+
+unique_ptr<BufferDescriptor> DiskIoMgr::GetFreeBuffer(
+    RequestContext* reader, ScanRange* range, int64_t buffer_size) {
+  DCHECK_LE(buffer_size, max_buffer_size_);
+  DCHECK_GT(buffer_size, 0);
+  buffer_size = min(static_cast<int64_t>(max_buffer_size_), buffer_size);
+  int idx = free_buffers_idx(buffer_size);
+  // Quantize the buffer size up to the nearest power-of-2 multiple of min_buffer_size_
+  // that covers the specified size, and convert to bytes.
+  buffer_size = (1LL << idx) * min_buffer_size_;
+
+  // Track memory against the reader. This is checked the next time we start
+  // a read for the next reader in DiskIoMgr::GetNextScanRange().
+  DCHECK(reader->mem_tracker_ != nullptr);
+  reader->mem_tracker_->Consume(buffer_size);
+
+  uint8_t* buffer = nullptr;
+  {
+    unique_lock<mutex> lock(free_buffers_lock_);
+    if (free_buffers_[idx].empty()) {
+      num_allocated_buffers_.Add(1);
+      if (ImpaladMetrics::IO_MGR_NUM_BUFFERS != nullptr) {
+        ImpaladMetrics::IO_MGR_NUM_BUFFERS->Increment(1L);
+      }
+      if (ImpaladMetrics::IO_MGR_TOTAL_BYTES != nullptr) {
+        ImpaladMetrics::IO_MGR_TOTAL_BYTES->Increment(buffer_size);
+      }
+      // We already tracked this memory against the reader's MemTracker.
+      buffer = new uint8_t[buffer_size];
+    } else {
+      if (ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS != nullptr) {
+        ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS->Increment(-1L);
+      }
+      buffer = free_buffers_[idx].front();
+      free_buffers_[idx].pop_front();
+      free_buffer_mem_tracker_->Release(buffer_size);
+    }
+  }
+
+  // Validate more invariants.
+  DCHECK(range != nullptr);
+  DCHECK(reader != nullptr);
+  DCHECK(buffer != nullptr);
+  return unique_ptr<BufferDescriptor>(new BufferDescriptor(
+      this, reader, range, buffer, buffer_size, reader->mem_tracker_));
+}
+
+void DiskIoMgr::GcIoBuffers(int64_t bytes_to_free) {
+  unique_lock<mutex> lock(free_buffers_lock_);
+  int buffers_freed = 0;
+  int bytes_freed = 0;
+  // Free small-to-large to avoid retaining many small buffers and fragmenting memory.
+  for (int idx = 0; idx < free_buffers_.size(); ++idx) {
+    deque<uint8_t*>* free_buffers = &free_buffers_[idx];
+    while (
+        !free_buffers->empty() && (bytes_to_free == -1 || bytes_freed <= bytes_to_free)) {
+      uint8_t* buffer = free_buffers->front();
+      free_buffers->pop_front();
+      int64_t buffer_size = (1LL << idx) * min_buffer_size_;
+      delete[] buffer;
+      free_buffer_mem_tracker_->Release(buffer_size);
+      num_allocated_buffers_.Add(-1);
+
+      ++buffers_freed;
+      bytes_freed += buffer_size;
+    }
+    if (bytes_to_free != -1 && bytes_freed >= bytes_to_free) break;
+  }
+
+  if (ImpaladMetrics::IO_MGR_NUM_BUFFERS != nullptr) {
+    ImpaladMetrics::IO_MGR_NUM_BUFFERS->Increment(-buffers_freed);
+  }
+  if (ImpaladMetrics::IO_MGR_TOTAL_BYTES != nullptr) {
+    ImpaladMetrics::IO_MGR_TOTAL_BYTES->Increment(-bytes_freed);
+  }
+  if (ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS != nullptr) {
+    ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS->Increment(-buffers_freed);
+  }
+}
+
+void DiskIoMgr::FreeBufferMemory(BufferDescriptor* desc) {
+  DCHECK(!desc->is_cached());
+  DCHECK(!desc->is_client_buffer());
+  uint8_t* buffer = desc->buffer_;
+  int64_t buffer_size = desc->buffer_len_;
+  int idx = free_buffers_idx(buffer_size);
+  DCHECK_EQ(BitUtil::Ceil(buffer_size, min_buffer_size_) & ~(1LL << idx), 0)
+      << "buffer_size_ / min_buffer_size_ should be power of 2, got buffer_size = "
+      << buffer_size << ", min_buffer_size_ = " << min_buffer_size_;
+
+  {
+    unique_lock<mutex> lock(free_buffers_lock_);
+    if (!FLAGS_disable_mem_pools &&
+        free_buffers_[idx].size() < FLAGS_max_free_io_buffers) {
+      free_buffers_[idx].push_back(buffer);
+      if (ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS != nullptr) {
+        ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS->Increment(1L);
+      }
+      // This consume call needs to be protected by 'free_buffers_lock_' to avoid a race
+      // with a Release() call for the same buffer that could make consumption negative.
+      // Note: we can't use TryConsume(), which can indirectly call GcIoBuffers().
+      // TODO: after IMPALA-3200 is completed, we should be able to leverage the buffer
+      // pool's free lists, and remove these free lists.
+      free_buffer_mem_tracker_->Consume(buffer_size);
+    } else {
+      num_allocated_buffers_.Add(-1);
+      delete[] buffer;
+      if (ImpaladMetrics::IO_MGR_NUM_BUFFERS != nullptr) {
+        ImpaladMetrics::IO_MGR_NUM_BUFFERS->Increment(-1L);
+      }
+      if (ImpaladMetrics::IO_MGR_TOTAL_BYTES != nullptr) {
+        ImpaladMetrics::IO_MGR_TOTAL_BYTES->Increment(-buffer_size);
+      }
+    }
+  }
+
+  // We transferred the buffer ownership from the BufferDescriptor to the DiskIoMgr.
+  desc->mem_tracker_->Release(buffer_size);
+  desc->buffer_ = nullptr;
+}
+
+// This function gets the next RequestRange to work on for this disk. It checks for
+// cancellation and
+// a) Updates ready_to_start_ranges if there are no scan ranges queued for this disk.
+// b) Adds an unstarted write range to in_flight_ranges_. The write range is processed
+//    immediately if there are no preceding scan ranges in in_flight_ranges_
+// It blocks until work is available or the thread is shut down.
+// Work is available if there is a RequestContext with
+//  - A ScanRange with a buffer available, or
+//  - A WriteRange in unstarted_write_ranges_.
+bool DiskIoMgr::GetNextRequestRange(DiskQueue* disk_queue, RequestRange** range,
+    RequestContext** request_context) {
+  int disk_id = disk_queue->disk_id;
+  *range = nullptr;
+
+  // This loop returns either with work to do or when the disk IoMgr shuts down.
+  while (true) {
+    *request_context = nullptr;
+    RequestContext::PerDiskState* request_disk_state = nullptr;
+    {
+      unique_lock<mutex> disk_lock(disk_queue->lock);
+
+      while (!shut_down_ && disk_queue->request_contexts.empty()) {
+        // wait if there are no readers on the queue
+        disk_queue->work_available.Wait(disk_lock);
+      }
+      if (shut_down_) break;
+      DCHECK(!disk_queue->request_contexts.empty());
+
+      // Get the next reader and remove it from the queue so that another disk
+      // thread can't pick it up. The context is re-enqueued (via ScheduleContext()
+      // below) before the read is issued, so multiple disk threads can still
+      // read for the same reader.
+      // TODO: revisit.
+      *request_context = disk_queue->request_contexts.front();
+      disk_queue->request_contexts.pop_front();
+      DCHECK(*request_context != nullptr);
+      request_disk_state = &((*request_context)->disk_states_[disk_id]);
+      request_disk_state->IncrementRequestThreadAndDequeue();
+    }
+
+    // NOTE: no locks are held between releasing the disk lock above and this
+    // point, so the state of the reader and disk may have changed in the meantime.
+    // There are some invariants here.  Only one disk thread can have the
+    // same reader here (the reader is removed from the queue).  There can be
+    // other disk threads operating on this reader in other functions though.
+
+    // We just picked a reader. Before we allocate a buffer on its behalf, check that
+    // it has not exceeded any memory limits (e.g. the query or process limit).
+    // TODO: once IMPALA-3200 is fixed, we should be able to remove the free lists and
+    // move these memory limit checks to GetFreeBuffer().
+    // Note that calling AnyLimitExceeded() can result in a call to GcIoBuffers().
+    // TODO: IMPALA-3209: we should not force a reader over its memory limit by
+    // pushing more buffers to it. Most readers can make progress and operate within
+    // a fixed memory limit.
+    if ((*request_context)->mem_tracker_ != nullptr
+        && (*request_context)->mem_tracker_->AnyLimitExceeded()) {
+      (*request_context)->Cancel(Status::MemLimitExceeded());
+    }
+
+    unique_lock<mutex> request_lock((*request_context)->lock_);
+    VLOG_FILE << "Disk (id=" << disk_id << ") reading for "
+        << (*request_context)->DebugString();
+
+    // Check if reader has been cancelled
+    if ((*request_context)->state_ == RequestContext::Cancelled) {
+      request_disk_state->DecrementRequestThreadAndCheckDone(*request_context);
+      continue;
+    }
+
+    DCHECK_EQ((*request_context)->state_, RequestContext::Active)
+        << (*request_context)->DebugString();
+
+    if (request_disk_state->next_scan_range_to_start() == nullptr &&
+        !request_disk_state->unstarted_scan_ranges()->empty()) {
+      // No range is queued on this disk for the caller to start next, so populate
+      // one. We want to have one range waiting to minimize the wait time in
+      // GetNextRange().
+      ScanRange* new_range = request_disk_state->unstarted_scan_ranges()->Dequeue();
+      (*request_context)->num_unstarted_scan_ranges_.Add(-1);
+      (*request_context)->ready_to_start_ranges_.Enqueue(new_range);
+      request_disk_state->set_next_scan_range_to_start(new_range);
+
+      if ((*request_context)->num_unstarted_scan_ranges_.Load() == 0) {
+        // All the ranges have been started, notify everyone blocked on GetNextRange.
+        // Only one of them will get work so make sure to return nullptr to the other
+        // caller threads.
+        (*request_context)->ready_to_start_ranges_cv_.NotifyAll();
+      } else {
+        (*request_context)->ready_to_start_ranges_cv_.NotifyOne();
+      }
+    }
+
+    // Always enqueue a WriteRange to be processed into in_flight_ranges_.
+    // This is done so in_flight_ranges_ does not exclusively contain ScanRanges.
+    // For now, enqueuing a WriteRange on each invocation of GetNextRequestRange()
+    // does not flood in_flight_ranges() with WriteRanges because the entire
+    // WriteRange is processed and removed from the queue after GetNextRequestRange()
+    // returns. (A DCHECK is used to ensure that writes do not exceed 8MB).
+    if (!request_disk_state->unstarted_write_ranges()->empty()) {
+      WriteRange* write_range = request_disk_state->unstarted_write_ranges()->Dequeue();
+      request_disk_state->in_flight_ranges()->Enqueue(write_range);
+    }
+
+    // Get the next scan range to work on from the reader. Only in_flight_ranges
+    // are eligible since the disk threads do not start new ranges on their own.
+
+    // There are no in-flight ranges; nothing to do.
+    if (request_disk_state->in_flight_ranges()->empty()) {
+      request_disk_state->DecrementRequestThread();
+      continue;
+    }
+    DCHECK_GT(request_disk_state->num_remaining_ranges(), 0);
+    *range = request_disk_state->in_flight_ranges()->Dequeue();
+    DCHECK(*range != nullptr);
+
+    // Now that we've picked a request range, put the context back on the queue so
+    // another thread can pick up another request range for this context.
+    request_disk_state->ScheduleContext(*request_context, disk_id);
+    DCHECK((*request_context)->Validate()) << endl << (*request_context)->DebugString();
+    return true;
+  }
+
+  DCHECK(shut_down_);
+  return false;
+}
+
+void DiskIoMgr::HandleWriteFinished(
+    RequestContext* writer, WriteRange* write_range, const Status& write_status) {
+  // Copy disk_id before running callback: the callback may modify write_range.
+  int disk_id = write_range->disk_id_;
+
+  // Execute the callback before decrementing the thread count. Otherwise CancelContext()
+  // that waits for the disk ref count to be 0 will return, creating a race, e.g. see
+  // IMPALA-1890.
+  // The status of the write does not affect the status of the writer context.
+  write_range->callback_(write_status);
+  {
+    unique_lock<mutex> writer_lock(writer->lock_);
+    DCHECK(writer->Validate()) << endl << writer->DebugString();
+    RequestContext::PerDiskState& state = writer->disk_states_[disk_id];
+    if (writer->state_ == RequestContext::Cancelled) {
+      state.DecrementRequestThreadAndCheckDone(writer);
+    } else {
+      state.DecrementRequestThread();
+    }
+    --state.num_remaining_ranges();
+  }
+}
+
+void DiskIoMgr::HandleReadFinished(DiskQueue* disk_queue, RequestContext* reader,
+    unique_ptr<BufferDescriptor> buffer) {
+  unique_lock<mutex> reader_lock(reader->lock_);
+
+  RequestContext::PerDiskState& state = reader->disk_states_[disk_queue->disk_id];
+  DCHECK(reader->Validate()) << endl << reader->DebugString();
+  DCHECK_GT(state.num_threads_in_op(), 0);
+  DCHECK(buffer->buffer_ != nullptr);
+
+  if (reader->state_ == RequestContext::Cancelled) {
+    state.DecrementRequestThreadAndCheckDone(reader);
+    DCHECK(reader->Validate()) << endl << reader->DebugString();
+    if (!buffer->is_client_buffer()) FreeBufferMemory(buffer.get());
+    buffer->buffer_ = nullptr;
+    ScanRange* scan_range = buffer->scan_range_;
+    scan_range->Cancel(reader->status_);
+    // Enqueue the buffer to use the scan range's buffer cleanup path.
+    scan_range->EnqueueBuffer(reader_lock, move(buffer));
+    return;
+  }
+
+  DCHECK_EQ(reader->state_, RequestContext::Active);
+  DCHECK(buffer->buffer_ != nullptr);
+
+  // Update the reader's scan ranges.  There are three cases here:
+  //  1. Read error
+  //  2. End of scan range
+  //  3. Middle of scan range
+  if (!buffer->status_.ok()) {
+    // Error case
+    if (!buffer->is_client_buffer()) FreeBufferMemory(buffer.get());
+    buffer->buffer_ = nullptr;
+    buffer->eosr_ = true;
+    --state.num_remaining_ranges();
+    buffer->scan_range_->Cancel(buffer->status_);
+  } else if (buffer->eosr_) {
+    --state.num_remaining_ranges();
+  }
+
+  // After calling EnqueueBuffer(), it is no longer valid to read from buffer.
+  // Store the state we need before calling EnqueueBuffer().
+  bool eosr = buffer->eosr_;
+  ScanRange* scan_range = buffer->scan_range_;
+  bool is_cached = buffer->is_cached();
+  bool queue_full = scan_range->EnqueueBuffer(reader_lock, move(buffer));
+  if (eosr) {
+    // For cached buffers, we can't close the range until the cached buffer is returned.
+    // Close() is called from DiskIoMgr::ReturnBuffer().
+    if (!is_cached) scan_range->Close();
+  } else {
+    if (queue_full) {
+      reader->blocked_ranges_.Enqueue(scan_range);
+    } else {
+      reader->ScheduleScanRange(scan_range);
+    }
+  }
+  state.DecrementRequestThread();
+}
+
+void DiskIoMgr::WorkLoop(DiskQueue* disk_queue) {
+  // The thread waits until there is work or the entire system is being shut down.
+  // If there is work, performs the read or write requested and re-enqueues the
+  // requesting context.
+  // Locks are not taken when reading from or writing to disk.
+  // The main loop has three parts:
+  //   1. GetNextRequestRange(): get the next request context (read or write) to
+  //      process and dequeue it.
+  //   2. For the dequeued request, gets the next scan- or write-range to process and
+  //      re-enqueues the request.
+  //   3. Perform the read or write as specified.
+  // Cancellation checking needs to happen in both steps 1 and 3.
+  while (true) {
+    RequestContext* worker_context = nullptr;
+    RequestRange* range = nullptr;
+
+    if (!GetNextRequestRange(disk_queue, &range, &worker_context)) {
+      DCHECK(shut_down_);
+      break;
+    }
+
+    if (range->request_type() == RequestType::READ) {
+      ReadRange(disk_queue, worker_context, static_cast<ScanRange*>(range));
+    } else {
+      DCHECK(range->request_type() == RequestType::WRITE);
+      Write(worker_context, static_cast<WriteRange*>(range));
+    }
+  }
+
+  DCHECK(shut_down_);
+}
+
+// This function reads the specified scan range associated with the
+// specified reader context and disk queue.
+void DiskIoMgr::ReadRange(
+    DiskQueue* disk_queue, RequestContext* reader, ScanRange* range) {
+  int64_t bytes_remaining = range->len_ - range->bytes_read_;
+  DCHECK_GT(bytes_remaining, 0);
+  unique_ptr<BufferDescriptor> buffer_desc;
+  if (range->external_buffer_tag_ == ScanRange::ExternalBufferTag::CLIENT_BUFFER) {
+    buffer_desc = unique_ptr<BufferDescriptor>(new BufferDescriptor(this, reader, range,
+        range->client_buffer_.data, range->client_buffer_.len, nullptr));
+  } else {
+    // Need to allocate a buffer to read into.
+    int64_t buffer_size = ::min(bytes_remaining, static_cast<int64_t>(max_buffer_size_));
+    buffer_desc = TryAllocateNextBufferForRange(disk_queue, reader, range, buffer_size);
+    if (buffer_desc == nullptr) return;
+  }
+  reader->num_used_buffers_.Add(1);
+
+  // No locks in this section.  Only working on local vars.  We don't want to hold a
+  // lock across the read call.
+  buffer_desc->status_ = range->Open(detail::is_file_handle_caching_enabled());
+  if (buffer_desc->status_.ok()) {
+    // Update counters.
+    if (reader->active_read_thread_counter_) {
+      reader->active_read_thread_counter_->Add(1L);
+    }
+    if (reader->disks_accessed_bitmap_) {
+      int64_t disk_bit = 1LL << disk_queue->disk_id;
+      reader->disks_accessed_bitmap_->BitOr(disk_bit);
+    }
+    SCOPED_TIMER(&read_timer_);
+    SCOPED_TIMER(reader->read_timer_);
+
+    buffer_desc->status_ = range->Read(buffer_desc->buffer_, buffer_desc->buffer_len_,
+        &buffer_desc->len_, &buffer_desc->eosr_);
+    buffer_desc->scan_range_offset_ = range->bytes_read_ - buffer_desc->len_;
+
+    if (reader->bytes_read_counter_ != nullptr) {
+      COUNTER_ADD(reader->bytes_read_counter_, buffer_desc->len_);
+    }
+
+    COUNTER_ADD(&total_bytes_read_counter_, buffer_desc->len_);
+    if (reader->active_read_thread_counter_) {
+      reader->active_read_thread_counter_->Add(-1L);
+    }
+  }
+
+  // Finished read, update reader/disk based on the results
+  HandleReadFinished(disk_queue, reader, move(buffer_desc));
+}
+
+unique_ptr<BufferDescriptor> DiskIoMgr::TryAllocateNextBufferForRange(
+    DiskQueue* disk_queue, RequestContext* reader, ScanRange* range,
+    int64_t buffer_size) {
+  DCHECK(reader->mem_tracker_ != nullptr);
+  bool enough_memory = reader->mem_tracker_->SpareCapacity() > LOW_MEMORY;
+  if (!enough_memory) {
+    // Low memory, GC all the buffers and try again.
+    GcIoBuffers();
+    enough_memory = reader->mem_tracker_->SpareCapacity() > LOW_MEMORY;
+  }
+
+  if (!enough_memory) {
+    RequestContext::PerDiskState& state = reader->disk_states_[disk_queue->disk_id];
+    unique_lock<mutex> reader_lock(reader->lock_);
+
+    // Just grabbed the reader lock, check for cancellation.
+    if (reader->state_ == RequestContext::Cancelled) {
+      DCHECK(reader->Validate()) << endl << reader->DebugString();
+      state.DecrementRequestThreadAndCheckDone(reader);
+      range->Cancel(reader->status_);
+      DCHECK(reader->Validate()) << endl << reader->DebugString();
+      return nullptr;
+    }
+
+    if (!range->ready_buffers_.empty()) {
+      // We have memory pressure and this range doesn't need another buffer
+      // (it already has one queued). Skip this range and pick it up later.
+      range->blocked_on_queue_ = true;
+      reader->blocked_ranges_.Enqueue(range);
+      state.DecrementRequestThread();
+      return nullptr;
+    } else {
+      // We need to get a buffer anyway since there are none queued. The query
+      // is likely to fail due to mem limits but there's nothing we can do about that
+      // now.
+    }
+  }
+  unique_ptr<BufferDescriptor> buffer_desc = GetFreeBuffer(reader, range, buffer_size);
+  DCHECK(buffer_desc != nullptr);
+  return buffer_desc;
+}
+
+void DiskIoMgr::Write(RequestContext* writer_context, WriteRange* write_range) {
+  Status ret_status = Status::OK();
+  FILE* file_handle = nullptr;
+  // The raw open() syscall will create the file if it is not present, given these flags.
+  int fd = open(write_range->file(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+  if (fd < 0) {
+    ret_status = Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
+        Substitute("Opening '$0' for write failed with errno=$1 description=$2",
+                                     write_range->file_, errno, GetStrErrMsg())));
+  } else {
+    file_handle = fdopen(fd, "wb");
+    if (file_handle == nullptr) {
+      ret_status = Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
+          Substitute("fdopen($0, \"wb\") failed with errno=$1 description=$2", fd, errno,
+                                       GetStrErrMsg())));
+    }
+  }
+
+  if (file_handle != nullptr) {
+    ret_status = WriteRangeHelper(file_handle, write_range);
+
+    int success = fclose(file_handle);
+    if (ret_status.ok() && success != 0) {
+      ret_status = Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
+          Substitute("fclose($0) failed", write_range->file_)));
+    }
+  }
+
+  HandleWriteFinished(writer_context, write_range, ret_status);
+}
+
+Status DiskIoMgr::WriteRangeHelper(FILE* file_handle, WriteRange* write_range) {
+  // Seek to the correct offset and perform the write.
+  int success = fseek(file_handle, write_range->offset(), SEEK_SET);
+  if (success != 0) {
+    return Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
+        Substitute("fseek($0, $1, SEEK_SET) failed with errno=$2 description=$3",
+        write_range->file_, write_range->offset(), errno, GetStrErrMsg())));
+  }
+
+#ifndef NDEBUG
+  if (FLAGS_stress_scratch_write_delay_ms > 0) {
+    SleepForMs(FLAGS_stress_scratch_write_delay_ms);
+  }
+#endif
+  int64_t bytes_written = fwrite(write_range->data_, 1, write_range->len_, file_handle);
+  if (bytes_written < write_range->len_) {
+    return Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
+        Substitute("fwrite(buffer, 1, $0, $1) failed with errno=$2 description=$3",
+        write_range->len_, write_range->file_, errno, GetStrErrMsg())));
+  }
+  if (ImpaladMetrics::IO_MGR_BYTES_WRITTEN != nullptr) {
+    ImpaladMetrics::IO_MGR_BYTES_WRITTEN->Increment(write_range->len_);
+  }
+
+  return Status::OK();
+}
+
+int DiskIoMgr::free_buffers_idx(int64_t buffer_size) {
+  int64_t buffer_size_scaled = BitUtil::Ceil(buffer_size, min_buffer_size_);
+  int idx = BitUtil::Log2Ceiling64(buffer_size_scaled);
+  DCHECK_GE(idx, 0);
+  DCHECK_LT(idx, free_buffers_.size());
+  return idx;
+}
+
+Status DiskIoMgr::AddWriteRange(RequestContext* writer, WriteRange* write_range) {
+  unique_lock<mutex> writer_lock(writer->lock_);
+
+  if (writer->state_ == RequestContext::Cancelled) {
+    DCHECK(!writer->status_.ok());
+    return writer->status_;
+  }
+
+  writer->AddRequestRange(write_range, false);
+  return Status::OK();
+}
+
+int DiskIoMgr::AssignQueue(const char* file, int disk_id, bool expected_local) {
+  // If it's a remote range, check for an appropriate remote disk queue.
+  if (!expected_local) {
+    if (IsHdfsPath(file) && FLAGS_num_remote_hdfs_io_threads > 0) {
+      return RemoteDfsDiskId();
+    }
+    if (IsS3APath(file)) return RemoteS3DiskId();
+    if (IsADLSPath(file)) return RemoteAdlsDiskId();
+  }
+  // Assign to a local disk queue.
+  DCHECK(!IsS3APath(file)); // S3 is always remote.
+  DCHECK(!IsADLSPath(file)); // ADLS is always remote.
+  if (disk_id == -1) {
+    // disk id is unknown, assign it an arbitrary one.
+    disk_id = next_disk_id_.Add(1);
+  }
+  // TODO: we need to parse the config for the number of dirs configured for this
+  // data node.
+  return disk_id % num_local_disks();
+}
+
+HdfsFileHandle* DiskIoMgr::GetCachedHdfsFileHandle(const hdfsFS& fs,
+    std::string* fname, int64_t mtime, RequestContext *reader,
+    bool require_new) {
+  bool cache_hit;
+  HdfsFileHandle* fh = file_handle_cache_.GetFileHandle(fs, fname, mtime, require_new,
+      &cache_hit);
+  if (fh == nullptr) return nullptr;
+  if (cache_hit) {
+    DCHECK(!require_new);
+    ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_HIT_RATIO->Update(1L);
+    ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_HIT_COUNT->Increment(1L);
+    reader->cached_file_handles_hit_count_.Add(1L);
+  } else {
+    ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_HIT_RATIO->Update(0L);
+    ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_MISS_COUNT->Increment(1L);
+    reader->cached_file_handles_miss_count_.Add(1L);
+  }
+  return fh;
+}
+
+void DiskIoMgr::ReleaseCachedHdfsFileHandle(std::string* fname, HdfsFileHandle* fid,
+    bool destroy_handle) {
+  file_handle_cache_.ReleaseFileHandle(fname, fid, destroy_handle);
+}
+
+Status DiskIoMgr::ReopenCachedHdfsFileHandle(const hdfsFS& fs, std::string* fname,
+    int64_t mtime, HdfsFileHandle** fid) {
+  bool cache_hit;
+  file_handle_cache_.ReleaseFileHandle(fname, *fid, true);
+  // The old handle has been destroyed, so *fid must be overwritten before returning.
+  *fid = file_handle_cache_.GetFileHandle(fs, fname, mtime, true,
+      &cache_hit);
+  if (*fid == nullptr) {
+    return Status(TErrorCode::DISK_IO_ERROR,
+        GetHdfsErrorMsg("Failed to open HDFS file ", fname->data()));
+  }
+  DCHECK(!cache_hit);
+  return Status::OK();
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/disk-io-mgr.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr.h b/be/src/runtime/io/disk-io-mgr.h
new file mode 100644
index 0000000..71dc840
--- /dev/null
+++ b/be/src/runtime/io/disk-io-mgr.h
@@ -0,0 +1,550 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_IO_DISK_IO_MGR_H
+#define IMPALA_RUNTIME_IO_DISK_IO_MGR_H
+
+#include <deque>
+#include <functional>
+#include <vector>
+
+#include <boost/scoped_ptr.hpp>
+#include <boost/unordered_set.hpp>
+#include <boost/thread/mutex.hpp>
+
+#include "common/atomic.h"
+#include "common/hdfs.h"
+#include "common/object-pool.h"
+#include "common/status.h"
+#include "runtime/io/handle-cache.h"
+#include "runtime/io/request-ranges.h"
+#include "runtime/thread-resource-mgr.h"
+#include "util/aligned-new.h"
+#include "util/bit-util.h"
+#include "util/condition-variable.h"
+#include "util/error-util.h"
+#include "util/runtime-profile.h"
+#include "util/thread.h"
+
+namespace impala {
+
+class MemTracker;
+
+namespace io {
+/// Manager object that schedules IO for all queries on all disks and remote filesystems
+/// (such as S3). Each query maps to one or more RequestContext objects, each of which
+/// has its own queue of scan ranges and/or write ranges.
+//
+/// The API splits up requesting scan/write ranges (non-blocking) and reading the data
+/// (blocking). The DiskIoMgr has worker threads that will read from and write to
+/// disk/hdfs/remote-filesystems, allowing interleaving of IO and CPU. This allows us to
+/// keep all disks and all cores as busy as possible.
+//
+/// All public APIs are thread-safe. It is not valid to call any of the APIs after
+/// UnregisterContext() returns.
+//
+/// For Readers:
+/// We can model this problem as a multiple producer (threads for each disk), multiple
+/// consumer (scan ranges) problem. There are multiple queues that need to be
+/// synchronized. Conceptually, there are two queues:
+///   1. The per disk queue: this contains a queue of readers that need reads.
+///   2. The per scan range ready-buffer queue: this contains buffers that have been
+///      read and are ready for the caller.
+/// The disk queue contains a queue of readers and is scheduled in a round robin fashion.
+/// Readers map to scan nodes. The reader then contains a queue of scan ranges. The caller
+/// asks the IoMgr for the next range to process. The IoMgr then selects the best range
+/// to read based on disk activity and begins reading and queuing buffers for that range.
+/// TODO: We should map readers to queries. A reader is the unit of scheduling and queries
+/// that have multiple scan nodes shouldn't have more 'turns'.
+//
+/// For Writers:
+/// Data is written via AddWriteRange(). This is non-blocking and adds a WriteRange to a
+/// per-disk queue. After the write is complete, a callback in WriteRange is invoked.
+/// No memory is allocated within IoMgr for writes and no copies are made. It is the
+/// responsibility of the client to ensure that the data to be written is valid and that
+/// the file to be written to exists until the callback is invoked.
+//
+/// The IoMgr provides three key APIs.
+///  1. AddScanRanges: this is non-blocking and tells the IoMgr all the ranges that
+///     will eventually need to be read.
+///  2. GetNextRange: returns to the caller the next scan range it should process.
+///     This is based on disk load. This also begins reading the data in this scan
+///     range. This is blocking.
+///  3. ScanRange::GetNext: returns the next buffer for this range.  This is blocking.
+//
+/// The disk threads do not synchronize with each other. The readers and writers don't
+/// synchronize with each other. There is a lock and condition variable for each request
+/// context queue and each disk queue.
+/// IMPORTANT: whenever both locks are needed, the lock order is to grab the context lock
+/// before the disk lock.
+//
+/// Scheduling: If there are multiple request contexts with work for a single disk, the
+/// request contexts are scheduled in round-robin order. Multiple disk threads can
+/// operate on the same request context. Exactly one request range is processed by a
+/// disk thread at a time. If there are multiple scan ranges scheduled via
+/// GetNextRange() for a single context, these are processed in round-robin order.
+/// If there are multiple scan and write ranges for a disk, a read is always followed
+/// by a write, and a write is followed by a read, i.e. reads and writes alternate.
+/// If multiple write ranges are enqueued for a single disk, they will be processed
+/// by the disk threads in order, but may complete in any order. No guarantees are made
+/// on ordering of writes across disks.
+//
+/// Resource Management: effective resource management in the IoMgr is key to good
+/// performance. The IoMgr helps coordinate two resources: CPU and disk. For CPU,
+/// spinning up too many threads causes thrashing.
+/// Memory usage in the IoMgr comes from queued read buffers.  If we queue the minimum
+/// (i.e. 1), then the disks are idle while we are processing the buffer. If we don't
+/// limit the queue, then it is possible we end up queueing the entire data set (i.e. CPU
+/// is slower than disks) and run out of memory.
+/// For both CPU and memory, we want to model the machine as having a fixed amount of
+/// resources.  If a single query is running, it should saturate either CPU or Disk
+/// as well as using as little memory as possible. With multiple queries, each query
+/// should get less CPU. In that case each query will need fewer queued buffers and
+/// therefore have less memory usage.
+//
+/// The IoMgr defers CPU management to the caller. The IoMgr provides a GetNextRange
+/// API which will return the next scan range the caller should process. The caller
+/// can call this from the desired number of reading threads. Once a scan range
+/// has been returned via GetNextRange, the IoMgr will start to buffer reads for
+/// that range and it is expected the caller will pull those buffers promptly. For
+/// example, if the caller would like to have 1 scanner thread, the read loop
+/// would look like:
+///   while (more_ranges)
+///     range = GetNextRange()
+///     while (!range.eosr)
+///       buffer = range.GetNext()
+/// To have multiple reading threads, the caller would simply spin up the threads
+/// and each would process the loops above.
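+///
+/// A slightly more concrete C++ sketch of that loop (illustrative only: error
+/// handling and buffer processing are elided, and 'io_mgr' and 'reader' are
+/// assumed to have been set up by the caller):
+///
+///   ScanRange* range;
+///   while (io_mgr->GetNextRange(reader, &range).ok() && range != nullptr) {
+///     std::unique_ptr<BufferDescriptor> buffer;
+///     bool eosr = false;
+///     while (!eosr) {
+///       if (!range->GetNext(&buffer).ok()) break;
+///       eosr = buffer->eosr();
+///       // ... process buffer->buffer() / buffer->len() ...
+///       io_mgr->ReturnBuffer(std::move(buffer));
+///     }
+///   }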
+//
+/// To control the number of IO buffers, each scan range has a limit of two queued
+/// buffers (SCAN_RANGE_READY_BUFFER_LIMIT). If the number of buffers is at capacity,
+/// the IoMgr will no longer read for that scan range until the caller has processed
+/// a buffer. Assuming the client returns each buffer before requesting the next one
+/// from the scan range, then this will consume up to 3 * 8MB = 24MB of I/O buffers per
+/// scan range.
+//
+/// Buffer Management:
+/// Buffers for reads are either a) allocated by the IoMgr and transferred to the caller,
+/// b) cached HDFS buffers if the scan range uses HDFS caching, or c) provided by the
+/// caller when constructing the scan range.
+///
+/// As a caller reads from a scan range, these buffers are wrapped in BufferDescriptors
+/// and returned to the caller. The caller must always call ReturnBuffer() on the buffer
+/// descriptor to allow recycling of the associated buffer (if there is an
+/// IoMgr-allocated or HDFS cached buffer).
+///
+/// Caching support:
+/// Scan ranges contain metadata on whether or not the range is cached on the DN. In that
+/// case, we use the HDFS APIs to read the cached data without doing any copies. For these
+/// ranges, the reads happen on the caller thread (as opposed to the disk threads).
+/// It is possible for the cached read APIs to fail, in which case the ranges are then
+/// queued on the disk threads and behave identically to the case where the range
+/// is not cached.
+/// Resources for these ranges are also not accounted against the reader because none
+/// are consumed.
+/// While a cached block is being processed, the block is mlocked. We want to minimize
+/// the time the mlock is held.
+///   - HDFS will time us out if we hold onto the mlock for too long
+///   - Holding the lock prevents uncaching this file due to a caching policy change.
+/// Therefore, we only issue the cached read when the caller is ready to process the
+/// range (GetNextRange()) instead of when the ranges are issued. This guarantees that
+/// there will be a CPU available to process the buffer and any throttling we do with
+/// the number of scanner threads properly controls the number of files we mlock.
+/// With cached scan ranges, we cannot close the scan range until the cached buffer
+/// is returned (HDFS does not allow this). We therefore need to defer the close until
+/// the cached buffer is returned (ReturnBuffer()).
+//
+/// Remote filesystem support (e.g. S3):
+/// Remote filesystems are modeled as "remote disks". That is, there is a separate disk
+/// queue for each supported remote filesystem type. In order to maximize throughput,
+/// multiple connections are opened in parallel by having multiple threads running per
+/// queue. Also note that reading from a remote filesystem service can be more CPU
+/// intensive than local disk/hdfs because of non-direct I/O and SSL processing, and can
+/// be CPU bottlenecked especially if not enough I/O threads for these queues are
+/// started.
+//
+/// TODO: IoMgr should be able to request additional scan ranges from the coordinator
+/// to help deal with stragglers.
+/// TODO: look into using a lock free queue
+/// TODO: simplify the common path (less locking, memory allocations).
+//
+/// Structure of the Implementation:
+///  - All client APIs are defined in this file, request-ranges.h and request-context.h.
+///    Clients can include only the files that they need.
+///  - Some internal classes are defined in disk-io-mgr-internal.h
+///  - ScanRange APIs are implemented in scan-range.cc
+///    This contains the ready buffer queue logic
+///  - RequestContext APIs are implemented in request-context.cc
+///    This contains the logic for picking scan ranges for a reader.
+///  - Disk Thread and general APIs are implemented in disk-io-mgr.cc.
+///  - The handle cache is implemented in handle-cache{.inline,}.h
+
+// This is cache line aligned because the FileHandleCache needs cache line alignment
+// for its partitions.
+class DiskIoMgr : public CacheLineAligned {
+ public:
+  /// Create a DiskIoMgr object. This constructor is only used for testing.
+  ///  - num_disks: The number of disks the IoMgr should use. This is used for testing.
+  ///    Specify 0 to have the disk IoMgr query the OS for the number of disks.
+  ///  - threads_per_rotational_disk: number of read threads to create per rotational
+  ///    disk. This is also the max queue depth.
+  ///  - threads_per_solid_state_disk: number of read threads to create per solid state
+  ///    disk. This is also the max queue depth.
+  ///  - min_buffer_size: minimum io buffer size (in bytes)
+  ///  - max_buffer_size: maximum io buffer size (in bytes). Also the max read size.
+  DiskIoMgr(int num_disks, int threads_per_rotational_disk,
+      int threads_per_solid_state_disk, int min_buffer_size, int max_buffer_size);
+
+  /// Create DiskIoMgr with default configs.
+  DiskIoMgr();
+
+  /// Clean up all threads and resources. This is mostly useful for testing since
+  /// for impalad, this object is never destroyed.
+  ~DiskIoMgr();
+
+  /// Initialize the IoMgr. Must be called once before any of the other APIs.
+  Status Init(MemTracker* process_mem_tracker) WARN_UNUSED_RESULT;
+
+  /// Allocates a tracking structure for a request context.
+  /// Registers a new request context and returns it to the caller. The caller must call
+  /// UnregisterContext() for each context.
+  /// reader_mem_tracker: Is non-null only for readers. IO buffers
+  ///    used for this reader will be tracked by this. If the limit is exceeded
+  ///    the reader will be cancelled and MEM_LIMIT_EXCEEDED will be returned via
+  ///    GetNext().
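+  ///
+  /// Typical lifecycle, sketched (illustrative; 'io_mgr', 'tracker' and 'ranges'
+  /// are assumed to exist and error handling is elided):
+  ///   std::unique_ptr<RequestContext> reader = io_mgr->RegisterContext(tracker);
+  ///   RETURN_IF_ERROR(io_mgr->AddScanRanges(reader.get(), ranges));
+  ///   // ... GetNextRange() / ScanRange::GetNext() / ReturnBuffer() loop ...
+  ///   io_mgr->UnregisterContext(reader.get());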
+  std::unique_ptr<RequestContext> RegisterContext(MemTracker* reader_mem_tracker);
+
+  /// Unregisters context from the disk IoMgr by first cancelling it then blocking until
+  /// all references to the context are removed from I/O manager internal data structures.
+  /// This must be called for every RegisterContext() to ensure that the context object
+  /// can be safely destroyed. It is invalid to add more request ranges to 'context' after
+  /// after this call. This call blocks until all the disk threads have finished cleaning
+  /// up.
+  void UnregisterContext(RequestContext* context);
+
+  /// This function cancels the context asynchronously. All outstanding requests
+  /// are aborted and tracking structures cleaned up. This does not need to be
+  /// called if the context finishes normally.
+  /// This will also fail any outstanding GetNext()/Read requests.
+  void CancelContext(RequestContext* context);
+
+  /// Adds the scan ranges to the queues. This call is non-blocking. The caller must
+  /// not deallocate the scan range pointers before UnregisterContext().
+  /// If schedule_immediately, the ranges are immediately put on the read queue
+  /// (i.e. the caller should not/cannot call GetNextRange for these ranges).
+  /// This can be used to do synchronous reads as well as schedule dependent ranges,
+  /// as in the case for columnar formats.
+  Status AddScanRanges(RequestContext* reader,
+      const std::vector<ScanRange*>& ranges,
+      bool schedule_immediately = false) WARN_UNUSED_RESULT;
+  Status AddScanRange(RequestContext* reader, ScanRange* range,
+      bool schedule_immediately = false) WARN_UNUSED_RESULT;
+
+  /// Add a WriteRange for the writer. This is non-blocking and schedules the context
+  /// on the IoMgr disk queue. Does not create any files.
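+  ///
+  /// Illustrative sketch ('data' and 'len' are placeholders; per the contract
+  /// above, the data must remain valid until the callback is invoked):
+  ///   write_range->SetData(data, len);
+  ///   RETURN_IF_ERROR(io_mgr->AddWriteRange(writer, write_range));
+  ///   // write_range's callback fires on a disk thread once the write finishes.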
+  Status AddWriteRange(
+      RequestContext* writer, WriteRange* write_range) WARN_UNUSED_RESULT;
+
+  /// Returns the next unstarted scan range for this reader. When the range is returned,
+  /// the disk threads in the IoMgr will already have started reading from it. The
+  /// caller is expected to call ScanRange::GetNext on the returned range.
+  /// If there are no more unstarted ranges, nullptr is returned.
+  /// This call is blocking.
+  Status GetNextRange(RequestContext* reader, ScanRange** range) WARN_UNUSED_RESULT;
+
+  /// Reads the range and returns the result in buffer.
+  /// This behaves like the typical synchronous read() API, blocking until the data
+  /// is read. This can be called while there are outstanding ScanRanges and is
+  /// thread safe. Multiple threads can be calling Read() per reader at a time.
+  /// range *cannot* have already been added via AddScanRanges.
+  /// This can only be used if the scan range fits in a single IO buffer (i.e. is smaller
+  /// than max_read_buffer_size()) or if reading into a client-provided buffer.
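+  ///
+  /// Illustrative call sequence (a sketch; assumes 'range' was initialized via
+  /// ScanRange::Reset() and was not added via AddScanRanges()):
+  ///   std::unique_ptr<BufferDescriptor> buffer;
+  ///   RETURN_IF_ERROR(io_mgr->Read(reader, range, &buffer));
+  ///   // ... consume buffer->buffer() / buffer->len() ...
+  ///   io_mgr->ReturnBuffer(std::move(buffer));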
+  Status Read(RequestContext* reader, ScanRange* range,
+      std::unique_ptr<BufferDescriptor>* buffer) WARN_UNUSED_RESULT;
+
+  /// Returns the buffer to the IoMgr. This must be called for every buffer
+  /// returned by GetNext()/Read() that did not return an error. This is non-blocking.
+  /// After calling this, the buffer descriptor is invalid and cannot be accessed.
+  void ReturnBuffer(std::unique_ptr<BufferDescriptor> buffer);
+
+  /// Determine which disk queue this file should be assigned to.  Returns an index into
+  /// disk_queues_.  The disk_id is the volume ID for the local disk that holds the
+  /// files, or -1 if unknown.  Flag expected_local is true iff this impalad is
+  /// co-located with the datanode for this file.
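+  ///
+  /// For instance (illustrative numbers): on a host with 12 local disks, an
+  /// s3a:// path maps to RemoteS3DiskId() = num_local_disks() +
+  /// REMOTE_S3_DISK_OFFSET = 13, while a local file with unknown volume ID
+  /// (disk_id == -1) is assigned round-robin across the 12 local disk queues.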
+  int AssignQueue(const char* file, int disk_id, bool expected_local);
+
+  /// TODO: The functions below can be moved to RequestContext.
+  /// Returns the current status of the context.
+  Status context_status(RequestContext* context) const WARN_UNUSED_RESULT;
+
+  void set_bytes_read_counter(RequestContext*, RuntimeProfile::Counter*);
+  void set_read_timer(RequestContext*, RuntimeProfile::Counter*);
+  void set_active_read_thread_counter(RequestContext*, RuntimeProfile::Counter*);
+  void set_disks_access_bitmap(RequestContext*, RuntimeProfile::Counter*);
+
+  int64_t queue_size(RequestContext* reader) const;
+  int64_t bytes_read_local(RequestContext* reader) const;
+  int64_t bytes_read_short_circuit(RequestContext* reader) const;
+  int64_t bytes_read_dn_cache(RequestContext* reader) const;
+  int num_remote_ranges(RequestContext* reader) const;
+  int64_t unexpected_remote_bytes(RequestContext* reader) const;
+  int cached_file_handles_hit_count(RequestContext* reader) const;
+  int cached_file_handles_miss_count(RequestContext* reader) const;
+
+  /// Returns the read throughput across all readers.
+  /// TODO: should this be a sliding window?  This should report metrics for the
+  /// last minute, hour and since the beginning.
+  int64_t GetReadThroughput();
+
+  /// Returns the maximum read buffer size
+  int max_read_buffer_size() const { return max_buffer_size_; }
+
+  /// Returns the total number of disk queues (both local and remote).
+  int num_total_disks() const { return disk_queues_.size(); }
+
+  /// Returns the total number of remote "disk" queues.
+  int num_remote_disks() const { return REMOTE_NUM_DISKS; }
+
+  /// Returns the number of local disks attached to the system.
+  int num_local_disks() const { return num_total_disks() - num_remote_disks(); }
+
+  /// The disk ID (and therefore disk_queues_ index) used for DFS accesses.
+  int RemoteDfsDiskId() const { return num_local_disks() + REMOTE_DFS_DISK_OFFSET; }
+
+  /// The disk ID (and therefore disk_queues_ index) used for S3 accesses.
+  int RemoteS3DiskId() const { return num_local_disks() + REMOTE_S3_DISK_OFFSET; }
+
+  /// The disk ID (and therefore disk_queues_ index) used for ADLS accesses.
+  int RemoteAdlsDiskId() const { return num_local_disks() + REMOTE_ADLS_DISK_OFFSET; }
+
+  /// Dumps the disk IoMgr queues (for readers and disks)
+  std::string DebugString();
+
+  /// Validates the internal state is consistent. This is intended to only be used
+  /// for debugging.
+  bool Validate() const;
+
+  /// Given a FS handle, name and last modified time of the file, gets an HdfsFileHandle
+  /// from the file handle cache. If 'require_new_handle' is true, the cache will open
+  /// a fresh file handle. On success, records statistics about whether this was
+  /// a cache hit or miss in the 'reader' as well as at the system level. In case of an
+  /// error returns nullptr.
+  HdfsFileHandle* GetCachedHdfsFileHandle(const hdfsFS& fs,
+      std::string* fname, int64_t mtime, RequestContext *reader,
+      bool require_new_handle);
+
+  /// Releases a file handle back to the file handle cache when it is no longer in use.
+  /// If 'destroy_handle' is true, the file handle cache will close the file handle
+  /// immediately.
+  void ReleaseCachedHdfsFileHandle(std::string* fname, HdfsFileHandle* fid,
+      bool destroy_handle);
+
+  /// Reopens a file handle by destroying the file handle and getting a fresh
+  /// file handle from the cache. Returns an error if the file could not be reopened.
+  Status ReopenCachedHdfsFileHandle(const hdfsFS& fs, std::string* fname, int64_t mtime,
+      HdfsFileHandle** fid);
+
+  /// Garbage collect unused I/O buffers up to 'bytes_to_free', or all the buffers if
+  /// 'bytes_to_free' is -1.
+  void GcIoBuffers(int64_t bytes_to_free = -1);
+
+  /// The maximum number of ready buffers that can be queued in a scan range. Having two
+  /// queued buffers (plus the buffer that is returned to the client) gives good
+  /// performance in most scenarios:
+  /// 1. If the consumer is consuming data faster than we can read from disk, then the
+  ///    queue will be empty most of the time because the buffer will be immediately
+  ///    pulled off the queue as soon as it is added. There will always be an I/O request
+  ///    in the disk queue to maximize I/O throughput, which is the bottleneck in this
+  ///    case.
+  /// 2. If we can read from disk faster than the consumer is consuming data, the queue
+  ///    will fill up and there will always be a buffer available for the consumer to
+  ///    read, so the consumer will not block and we maximize consumer throughput, which
+  ///    is the bottleneck in this case.
+  /// 3. If the consumer is consuming data at approximately the same rate as we are
+  ///    reading from disk, then the steady state is that the consumer is processing one
+  ///    buffer and one buffer is in the disk queue. The additional buffer can absorb
+  ///    bursts where the producer runs faster than the consumer or the consumer runs
+  ///    faster than the producer without blocking either the producer or consumer.
+  static const int SCAN_RANGE_READY_BUFFER_LIMIT = 2;
+
+  /// "Disk" queue offsets for remote accesses.  Offset 0 corresponds to
+  /// disk ID (i.e. disk_queue_ index) of num_local_disks().
+  enum {
+    REMOTE_DFS_DISK_OFFSET = 0,
+    REMOTE_S3_DISK_OFFSET,
+    REMOTE_ADLS_DISK_OFFSET,
+    REMOTE_NUM_DISKS
+  };
+
+ private:
+  friend class BufferDescriptor;
+  friend class RequestContext;
+  // TODO: remove io:: prefix - it is required for the "using ScanRange" workaround above.
+  friend class io::ScanRange;
+  struct DiskQueue;
+
+  friend class DiskIoMgrTest_Buffers_Test;
+  friend class DiskIoMgrTest_VerifyNumThreadsParameter_Test;
+
+  /// Memory tracker for unused I/O buffers owned by DiskIoMgr.
+  boost::scoped_ptr<MemTracker> free_buffer_mem_tracker_;
+
+  /// Memory tracker for I/O buffers where the RequestContext has no MemTracker.
+  /// TODO: once IMPALA-3200 is fixed, there should be no more cases where readers don't
+  /// provide a MemTracker.
+  boost::scoped_ptr<MemTracker> unowned_buffer_mem_tracker_;
+
+  /// Number of worker(read) threads per rotational disk. Also the max depth of queued
+  /// work to the disk.
+  const int num_io_threads_per_rotational_disk_;
+
+  /// Number of worker(read) threads per solid state disk. Also the max depth of queued
+  /// work to the disk.
+  const int num_io_threads_per_solid_state_disk_;
+
+  /// Maximum read size. This is also the maximum size of each allocated buffer.
+  const int max_buffer_size_;
+
+  /// The minimum size of each read buffer.
+  const int min_buffer_size_;
+
+  /// Thread group containing all the worker threads.
+  ThreadGroup disk_thread_group_;
+
+  /// Options object for cached hdfs reads. Set on startup and never modified.
+  struct hadoopRzOptions* cached_read_options_ = nullptr;
+
+  /// True if the IoMgr should be torn down. Worker threads watch for this to
+  /// know to terminate. This variable is read/written to by different threads.
+  volatile bool shut_down_;
+
+  /// Total bytes read by the IoMgr.
+  RuntimeProfile::Counter total_bytes_read_counter_;
+
+  /// Total time spent in hdfs reading
+  RuntimeProfile::Counter read_timer_;
+
+  /// Protects free_buffers_
+  boost::mutex free_buffers_lock_;
+
+  /// Free buffers that can be handed out to clients. There is one list for each buffer
+  /// size, indexed by the Log2 of the buffer size in units of min_buffer_size_. The
+  /// maximum buffer size is max_buffer_size_, so the maximum index is
+  /// Log2(max_buffer_size_ / min_buffer_size_).
+  //
+  /// E.g. if min_buffer_size_ = 1024 bytes:
+  ///  free_buffers_[0]  => list of free buffers with size 1024 B
+  ///  free_buffers_[1]  => list of free buffers with size 2048 B
+  ///  free_buffers_[10] => list of free buffers with size 1 MB
+  ///  free_buffers_[13] => list of free buffers with size 8 MB
+  ///  free_buffers_[n]  => list of free buffers with size 2^n * 1024 B
+  std::vector<std::deque<uint8_t*>> free_buffers_;
+
+  /// Total number of allocated buffers, used for debugging.
+  AtomicInt32 num_allocated_buffers_;
+
+  /// Total number of buffers in readers
+  AtomicInt32 num_buffers_in_readers_;
+
+  /// Per disk queues. This is static and created once at Init() time.  One queue is
+  /// allocated for each local disk on the system and for each remote filesystem type.
+  /// It is indexed by disk id.
+  std::vector<DiskQueue*> disk_queues_;
+
+  /// The next disk queue to write to if the actual 'disk_id_' is unknown (i.e. the file
+  /// is not associated with a particular local disk or remote queue). Used to implement
+  /// round-robin assignment for that case.
+  static AtomicInt32 next_disk_id_;
+
+  // Number of file handle cache partitions to use
+  static const size_t NUM_FILE_HANDLE_CACHE_PARTITIONS = 16;
+
+  // Caching structure that maps file names to cached file handles. The cache has an upper
+  // limit of entries defined by FLAGS_max_cached_file_handles. Evicted cached file
+  // handles are closed.
+  FileHandleCache<NUM_FILE_HANDLE_CACHE_PARTITIONS> file_handle_cache_;
+
+  /// Returns the index into free_buffers_ for a given buffer size
+  int free_buffers_idx(int64_t buffer_size);
+
+  /// Returns a buffer to read into with size between 'buffer_size' and
+  /// 'max_buffer_size_'. If there is an appropriately-sized free buffer in
+  /// 'free_buffers_', that is returned; otherwise a new one is allocated.
+  /// The returned buffer size is rounded up to the nearest power-of-two multiple
+  /// of 'min_buffer_size_', capped at 'max_buffer_size_'.
+  /// The buffer memory is tracked against reader's mem tracker, or
+  /// 'unowned_buffer_mem_tracker_' if the reader does not have one.
+  std::unique_ptr<BufferDescriptor> GetFreeBuffer(
+      RequestContext* reader, ScanRange* range, int64_t buffer_size);
+
+  /// Disassociates the desc->buffer_ memory from 'desc' (which cannot be nullptr), either
+  /// freeing it or returning it to 'free_buffers_'. Memory tracking is updated to
+  /// reflect the transfer of ownership from desc->mem_tracker_ to the disk I/O mgr.
+  void FreeBufferMemory(BufferDescriptor* desc);
+
+  /// Disk worker thread loop. This function retrieves the next range to process on
+  /// the disk queue and invokes ReadRange() or Write() depending on the type of the range.
+  /// There can be multiple threads per disk running this loop.
+  void WorkLoop(DiskQueue* queue);
+
+  /// This is called from the disk thread to get the next range to process. It will
+  /// wait until a scan range and buffer are available, or a write range is available.
+  /// This function returns the range to process.
+  /// Only returns false if the disk thread should be shut down.
+  /// No locks should be taken before this function call and none are left taken after.
+  bool GetNextRequestRange(DiskQueue* disk_queue, RequestRange** range,
+      RequestContext** request_context);
+
+  /// Updates disk queue and reader state after a read is complete. The read result
+  /// is captured in the buffer descriptor.
+  void HandleReadFinished(DiskQueue* disk_queue, RequestContext* reader,
+      std::unique_ptr<BufferDescriptor> buffer);
+
+  /// Invokes write_range->callback_  after the range has been written and
+  /// updates per-disk state and handle state. The status of the write OK/RUNTIME_ERROR
+  /// etc. is passed via write_status and to the callback.
+  /// The write_status does not affect the writer->status_. That is, an write error does
+  /// not cancel the writer context - that decision is left to the callback handler.
+  /// TODO: On the read path, consider not canceling the reader context on error.
+  void HandleWriteFinished(
+      RequestContext* writer, WriteRange* write_range, const Status& write_status);
+
+  /// Validates that range is correctly initialized
+  Status ValidateScanRange(ScanRange* range) WARN_UNUSED_RESULT;
+
+  /// Write the specified range to disk and calls HandleWriteFinished when done.
+  /// Responsible for opening and closing the file that is written.
+  void Write(RequestContext* writer_context, WriteRange* write_range);
+
+  /// Helper method to write a range using the specified FILE handle. Returns Status::OK()
+  /// if the write succeeded, or a RUNTIME_ERROR with an appropriate message otherwise.
+  /// Does not open or close the file that is written.
+  Status WriteRangeHelper(FILE* file_handle, WriteRange* write_range) WARN_UNUSED_RESULT;
+
+  /// Reads the specified scan range and calls HandleReadFinished when done.
+  void ReadRange(DiskQueue* disk_queue, RequestContext* reader, ScanRange* range);
+
+  /// Try to allocate the next buffer for the scan range, returning the new buffer
+  /// if successful. If 'reader' is cancelled, cancels the range and returns nullptr.
+  /// If there is memory pressure and buffers are already queued, adds the range
+  /// to the blocked ranges and returns nullptr.
+  std::unique_ptr<BufferDescriptor> TryAllocateNextBufferForRange(DiskQueue* disk_queue,
+      RequestContext* reader, ScanRange* range, int64_t buffer_size);
+};
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/handle-cache.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/handle-cache.h b/be/src/runtime/io/handle-cache.h
new file mode 100644
index 0000000..78f91cd
--- /dev/null
+++ b/be/src/runtime/io/handle-cache.h
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_DISK_IO_MGR_HANDLE_CACHE_H
+#define IMPALA_RUNTIME_DISK_IO_MGR_HANDLE_CACHE_H
+
+#include <array>
+#include <list>
+#include <map>
+#include <memory>
+
+#include <boost/thread/mutex.hpp>
+
+#include "common/hdfs.h"
+#include "common/status.h"
+#include "util/aligned-new.h"
+#include "util/impalad-metrics.h"
+#include "util/spinlock.h"
+#include "util/thread.h"
+
+namespace impala {
+namespace io {
+
+/// This class is a small wrapper around the hdfsFile handle and the file system
+/// instance which is needed to close the file handle. The handle incorporates
+/// the last modified time of the file when it was opened. This is used to distinguish
+/// between file handles for files that can be updated or overwritten.
+class HdfsFileHandle {
+ public:
+
+  /// Constructor will open the file
+  HdfsFileHandle(const hdfsFS& fs, const char* fname, int64_t mtime);
+
+  /// Destructor will close the file handle
+  ~HdfsFileHandle();
+
+  hdfsFile file() const { return hdfs_file_;  }
+  int64_t mtime() const { return mtime_; }
+  bool ok() const { return hdfs_file_ != nullptr; }
+
+ private:
+  hdfsFS fs_;
+  hdfsFile hdfs_file_;
+  int64_t mtime_;
+};
+
+/// The FileHandleCache is a data structure that owns HdfsFileHandles to share between
+/// threads. The HdfsFileHandles are hash partitioned across NUM_PARTITIONS partitions.
+/// Each partition operates independently with its own locks, reducing contention
+/// between concurrent threads. The `capacity` is split between the partitions and is
+/// enforced independently.
+///
+/// Threads check out a file handle for exclusive access and return it when finished.
+/// If the file handle is not already present in the cache or all file handles for this
+/// file are checked out, a new file handle is constructed and added to the cache.
+/// The cache can contain multiple file handles for the same file. If a file handle
+/// is checked out, it cannot be evicted from the cache. In this case, a cache can
+/// exceed the specified capacity.
+///
+/// The file handle cache is currently not suitable for remote files that maintain a
+/// connection as part of the handle. Most remote systems have a limit on the number
+/// of concurrent connections, and file handles in the cache would be counted towards
+/// that limit.
+///
+/// If there is a file handle in the cache and the underlying file is deleted,
+/// the file handle might keep the file from being deleted at the OS level. This can
+/// take up disk space and impact correctness. To avoid this, the cache will evict any
+/// file handle that has been unused for longer than the threshold specified by
+/// `unused_handle_timeout_secs`. Eviction is disabled when the threshold is 0.
+///
+/// TODO: The cache should also evict file handles more aggressively if the file handle's
+/// mtime is older than the file's current mtime.
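+///
+/// A sketch of the partitioning described above (the exact hash is an implementation
+/// detail of the .cc file): a handle for 'fname' lives in partition
+/// hash(fname) % NUM_PARTITIONS, so threads operating on different files usually
+/// contend on different partition locks.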
+template <size_t NUM_PARTITIONS>
+class FileHandleCache {
+ public:
+  /// Instantiates the cache with `capacity` split evenly across NUM_PARTITIONS
+  /// partitions. If the capacity does not split evenly, then each partition's capacity
+  /// is rounded up. The cache will age out any file handle that is unused for
+  /// `unused_handle_timeout_secs` seconds. Aging out is disabled if this is set to zero.
+  FileHandleCache(size_t capacity, uint64_t unused_handle_timeout_secs);
+
+  /// Destructor is only called for backend tests
+  ~FileHandleCache();
+
+  /// Starts up a thread that monitors the age of file handles and evicts any that
+  /// have been unused for longer than `unused_handle_timeout_secs`.
+  Status Init() WARN_UNUSED_RESULT;
+
+  /// Get a file handle from the cache for the specified filename (fname) and
+  /// last modification time (mtime). This will hash the filename to determine
+  /// which partition to use for this file handle.
+  ///
+  /// If 'require_new_handle' is false and the partition contains an available handle,
+  /// the handle is returned and 'cache_hit' is set to true. Otherwise, the partition will
+  /// try to construct a file handle and add it to the partition. On success, the new
+  /// file handle will be returned with 'cache_hit' set to false. On failure, nullptr will
+  /// be returned. In either case, the partition may evict a file handle to make room
+  /// for the new file handle.
+  ///
+  /// This obtains exclusive control over the returned file handle. It must be paired
+  /// with a call to ReleaseFileHandle to release exclusive control.
+  HdfsFileHandle* GetFileHandle(const hdfsFS& fs, std::string* fname, int64_t mtime,
+      bool require_new_handle, bool* cache_hit);
+
+  /// Release the exclusive hold on the specified file handle (which was obtained
+  /// by calling GetFileHandle). The cache may evict a file handle if the cache is
+  /// above capacity. If 'destroy_handle' is true, immediately remove this handle
+  /// from the cache.
+  void ReleaseFileHandle(std::string* fname, HdfsFileHandle* fh, bool destroy_handle);
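+
+  /// A minimal usage sketch (error handling elided; 'cache' is a FileHandleCache
+  /// instance, and 'fs', 'fname' and 'mtime' are assumed to be supplied by the
+  /// caller):
+  ///
+  ///   bool cache_hit;
+  ///   HdfsFileHandle* fh = cache.GetFileHandle(fs, &fname, mtime,
+  ///       /* require_new_handle */ false, &cache_hit);
+  ///   if (fh != nullptr) {
+  ///     // ... read via fh->file() ...
+  ///     cache.ReleaseFileHandle(&fname, fh, /* destroy_handle */ false);
+  ///   }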
+
+ private:
+  struct FileHandleEntry;
+  typedef std::multimap<std::string, FileHandleEntry> MapType;
+
+  struct LruListEntry {
+    LruListEntry(typename MapType::iterator map_entry_in);
+    typename MapType::iterator map_entry;
+    uint64_t timestamp_seconds;
+  };
+  typedef std::list<LruListEntry> LruListType;
+
+  struct FileHandleEntry {
+    FileHandleEntry(HdfsFileHandle* fh_in, LruListType& lru_list)
+    : fh(fh_in), lru_entry(lru_list.end()) {}
+    std::unique_ptr<HdfsFileHandle> fh;
+
+    /// in_use is true for a file handle checked out via GetFileHandle() that has not
+    /// been returned via ReleaseFileHandle().
+    bool in_use = false;
+
+    /// Iterator to this element's location in the LRU list. Since the LRU list only
+    /// contains file handles that are not in use, this only points to a valid location
+    /// when in_use is false. For error-checking, this is set to lru_list.end() when
+    /// in_use is true.
+    typename LruListType::iterator lru_entry;
+  };
+
+  /// Each partition operates independently, and thus has its own cache, LRU list,
+  /// and corresponding lock. To avoid contention on the lock due to false sharing,
+  /// the partitions are aligned to cache line boundaries.
+  struct FileHandleCachePartition : public CacheLineAligned {
+    /// Protects access to cache and lru_list.
+    SpinLock lock;
+
+    /// Multimap from the file name to the file handles for that file. The cache
+    /// can contain multiple file handles for the same file and some may have
+    /// different mtimes if the file is being modified. All file handles are always
+    /// owned by the cache.
+    MapType cache;
+
+    /// The LRU list only contains file handles that are not in use.
+    LruListType lru_list;
+
+    /// Maximum number of file handles in cache without evicting unused file handles.
+    /// It is not a strict limit, and can be exceeded if all file handles are in use.
+    size_t capacity;
+
+    /// Current number of file handles in the cache
+    size_t size;
+  };
+
+  /// Periodic check to evict unused file handles. Only executed by eviction_thread_.
+  void EvictHandlesLoop();
+  static const int64_t EVICT_HANDLES_PERIOD_MS = 1000;
+
+  /// If the partition is above its capacity, evict the oldest unused file handles to
+  /// enforce the capacity.
+  void EvictHandles(FileHandleCachePartition& p);
+
+  std::array<FileHandleCachePartition, NUM_PARTITIONS> cache_partitions_;
+
+  /// Maximum time before an unused file handle is aged out of the cache.
+  /// Aging out is disabled if this is set to 0.
+  uint64_t unused_handle_timeout_secs_;
+
+  /// Thread to check for unused file handles to evict. This thread will exit when
+  /// the shut_down_promise_ is set.
+  std::unique_ptr<Thread> eviction_thread_;
+  Promise<bool> shut_down_promise_;
+};
+} // namespace io
+} // namespace impala
+
+#endif



[06/16] incubator-impala git commit: IMPALA-4252: Min-max runtime filters for Kudu

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/util/min-max-filter-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/util/min-max-filter-ir.cc b/be/src/util/min-max-filter-ir.cc
new file mode 100644
index 0000000..130d11d
--- /dev/null
+++ b/be/src/util/min-max-filter-ir.cc
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "util/min-max-filter.h"
+
+#include "runtime/string-value.inline.h"
+
+using std::string;
+
+namespace impala {
+
+#define NUMERIC_MIN_MAX_FILTER_INSERT(NAME, TYPE) \
+  void NAME##MinMaxFilter::Insert(void* val) {    \
+    if (val == nullptr) return;                   \
+    TYPE* value = reinterpret_cast<TYPE*>(val);   \
+    if (*value < min_) min_ = *value;             \
+    if (*value > max_) max_ = *value;             \
+  }
+
+NUMERIC_MIN_MAX_FILTER_INSERT(Bool, bool);
+NUMERIC_MIN_MAX_FILTER_INSERT(TinyInt, int8_t);
+NUMERIC_MIN_MAX_FILTER_INSERT(SmallInt, int16_t);
+NUMERIC_MIN_MAX_FILTER_INSERT(Int, int32_t);
+NUMERIC_MIN_MAX_FILTER_INSERT(BigInt, int64_t);
+NUMERIC_MIN_MAX_FILTER_INSERT(Float, float);
+NUMERIC_MIN_MAX_FILTER_INSERT(Double, double);
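+
+// For reference, the macro above expands for Int to the equivalent of:
+//
+//   void IntMinMaxFilter::Insert(void* val) {
+//     if (val == nullptr) return;
+//     int32_t* value = reinterpret_cast<int32_t*>(val);
+//     if (*value < min_) min_ = *value;
+//     if (*value > max_) max_ = *value;
+//   }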
+
+void StringMinMaxFilter::Insert(void* val) {
+  if (val == nullptr || always_true_) return;
+  const StringValue* value = reinterpret_cast<const StringValue*>(val);
+  if (always_false_) {
+    min_ = *value;
+    max_ = *value;
+    always_false_ = false;
+  } else {
+    if (*value < min_) {
+      min_ = *value;
+      min_buffer_.Clear();
+    } else if (*value > max_) {
+      max_ = *value;
+      max_buffer_.Clear();
+    }
+  }
+}
+
+void TimestampMinMaxFilter::Insert(void* val) {
+  if (val == nullptr) return;
+  const TimestampValue* value = reinterpret_cast<const TimestampValue*>(val);
+  if (always_false_) {
+    min_ = *value;
+    max_ = *value;
+    always_false_ = false;
+  } else {
+    if (*value < min_) {
+      min_ = *value;
+    } else if (*value > max_) {
+      max_ = *value;
+    }
+  }
+}
+
+} // namespace impala

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/util/min-max-filter-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/min-max-filter-test.cc b/be/src/util/min-max-filter-test.cc
new file mode 100644
index 0000000..23712e6
--- /dev/null
+++ b/be/src/util/min-max-filter-test.cc
@@ -0,0 +1,364 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "testutil/gtest-util.h"
+#include "util/min-max-filter.h"
+
+#include "runtime/string-value.inline.h"
+#include "runtime/test-env.h"
+#include "service/fe-support.h"
+#include "util/test-info.h"
+
+DECLARE_bool(enable_webserver);
+
+using namespace impala;
+
+// Tests that a BoolMinMaxFilter returns the expected min/max after having values
+// inserted into it, and that MinMaxFilter::Or works for bools.
+TEST(MinMaxFilterTest, TestBoolMinMaxFilter) {
+  MemTracker mem_tracker;
+  MemPool mem_pool(&mem_tracker);
+  ObjectPool obj_pool;
+
+  MinMaxFilter* filter =
+      MinMaxFilter::Create(ColumnType(PrimitiveType::TYPE_BOOLEAN), &obj_pool, &mem_pool);
+  EXPECT_TRUE(filter->AlwaysFalse());
+  bool b1 = true;
+  filter->Insert(&b1);
+  EXPECT_EQ(*reinterpret_cast<bool*>(filter->GetMin()), b1);
+  EXPECT_EQ(*reinterpret_cast<bool*>(filter->GetMax()), b1);
+  EXPECT_FALSE(filter->AlwaysFalse());
+
+  bool b2 = false;
+  filter->Insert(&b2);
+  EXPECT_EQ(*reinterpret_cast<bool*>(filter->GetMin()), b2);
+  EXPECT_EQ(*reinterpret_cast<bool*>(filter->GetMax()), b1);
+
+  // Check the behavior of Or.
+  TMinMaxFilter tFilter1;
+  tFilter1.min.__set_bool_val(false);
+  tFilter1.max.__set_bool_val(true);
+  TMinMaxFilter tFilter2;
+  tFilter2.min.__set_bool_val(false);
+  tFilter2.max.__set_bool_val(false);
+  MinMaxFilter::Or(tFilter1, &tFilter2);
+  EXPECT_FALSE(tFilter2.min.bool_val);
+  EXPECT_TRUE(tFilter2.max.bool_val);
+}
+
+void CheckIntVals(MinMaxFilter* filter, int32_t min, int32_t max) {
+  EXPECT_EQ(*reinterpret_cast<int32_t*>(filter->GetMin()), min);
+  EXPECT_EQ(*reinterpret_cast<int32_t*>(filter->GetMax()), max);
+  EXPECT_FALSE(filter->AlwaysFalse());
+  EXPECT_FALSE(filter->AlwaysTrue());
+}
+
+// Tests that an IntMinMaxFilter returns the expected min/max after having values
+// inserted into it, and that MinMaxFilter::Or works for ints.
+// This also provides coverage for the other numeric MinMaxFilter types, as they're
+// generated with macros and the logic is identical.
+TEST(MinMaxFilterTest, TestNumericMinMaxFilter) {
+  MemTracker mem_tracker;
+  MemPool mem_pool(&mem_tracker);
+  ObjectPool obj_pool;
+
+  ColumnType int_type(PrimitiveType::TYPE_INT);
+  MinMaxFilter* int_filter = MinMaxFilter::Create(int_type, &obj_pool, &mem_pool);
+
+  // Test the behavior of an empty filter.
+  EXPECT_TRUE(int_filter->AlwaysFalse());
+  EXPECT_FALSE(int_filter->AlwaysTrue());
+  TMinMaxFilter tFilter;
+  int_filter->ToThrift(&tFilter);
+  EXPECT_TRUE(tFilter.always_false);
+  EXPECT_FALSE(tFilter.always_true);
+  EXPECT_FALSE(tFilter.min.__isset.int_val);
+  EXPECT_FALSE(tFilter.max.__isset.int_val);
+  MinMaxFilter* empty_filter =
+      MinMaxFilter::Create(tFilter, int_type, &obj_pool, &mem_pool);
+  EXPECT_TRUE(empty_filter->AlwaysFalse());
+  EXPECT_FALSE(empty_filter->AlwaysTrue());
+
+  // Now insert some stuff.
+  int32_t i1 = 10;
+  int_filter->Insert(&i1);
+  CheckIntVals(int_filter, i1, i1);
+  int32_t i2 = 15;
+  int_filter->Insert(&i2);
+  CheckIntVals(int_filter, i1, i2);
+  int32_t i3 = 12;
+  int_filter->Insert(&i3);
+  CheckIntVals(int_filter, i1, i2);
+  int32_t i4 = 8;
+  int_filter->Insert(&i4);
+  CheckIntVals(int_filter, i4, i2);
+
+  int_filter->ToThrift(&tFilter);
+  EXPECT_FALSE(tFilter.always_false);
+  EXPECT_FALSE(tFilter.always_true);
+  EXPECT_EQ(tFilter.min.int_val, i4);
+  EXPECT_EQ(tFilter.max.int_val, i2);
+  MinMaxFilter* int_filter2 =
+      MinMaxFilter::Create(tFilter, int_type, &obj_pool, &mem_pool);
+  CheckIntVals(int_filter2, i4, i2);
+
+  // Check the behavior of Or.
+  TMinMaxFilter tFilter1;
+  tFilter1.min.__set_int_val(4);
+  tFilter1.max.__set_int_val(8);
+  TMinMaxFilter tFilter2;
+  tFilter2.min.__set_int_val(2);
+  tFilter2.max.__set_int_val(7);
+  MinMaxFilter::Or(tFilter1, &tFilter2);
+  EXPECT_EQ(tFilter2.min.int_val, 2);
+  EXPECT_EQ(tFilter2.max.int_val, 8);
+}
+
+void CheckStringVals(MinMaxFilter* filter, const string& min, const string& max) {
+  StringValue actual_min = *reinterpret_cast<StringValue*>(filter->GetMin());
+  StringValue actual_max = *reinterpret_cast<StringValue*>(filter->GetMax());
+  StringValue expected_min(min);
+  StringValue expected_max(max);
+  EXPECT_EQ(actual_min, expected_min);
+  EXPECT_EQ(actual_max, expected_max);
+  EXPECT_FALSE(filter->AlwaysTrue());
+  EXPECT_FALSE(filter->AlwaysFalse());
+}
+
+// Tests that a StringMinMaxFilter returns the expected min/max after having values
+// inserted into it, and that MinMaxFilter::Or works for strings.
+// Also tests truncation behavior when inserted strings are larger than MAX_BOUND_LENGTH
+// and that the filter is disabled if there's not enough memory to store the min/max.
+TEST(MinMaxFilterTest, TestStringMinMaxFilter) {
+  ObjectPool obj_pool;
+  MemTracker mem_tracker;
+  MemPool mem_pool(&mem_tracker);
+
+  ColumnType string_type(PrimitiveType::TYPE_STRING);
+  MinMaxFilter* filter = MinMaxFilter::Create(string_type, &obj_pool, &mem_pool);
+
+  // Test the behavior of an empty filter.
+  EXPECT_TRUE(filter->AlwaysFalse());
+  EXPECT_FALSE(filter->AlwaysTrue());
+  filter->MaterializeValues();
+  EXPECT_TRUE(filter->AlwaysFalse());
+  EXPECT_FALSE(filter->AlwaysTrue());
+  TMinMaxFilter tFilter;
+  filter->ToThrift(&tFilter);
+  EXPECT_TRUE(tFilter.always_false);
+  EXPECT_FALSE(tFilter.always_true);
+
+  MinMaxFilter* empty_filter =
+      MinMaxFilter::Create(tFilter, string_type, &obj_pool, &mem_pool);
+  EXPECT_TRUE(empty_filter->AlwaysFalse());
+  EXPECT_FALSE(empty_filter->AlwaysTrue());
+
+  // Now insert some stuff.
+  string c = "c";
+  StringValue cVal(c);
+  filter->Insert(&cVal);
+  filter->MaterializeValues();
+  CheckStringVals(filter, c, c);
+
+  string d = "d";
+  StringValue dVal(d);
+  filter->Insert(&dVal);
+  filter->MaterializeValues();
+  CheckStringVals(filter, c, d);
+
+  string cc = "cc";
+  StringValue ccVal(cc);
+  filter->Insert(&ccVal);
+  filter->MaterializeValues();
+  CheckStringVals(filter, c, d);
+
+  filter->ToThrift(&tFilter);
+  EXPECT_FALSE(tFilter.always_false);
+  EXPECT_FALSE(tFilter.always_true);
+  EXPECT_EQ(tFilter.min.string_val, c);
+  EXPECT_EQ(tFilter.max.string_val, d);
+
+  // Test that strings longer than 1024 are truncated.
+  string b1030(1030, 'b');
+  StringValue b1030Val(b1030);
+  filter->Insert(&b1030Val);
+  filter->MaterializeValues();
+  string b1024(1024, 'b');
+  CheckStringVals(filter, b1024, d);
+
+  string e1030(1030, 'e');
+  StringValue e1030Val(e1030);
+  filter->Insert(&e1030Val);
+  filter->MaterializeValues();
+  string e1024(1024, 'e');
+  // For the max, after truncating, the final char is increased by one.
+  e1024[1023] = 'f';
+  CheckStringVals(filter, b1024, e1024);
+
+  string trailMaxChar(1030, 'f');
+  int trailIndex = 1020;
+  for (int i = trailIndex; i < 1030; ++i) trailMaxChar[i] = -1;
+  StringValue trailMaxCharVal(trailMaxChar);
+  filter->Insert(&trailMaxCharVal);
+  filter->MaterializeValues();
+  // Check that when adding one for max, if the final char is the max char it overflows
+  // and carries.
+  string truncTrailMaxChar(1024, 'f');
+  truncTrailMaxChar[trailIndex - 1] = 'g';
+  for (int i = trailIndex; i < 1024; ++i) truncTrailMaxChar[i] = 0;
+  CheckStringVals(filter, b1024, truncTrailMaxChar);
+
+  filter->ToThrift(&tFilter);
+  EXPECT_FALSE(tFilter.always_false);
+  EXPECT_FALSE(tFilter.always_true);
+  EXPECT_EQ(tFilter.min.string_val, b1024);
+  EXPECT_EQ(tFilter.max.string_val, truncTrailMaxChar);
+
+  MinMaxFilter* filter2 =
+      MinMaxFilter::Create(tFilter, string_type, &obj_pool, &mem_pool);
+  CheckStringVals(filter2, b1024, truncTrailMaxChar);
+
+  // Check that the filter is disabled if the entire string is the max char, since
+  // then we can't add one to the truncated max to get a valid bound.
+  string allMaxChar(1030, -1);
+  StringValue allMaxCharVal(allMaxChar);
+  filter->Insert(&allMaxCharVal);
+  filter->MaterializeValues();
+  EXPECT_TRUE(filter->AlwaysTrue());
+
+  // We should still be able to insert into a disabled filter.
+  filter->Insert(&cVal);
+  EXPECT_TRUE(filter->AlwaysTrue());
+
+  filter->ToThrift(&tFilter);
+  EXPECT_FALSE(tFilter.always_false);
+  EXPECT_TRUE(tFilter.always_true);
+
+  MinMaxFilter* always_true_filter =
+      MinMaxFilter::Create(tFilter, string_type, &obj_pool, &mem_pool);
+  EXPECT_FALSE(always_true_filter->AlwaysFalse());
+  EXPECT_TRUE(always_true_filter->AlwaysTrue());
+
+  mem_pool.FreeAll();
+
+  // Check that a filter that hits the mem limit is disabled.
+  MemTracker limit_mem_tracker(1);
+  MemPool limit_mem_pool(&limit_mem_tracker);
+  // We do not want to start the webserver.
+  FLAGS_enable_webserver = false;
+  std::unique_ptr<TestEnv> env;
+  env.reset(new TestEnv());
+  ASSERT_OK(env->Init());
+
+  MinMaxFilter* limit_filter =
+      MinMaxFilter::Create(string_type, &obj_pool, &limit_mem_pool);
+  EXPECT_FALSE(limit_filter->AlwaysTrue());
+  limit_filter->Insert(&cVal);
+  limit_filter->MaterializeValues();
+  EXPECT_TRUE(limit_filter->AlwaysTrue());
+  limit_filter->Insert(&dVal);
+  limit_filter->MaterializeValues();
+  EXPECT_TRUE(limit_filter->AlwaysTrue());
+
+  limit_filter->ToThrift(&tFilter);
+  EXPECT_FALSE(tFilter.always_false);
+  EXPECT_TRUE(tFilter.always_true);
+
+  // Check the behavior of Or.
+  TMinMaxFilter tFilter1;
+  tFilter1.min.__set_string_val("a");
+  tFilter1.max.__set_string_val("d");
+  TMinMaxFilter tFilter2;
+  tFilter2.min.__set_string_val("b");
+  tFilter2.max.__set_string_val("e");
+  MinMaxFilter::Or(tFilter1, &tFilter2);
+  EXPECT_EQ(tFilter2.min.string_val, "a");
+  EXPECT_EQ(tFilter2.max.string_val, "e");
+}
+
+void CheckTimestampVals(
+    MinMaxFilter* filter, const TimestampValue& min, const TimestampValue& max) {
+  EXPECT_EQ(*reinterpret_cast<TimestampValue*>(filter->GetMin()), min);
+  EXPECT_EQ(*reinterpret_cast<TimestampValue*>(filter->GetMax()), max);
+  EXPECT_FALSE(filter->AlwaysFalse());
+  EXPECT_FALSE(filter->AlwaysTrue());
+}
+
+// Tests that a TimestampMinMaxFilter returns the expected min/max after having values
+// inserted into it, and that MinMaxFilter::Or works for timestamps.
+TEST(MinMaxFilterTest, TestTimestampMinMaxFilter) {
+  ObjectPool obj_pool;
+  MemTracker mem_tracker;
+  MemPool mem_pool(&mem_tracker);
+  ColumnType timestamp_type(PrimitiveType::TYPE_TIMESTAMP);
+  MinMaxFilter* filter = MinMaxFilter::Create(timestamp_type, &obj_pool, &mem_pool);
+
+  // Test the behavior of an empty filter.
+  EXPECT_TRUE(filter->AlwaysFalse());
+  EXPECT_FALSE(filter->AlwaysTrue());
+  TMinMaxFilter tFilter;
+  filter->ToThrift(&tFilter);
+  EXPECT_TRUE(tFilter.always_false);
+  EXPECT_FALSE(tFilter.always_true);
+  EXPECT_FALSE(tFilter.min.__isset.timestamp_val);
+  EXPECT_FALSE(tFilter.max.__isset.timestamp_val);
+  MinMaxFilter* empty_filter =
+      MinMaxFilter::Create(tFilter, timestamp_type, &obj_pool, &mem_pool);
+  EXPECT_TRUE(empty_filter->AlwaysFalse());
+  EXPECT_FALSE(empty_filter->AlwaysTrue());
+
+  // Now insert some stuff.
+  TimestampValue t1 = TimestampValue::Parse("2000-01-01 00:00:00");
+  filter->Insert(&t1);
+  CheckTimestampVals(filter, t1, t1);
+  TimestampValue t2 = TimestampValue::Parse("1990-01-01 12:30:00");
+  filter->Insert(&t2);
+  CheckTimestampVals(filter, t2, t1);
+  TimestampValue t3 = TimestampValue::Parse("2001-04-30 05:00:00");
+  filter->Insert(&t3);
+  CheckTimestampVals(filter, t2, t3);
+  TimestampValue t4 = TimestampValue::Parse("2001-04-30 01:00:00");
+  filter->Insert(&t4);
+  CheckTimestampVals(filter, t2, t3);
+
+  filter->ToThrift(&tFilter);
+  EXPECT_FALSE(tFilter.always_false);
+  EXPECT_FALSE(tFilter.always_true);
+  EXPECT_EQ(TimestampValue::FromTColumnValue(tFilter.min), t2);
+  EXPECT_EQ(TimestampValue::FromTColumnValue(tFilter.max), t3);
+  MinMaxFilter* filter2 =
+      MinMaxFilter::Create(tFilter, timestamp_type, &obj_pool, &mem_pool);
+  CheckTimestampVals(filter2, t2, t3);
+
+  // Check the behavior of Or.
+  TMinMaxFilter tFilter1;
+  t2.ToTColumnValue(&tFilter1.min);
+  t4.ToTColumnValue(&tFilter1.max);
+  TMinMaxFilter tFilter2;
+  t1.ToTColumnValue(&tFilter2.min);
+  t3.ToTColumnValue(&tFilter2.max);
+  MinMaxFilter::Or(tFilter1, &tFilter2);
+  EXPECT_EQ(TimestampValue::FromTColumnValue(tFilter2.min), t2);
+  EXPECT_EQ(TimestampValue::FromTColumnValue(tFilter2.max), t3);
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  InitCommonRuntime(argc, argv, true, TestInfo::BE_TEST);
+  InitFeSupport();
+  return RUN_ALL_TESTS();
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/util/min-max-filter.cc
----------------------------------------------------------------------
diff --git a/be/src/util/min-max-filter.cc b/be/src/util/min-max-filter.cc
new file mode 100644
index 0000000..f50f896
--- /dev/null
+++ b/be/src/util/min-max-filter.cc
@@ -0,0 +1,529 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "util/min-max-filter.h"
+
+#include <sstream>
+#include <unordered_map>
+
+#include "common/object-pool.h"
+#include "runtime/raw-value.h"
+#include "runtime/string-value.inline.h"
+#include "runtime/timestamp-value.inline.h"
+
+using std::numeric_limits;
+using std::stringstream;
+
+namespace impala {
+
+static std::unordered_map<int, string> MIN_MAX_FILTER_LLVM_CLASS_NAMES = {
+    {PrimitiveType::TYPE_BOOLEAN, BoolMinMaxFilter::LLVM_CLASS_NAME},
+    {PrimitiveType::TYPE_TINYINT, TinyIntMinMaxFilter::LLVM_CLASS_NAME},
+    {PrimitiveType::TYPE_SMALLINT, SmallIntMinMaxFilter::LLVM_CLASS_NAME},
+    {PrimitiveType::TYPE_INT, IntMinMaxFilter::LLVM_CLASS_NAME},
+    {PrimitiveType::TYPE_BIGINT, BigIntMinMaxFilter::LLVM_CLASS_NAME},
+    {PrimitiveType::TYPE_FLOAT, FloatMinMaxFilter::LLVM_CLASS_NAME},
+    {PrimitiveType::TYPE_DOUBLE, DoubleMinMaxFilter::LLVM_CLASS_NAME},
+    {PrimitiveType::TYPE_STRING, StringMinMaxFilter::LLVM_CLASS_NAME},
+    {PrimitiveType::TYPE_TIMESTAMP, TimestampMinMaxFilter::LLVM_CLASS_NAME}};
+
+static std::unordered_map<int, IRFunction::Type> MIN_MAX_FILTER_IR_FUNCTION_TYPES = {
+    {PrimitiveType::TYPE_BOOLEAN, IRFunction::BOOL_MIN_MAX_FILTER_INSERT},
+    {PrimitiveType::TYPE_TINYINT, IRFunction::TINYINT_MIN_MAX_FILTER_INSERT},
+    {PrimitiveType::TYPE_SMALLINT, IRFunction::SMALLINT_MIN_MAX_FILTER_INSERT},
+    {PrimitiveType::TYPE_INT, IRFunction::INT_MIN_MAX_FILTER_INSERT},
+    {PrimitiveType::TYPE_BIGINT, IRFunction::BIGINT_MIN_MAX_FILTER_INSERT},
+    {PrimitiveType::TYPE_FLOAT, IRFunction::FLOAT_MIN_MAX_FILTER_INSERT},
+    {PrimitiveType::TYPE_DOUBLE, IRFunction::DOUBLE_MIN_MAX_FILTER_INSERT},
+    {PrimitiveType::TYPE_STRING, IRFunction::STRING_MIN_MAX_FILTER_INSERT},
+    {PrimitiveType::TYPE_TIMESTAMP, IRFunction::TIMESTAMP_MIN_MAX_FILTER_INSERT}};
+
+string MinMaxFilter::GetLlvmClassName(PrimitiveType type) {
+  return MIN_MAX_FILTER_LLVM_CLASS_NAMES[type];
+}
+
+IRFunction::Type MinMaxFilter::GetInsertIRFunctionType(PrimitiveType type) {
+  return MIN_MAX_FILTER_IR_FUNCTION_TYPES[type];
+}
+
+#define NUMERIC_MIN_MAX_FILTER_FUNCS(NAME, TYPE, THRIFT_TYPE, PRIMITIVE_TYPE)  \
+  const char* NAME##MinMaxFilter::LLVM_CLASS_NAME =                            \
+      "class.impala::" #NAME "MinMaxFilter";                                   \
+  NAME##MinMaxFilter::NAME##MinMaxFilter(const TMinMaxFilter& thrift) {        \
+    DCHECK(!thrift.always_true);                                               \
+    if (thrift.always_false) {                                                 \
+      min_ = numeric_limits<TYPE>::max();                                      \
+      max_ = numeric_limits<TYPE>::lowest();                                   \
+    } else {                                                                   \
+      DCHECK(thrift.__isset.min);                                              \
+      DCHECK(thrift.__isset.max);                                              \
+      DCHECK(thrift.min.__isset.THRIFT_TYPE##_val);                            \
+      DCHECK(thrift.max.__isset.THRIFT_TYPE##_val);                            \
+      min_ = thrift.min.THRIFT_TYPE##_val;                                     \
+      max_ = thrift.max.THRIFT_TYPE##_val;                                     \
+    }                                                                          \
+  }                                                                            \
+  PrimitiveType NAME##MinMaxFilter::type() {                                   \
+    return PrimitiveType::TYPE_##PRIMITIVE_TYPE;                               \
+  }                                                                            \
+  void NAME##MinMaxFilter::ToThrift(TMinMaxFilter* thrift) const {             \
+    if (!AlwaysFalse()) {                                                      \
+      thrift->min.__set_##THRIFT_TYPE##_val(min_);                             \
+      thrift->__isset.min = true;                                              \
+      thrift->max.__set_##THRIFT_TYPE##_val(max_);                             \
+      thrift->__isset.max = true;                                              \
+    }                                                                          \
+    thrift->__set_always_false(AlwaysFalse());                                 \
+    thrift->__set_always_true(false);                                          \
+  }                                                                            \
+  string NAME##MinMaxFilter::DebugString() const {                             \
+    stringstream out;                                                          \
+    out << #NAME << "MinMaxFilter(min=" << min_ << ", max=" << max_            \
+        << ", always_false=" << (AlwaysFalse() ? "true" : "false") << ")";     \
+    return out.str();                                                          \
+  }                                                                            \
+  void NAME##MinMaxFilter::Or(const TMinMaxFilter& in, TMinMaxFilter* out) {   \
+    if (out->always_false) {                                                   \
+      out->min.__set_##THRIFT_TYPE##_val(in.min.THRIFT_TYPE##_val);            \
+      out->__isset.min = true;                                                 \
+      out->max.__set_##THRIFT_TYPE##_val(in.max.THRIFT_TYPE##_val);            \
+      out->__isset.max = true;                                                 \
+      out->__set_always_false(false);                                          \
+    } else {                                                                   \
+      out->min.__set_##THRIFT_TYPE##_val(                                      \
+          std::min(in.min.THRIFT_TYPE##_val, out->min.THRIFT_TYPE##_val));     \
+      out->max.__set_##THRIFT_TYPE##_val(                                      \
+          std::max(in.max.THRIFT_TYPE##_val, out->max.THRIFT_TYPE##_val));     \
+    }                                                                          \
+  }                                                                            \
+  void NAME##MinMaxFilter::Copy(const TMinMaxFilter& in, TMinMaxFilter* out) { \
+    out->min.__set_##THRIFT_TYPE##_val(in.min.THRIFT_TYPE##_val);              \
+    out->__isset.min = true;                                                   \
+    out->max.__set_##THRIFT_TYPE##_val(in.max.THRIFT_TYPE##_val);              \
+    out->__isset.max = true;                                                   \
+  }
+
+NUMERIC_MIN_MAX_FILTER_FUNCS(Bool, bool, bool, BOOLEAN);
+NUMERIC_MIN_MAX_FILTER_FUNCS(TinyInt, int8_t, byte, TINYINT);
+NUMERIC_MIN_MAX_FILTER_FUNCS(SmallInt, int16_t, short, SMALLINT);
+NUMERIC_MIN_MAX_FILTER_FUNCS(Int, int32_t, int, INT);
+NUMERIC_MIN_MAX_FILTER_FUNCS(BigInt, int64_t, long, BIGINT);
+NUMERIC_MIN_MAX_FILTER_FUNCS(Float, float, double, FLOAT);
+NUMERIC_MIN_MAX_FILTER_FUNCS(Double, double, double, DOUBLE);
+
+int64_t GetIntTypeMax(const ColumnType& type) {
+  switch (type.type) {
+    case TYPE_TINYINT:
+      return numeric_limits<int8_t>::max();
+    case TYPE_SMALLINT:
+      return numeric_limits<int16_t>::max();
+    case TYPE_INT:
+      return numeric_limits<int32_t>::max();
+    case TYPE_BIGINT:
+      return numeric_limits<int64_t>::max();
+    default:
+      DCHECK(false) << "Not an int type: " << type;
+  }
+  return -1;
+}
+
+int64_t GetIntTypeMin(const ColumnType& type) {
+  switch (type.type) {
+    case TYPE_TINYINT:
+      return numeric_limits<int8_t>::lowest();
+    case TYPE_SMALLINT:
+      return numeric_limits<int16_t>::lowest();
+    case TYPE_INT:
+      return numeric_limits<int32_t>::lowest();
+    case TYPE_BIGINT:
+      return numeric_limits<int64_t>::lowest();
+    default:
+      DCHECK(false) << "Not an int type: " << type;
+  }
+  return -1;
+}
+
+#define NUMERIC_MIN_MAX_FILTER_CAST(NAME)                           \
+  bool NAME##MinMaxFilter::GetCastIntMinMax(                        \
+      const ColumnType& type, int64_t* out_min, int64_t* out_max) { \
+    int64_t type_min = GetIntTypeMin(type);                         \
+    int64_t type_max = GetIntTypeMax(type);                         \
+    if (min_ < type_min) {                                          \
+      *out_min = type_min;                                          \
+    } else if (min_ > type_max) {                                   \
+      return false;                                                 \
+    } else {                                                        \
+      *out_min = min_;                                              \
+    }                                                               \
+    if (max_ > type_max) {                                          \
+      *out_max = type_max;                                          \
+    } else if (max_ < type_min) {                                   \
+      return false;                                                 \
+    } else {                                                        \
+      *out_max = max_;                                              \
+    }                                                               \
+    return true;                                                    \
+  }
+
+NUMERIC_MIN_MAX_FILTER_CAST(TinyInt);
+NUMERIC_MIN_MAX_FILTER_CAST(SmallInt);
+NUMERIC_MIN_MAX_FILTER_CAST(Int);
+NUMERIC_MIN_MAX_FILTER_CAST(BigInt);
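+
+// For example, a BigInt filter with min_ = -10 and max_ = 300 cast to TYPE_TINYINT
+// (range [-128, 127]) yields the clamped range [-10, 127]; if the filter's entire
+// range fell outside [-128, 127], GetCastIntMinMax() would return false instead.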
+
+#define NUMERIC_MIN_MAX_FILTER_NO_CAST(NAME)                                           \
+  bool NAME##MinMaxFilter::GetCastIntMinMax(                                           \
+      const ColumnType& type, int64_t* out_min, int64_t* out_max) {                    \
+    DCHECK(false) << "Casting min-max filters of type " << #NAME << " not supported."; \
+    return true;                                                                       \
+  }
+
+NUMERIC_MIN_MAX_FILTER_NO_CAST(Bool);
+NUMERIC_MIN_MAX_FILTER_NO_CAST(Float);
+NUMERIC_MIN_MAX_FILTER_NO_CAST(Double);
+
+// STRING
+const char* StringMinMaxFilter::LLVM_CLASS_NAME = "class.impala::StringMinMaxFilter";
+const int StringMinMaxFilter::MAX_BOUND_LENGTH = 1024;
+
+StringMinMaxFilter::StringMinMaxFilter(const TMinMaxFilter& thrift, MemPool* mem_pool)
+  : min_buffer_(mem_pool), max_buffer_(mem_pool) {
+  always_false_ = thrift.always_false;
+  always_true_ = thrift.always_true;
+  if (!always_true_ && !always_false_) {
+    DCHECK(thrift.__isset.min);
+    DCHECK(thrift.__isset.max);
+    DCHECK(thrift.min.__isset.string_val);
+    DCHECK(thrift.max.__isset.string_val);
+    min_ = StringValue(thrift.min.string_val);
+    max_ = StringValue(thrift.max.string_val);
+    CopyToBuffer(&min_buffer_, &min_, min_.len);
+    CopyToBuffer(&max_buffer_, &max_, max_.len);
+  }
+}
+
+PrimitiveType StringMinMaxFilter::type() {
+  return PrimitiveType::TYPE_STRING;
+}
+
+void StringMinMaxFilter::MaterializeValues() {
+  if (always_true_ || always_false_) return;
+  if (min_buffer_.IsEmpty()) {
+    if (min_.len > MAX_BOUND_LENGTH) {
+      // Truncating 'min_' gives a valid min bound, as the truncated result is <= 'min_'.
+      CopyToBuffer(&min_buffer_, &min_, MAX_BOUND_LENGTH);
+    } else {
+      CopyToBuffer(&min_buffer_, &min_, min_.len);
+    }
+  }
+  if (max_buffer_.IsEmpty()) {
+    if (max_.len > MAX_BOUND_LENGTH) {
+      CopyToBuffer(&max_buffer_, &max_, MAX_BOUND_LENGTH);
+      if (always_true_) return;
+      // After truncating 'max_', to still have a valid max bound we add 1 to one char in
+      // the string, so that the result will be > 'max_'. If the entire string is already
+      // the max char, then disable this filter by making it always_true.
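+      // For example, if MAX_BOUND_LENGTH were 4 (it is actually 1024), inserting
+      // "abc\xFF\xFF" would truncate the max to "abc\xFF", and the carry below would
+      // turn it into "abd\x00".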
+      int i = MAX_BOUND_LENGTH - 1;
+      while (i >= 0 && static_cast<int32_t>(max_buffer_.buffer()[i]) == -1) {
+        max_buffer_.buffer()[i] = max_buffer_.buffer()[i] + 1;
+        --i;
+      }
+      if (i == -1) {
+        SetAlwaysTrue();
+        return;
+      }
+      max_buffer_.buffer()[i] = max_buffer_.buffer()[i] + 1;
+    } else {
+      CopyToBuffer(&max_buffer_, &max_, max_.len);
+    }
+  }
+}
+
+void StringMinMaxFilter::ToThrift(TMinMaxFilter* thrift) const {
+  if (!always_true_ && !always_false_) {
+    thrift->min.string_val.assign(static_cast<char*>(min_.ptr), min_.len);
+    thrift->min.__isset.string_val = true;
+    thrift->__isset.min = true;
+    thrift->max.string_val.assign(static_cast<char*>(max_.ptr), max_.len);
+    thrift->max.__isset.string_val = true;
+    thrift->__isset.max = true;
+  }
+  thrift->__set_always_false(always_false_);
+  thrift->__set_always_true(always_true_);
+}
+
+string StringMinMaxFilter::DebugString() const {
+  stringstream out;
+  out << "StringMinMaxFilter(min=" << min_ << ", max=" << max_
+      << ", always_false=" << (always_false_ ? "true" : "false")
+      << ", always_true=" << (always_true_ ? "true" : "false") << ")";
+  return out.str();
+}
+
+void StringMinMaxFilter::Or(const TMinMaxFilter& in, TMinMaxFilter* out) {
+  if (out->always_false) {
+    out->min.__set_string_val(in.min.string_val);
+    out->__isset.min = true;
+    out->max.__set_string_val(in.max.string_val);
+    out->__isset.max = true;
+    out->__set_always_false(false);
+  } else {
+    StringValue in_min_val = StringValue(in.min.string_val);
+    StringValue out_min_val = StringValue(out->min.string_val);
+    if (in_min_val < out_min_val) out->min.__set_string_val(in.min.string_val);
+    StringValue in_max_val = StringValue(in.max.string_val);
+    StringValue out_max_val = StringValue(out->max.string_val);
+    if (in_max_val > out_max_val) out->max.__set_string_val(in.max.string_val);
+  }
+}
+
+void StringMinMaxFilter::Copy(const TMinMaxFilter& in, TMinMaxFilter* out) {
+  out->min.__set_string_val(in.min.string_val);
+  out->__isset.min = true;
+  out->max.__set_string_val(in.max.string_val);
+  out->__isset.max = true;
+}
+
+void StringMinMaxFilter::CopyToBuffer(
+    StringBuffer* buffer, StringValue* value, int64_t len) {
+  if (value->ptr == buffer->buffer()) return;
+  buffer->Clear();
+  if (!buffer->Append(value->ptr, len).ok()) {
+    // If Append() fails, for example because we're out of memory, disable the filter.
+    SetAlwaysTrue();
+    return;
+  }
+  value->ptr = buffer->buffer();
+  value->len = len;
+}
+
+void StringMinMaxFilter::SetAlwaysTrue() {
+  always_true_ = true;
+  max_buffer_.Clear();
+  min_buffer_.Clear();
+  min_.ptr = nullptr;
+  min_.len = 0;
+  max_.ptr = nullptr;
+  max_.len = 0;
+}
+
+// TIMESTAMP
+const char* TimestampMinMaxFilter::LLVM_CLASS_NAME =
+    "class.impala::TimestampMinMaxFilter";
+
+TimestampMinMaxFilter::TimestampMinMaxFilter(const TMinMaxFilter& thrift) {
+  always_false_ = thrift.always_false;
+  if (!always_false_) {
+    DCHECK(thrift.min.__isset.timestamp_val);
+    DCHECK(thrift.max.__isset.timestamp_val);
+    min_ = TimestampValue::FromTColumnValue(thrift.min);
+    max_ = TimestampValue::FromTColumnValue(thrift.max);
+  }
+}
+
+PrimitiveType TimestampMinMaxFilter::type() {
+  return PrimitiveType::TYPE_TIMESTAMP;
+}
+
+void TimestampMinMaxFilter::ToThrift(TMinMaxFilter* thrift) const {
+  if (!always_false_) {
+    min_.ToTColumnValue(&thrift->min);
+    thrift->__isset.min = true;
+    max_.ToTColumnValue(&thrift->max);
+    thrift->__isset.max = true;
+  }
+  thrift->__set_always_false(always_false_);
+  thrift->__set_always_true(false);
+}
+
+string TimestampMinMaxFilter::DebugString() const {
+  stringstream out;
+  out << "TimestampMinMaxFilter(min=" << min_ << ", max=" << max_
+      << " always_false=" << (always_false_ ? "true" : "false") << ")";
+  return out.str();
+}
+
+void TimestampMinMaxFilter::Or(const TMinMaxFilter& in, TMinMaxFilter* out) {
+  if (out->always_false) {
+    out->min.__set_timestamp_val(in.min.timestamp_val);
+    out->__isset.min = true;
+    out->max.__set_timestamp_val(in.max.timestamp_val);
+    out->__isset.max = true;
+    out->__set_always_false(false);
+  } else {
+    TimestampValue in_min_val = TimestampValue::FromTColumnValue(in.min);
+    TimestampValue out_min_val = TimestampValue::FromTColumnValue(out->min);
+    if (in_min_val < out_min_val) out->min.__set_timestamp_val(in.min.timestamp_val);
+    TimestampValue in_max_val = TimestampValue::FromTColumnValue(in.max);
+    TimestampValue out_max_val = TimestampValue::FromTColumnValue(out->max);
+    if (in_max_val > out_max_val) out->max.__set_timestamp_val(in.max.timestamp_val);
+  }
+}
+
+void TimestampMinMaxFilter::Copy(const TMinMaxFilter& in, TMinMaxFilter* out) {
+  out->min.__set_timestamp_val(in.min.timestamp_val);
+  out->__isset.min = true;
+  out->max.__set_timestamp_val(in.max.timestamp_val);
+  out->__isset.max = true;
+}
+
+// MinMaxFilter
+bool MinMaxFilter::GetCastIntMinMax(
+    const ColumnType& type, int64_t* out_min, int64_t* out_max) {
+  DCHECK(false) << "Casting min-max filters of type " << this->type()
+      << " not supported.";
+  return true;
+}
+
+MinMaxFilter* MinMaxFilter::Create(ColumnType type, ObjectPool* pool, MemPool* mem_pool) {
+  switch (type.type) {
+    case PrimitiveType::TYPE_BOOLEAN:
+      return pool->Add(new BoolMinMaxFilter());
+    case PrimitiveType::TYPE_TINYINT:
+      return pool->Add(new TinyIntMinMaxFilter());
+    case PrimitiveType::TYPE_SMALLINT:
+      return pool->Add(new SmallIntMinMaxFilter());
+    case PrimitiveType::TYPE_INT:
+      return pool->Add(new IntMinMaxFilter());
+    case PrimitiveType::TYPE_BIGINT:
+      return pool->Add(new BigIntMinMaxFilter());
+    case PrimitiveType::TYPE_FLOAT:
+      return pool->Add(new FloatMinMaxFilter());
+    case PrimitiveType::TYPE_DOUBLE:
+      return pool->Add(new DoubleMinMaxFilter());
+    case PrimitiveType::TYPE_STRING:
+      return pool->Add(new StringMinMaxFilter(mem_pool));
+    case PrimitiveType::TYPE_TIMESTAMP:
+      return pool->Add(new TimestampMinMaxFilter());
+    default:
+      DCHECK(false) << "Unsupported MinMaxFilter type: " << type;
+  }
+  return nullptr;
+}
+
+MinMaxFilter* MinMaxFilter::Create(
+    const TMinMaxFilter& thrift, ColumnType type, ObjectPool* pool, MemPool* mem_pool) {
+  switch (type.type) {
+    case PrimitiveType::TYPE_BOOLEAN:
+      return pool->Add(new BoolMinMaxFilter(thrift));
+    case PrimitiveType::TYPE_TINYINT:
+      return pool->Add(new TinyIntMinMaxFilter(thrift));
+    case PrimitiveType::TYPE_SMALLINT:
+      return pool->Add(new SmallIntMinMaxFilter(thrift));
+    case PrimitiveType::TYPE_INT:
+      return pool->Add(new IntMinMaxFilter(thrift));
+    case PrimitiveType::TYPE_BIGINT:
+      return pool->Add(new BigIntMinMaxFilter(thrift));
+    case PrimitiveType::TYPE_FLOAT:
+      return pool->Add(new FloatMinMaxFilter(thrift));
+    case PrimitiveType::TYPE_DOUBLE:
+      return pool->Add(new DoubleMinMaxFilter(thrift));
+    case PrimitiveType::TYPE_STRING:
+      return pool->Add(new StringMinMaxFilter(thrift, mem_pool));
+    case PrimitiveType::TYPE_TIMESTAMP:
+      return pool->Add(new TimestampMinMaxFilter(thrift));
+    default:
+      DCHECK(false) << "Unsupported MinMaxFilter type: " << type;
+  }
+  return nullptr;
+}
+
+void MinMaxFilter::Or(const TMinMaxFilter& in, TMinMaxFilter* out) {
+  if (in.always_false || out->always_true) return;
+  if (in.always_true) {
+    out->__set_always_true(true);
+    return;
+  }
+  if (in.min.__isset.bool_val) {
+    DCHECK(out->min.__isset.bool_val);
+    BoolMinMaxFilter::Or(in, out);
+    return;
+  } else if (in.min.__isset.byte_val) {
+    DCHECK(out->min.__isset.byte_val);
+    TinyIntMinMaxFilter::Or(in, out);
+    return;
+  } else if (in.min.__isset.short_val) {
+    DCHECK(out->min.__isset.short_val);
+    SmallIntMinMaxFilter::Or(in, out);
+    return;
+  } else if (in.min.__isset.int_val) {
+    DCHECK(out->min.__isset.int_val);
+    IntMinMaxFilter::Or(in, out);
+    return;
+  } else if (in.min.__isset.long_val) {
+    DCHECK(out->min.__isset.long_val);
+    BigIntMinMaxFilter::Or(in, out);
+    return;
+  } else if (in.min.__isset.double_val) {
+    // Handles FloatMinMaxFilter also as TColumnValue doesn't have a float type.
+    DCHECK(out->min.__isset.double_val);
+    DoubleMinMaxFilter::Or(in, out);
+    return;
+  } else if (in.min.__isset.string_val) {
+    DCHECK(out->min.__isset.string_val);
+    StringMinMaxFilter::Or(in, out);
+    return;
+  } else if (in.min.__isset.timestamp_val) {
+    DCHECK(out->min.__isset.timestamp_val);
+    TimestampMinMaxFilter::Or(in, out);
+    return;
+  }
+  DCHECK(false) << "Unsupported MinMaxFilter type.";
+}
+
+void MinMaxFilter::Copy(const TMinMaxFilter& in, TMinMaxFilter* out) {
+  out->__set_always_false(in.always_false);
+  out->__set_always_true(in.always_true);
+  if (in.always_false || in.always_true) return;
+  if (in.min.__isset.bool_val) {
+    DCHECK(!out->min.__isset.bool_val);
+    BoolMinMaxFilter::Copy(in, out);
+    return;
+  } else if (in.min.__isset.byte_val) {
+    DCHECK(!out->min.__isset.byte_val);
+    TinyIntMinMaxFilter::Copy(in, out);
+    return;
+  } else if (in.min.__isset.short_val) {
+    DCHECK(!out->min.__isset.short_val);
+    SmallIntMinMaxFilter::Copy(in, out);
+    return;
+  } else if (in.min.__isset.int_val) {
+    DCHECK(!out->min.__isset.int_val);
+    IntMinMaxFilter::Copy(in, out);
+    return;
+  } else if (in.min.__isset.long_val) {
+    DCHECK(!out->min.__isset.long_val);
+    BigIntMinMaxFilter::Copy(in, out);
+    return;
+  } else if (in.min.__isset.double_val) {
+    // Handles FloatMinMaxFilter also as TColumnValue doesn't have a float type.
+    DCHECK(!out->min.__isset.double_val);
+    DoubleMinMaxFilter::Copy(in, out);
+    return;
+  } else if (in.min.__isset.string_val) {
+    DCHECK(!out->min.__isset.string_val);
+    StringMinMaxFilter::Copy(in, out);
+    return;
+  } else if (in.min.__isset.timestamp_val) {
+    DCHECK(!out->min.__isset.timestamp_val);
+    TimestampMinMaxFilter::Copy(in, out);
+    return;
+  }
+  DCHECK(false) << "Unsupported MinMaxFilter type.";
+}
+
+} // namespace impala

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/util/min-max-filter.h
----------------------------------------------------------------------
diff --git a/be/src/util/min-max-filter.h b/be/src/util/min-max-filter.h
new file mode 100644
index 0000000..556f5fa
--- /dev/null
+++ b/be/src/util/min-max-filter.h
@@ -0,0 +1,231 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_UTIL_MIN_MAX_FILTER_H
+#define IMPALA_UTIL_MIN_MAX_FILTER_H
+
+#include "gen-cpp/ImpalaInternalService_types.h"
+#include "impala-ir/impala-ir-functions.h"
+#include "runtime/string-buffer.h"
+#include "runtime/string-value.h"
+#include "runtime/timestamp-value.h"
+#include "runtime/types.h"
+
+namespace impala {
+
+class MemPool;
+class ObjectPool;
+
+/// A MinMaxFilter tracks the min and max currently seen values in a data set for use in
+/// runtime filters.
+///
+/// Filters are constructed using MinMaxFilter::Create() which returns a MinMaxFilter of
+/// the appropriate type. Values can then be added using Insert(), and the min and max can
+/// be retrieved using GetMin()/GetMax().
+///
+/// MinMaxFilters ignore NULL values, and so are only appropriate to use as a runtime
+/// filter if the join predicate is '=' and not 'is not distinct from'.
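+///
+/// A minimal sketch of the intended call pattern ('obj_pool' and 'mem_pool' are
+/// assumed to be owned by the caller):
+///
+///   MinMaxFilter* f =
+///       MinMaxFilter::Create(ColumnType(PrimitiveType::TYPE_INT), &obj_pool, &mem_pool);
+///   int32_t v = 42;
+///   f->Insert(&v);
+///   if (!f->AlwaysFalse()) {
+///     int32_t min = *reinterpret_cast<int32_t*>(f->GetMin());
+///     int32_t max = *reinterpret_cast<int32_t*>(f->GetMax());
+///   }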
+class MinMaxFilter {
+ public:
+  virtual ~MinMaxFilter() {}
+
+  /// Returns the min/max values in the tuple slot representation. It is not valid to call
+  /// these functions if AlwaysFalse() returns true.
+  virtual void* GetMin() = 0;
+  virtual void* GetMax() = 0;
+
+  /// Returns the min/max values in the out parameters 'out_min'/'out_max', converted to
+  /// fit into 'type', e.g. if the calculated max value is greater than the max value for
+  /// 'type', the returned max is the max for 'type'. Returns false if the entire range
+  /// from the calculated min to max is outside the range for 'type'. May only be called
+  /// for integer-typed filters.
+  virtual bool GetCastIntMinMax(
+      const ColumnType& type, int64_t* out_min, int64_t* out_max);
+
+  virtual PrimitiveType type() = 0;
+
+  /// Add a new value, updating the current min/max.
+  virtual void Insert(void* val) = 0;
+
+  /// If true, this filter allows all rows to pass.
+  virtual bool AlwaysTrue() const = 0;
+
+  /// If true, this filter doesn't allow any rows to pass.
+  virtual bool AlwaysFalse() const = 0;
+
+  /// Materialize filter values by copying any values stored by filters into memory owned
+  /// by the filter. Filters may assume that the memory for Insert()-ed values stays valid
+  /// until this is called.
+  virtual void MaterializeValues() {}
+
+  /// Convert this filter to a thrift representation.
+  virtual void ToThrift(TMinMaxFilter* thrift) const = 0;
+
+  virtual std::string DebugString() const = 0;
+
+  /// Returns a new MinMaxFilter with the given type, allocated from 'pool'.
+  static MinMaxFilter* Create(ColumnType type, ObjectPool* pool, MemPool* mem_pool);
+
+  /// Returns a new MinMaxFilter created from the thrift representation, allocated from
+  /// 'pool'.
+  static MinMaxFilter* Create(
+      const TMinMaxFilter& thrift, ColumnType type, ObjectPool* pool, MemPool* mem_pool);
+
+  /// Computes the logical OR of 'in' with 'out' and stores the result in 'out'.
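+  /// For example, OR-ing a filter covering [4, 8] into one covering [2, 7] widens
+  /// 'out' to cover [2, 8].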
+  static void Or(const TMinMaxFilter& in, TMinMaxFilter* out);
+
+  /// Copies the contents of 'in' into 'out'.
+  static void Copy(const TMinMaxFilter& in, TMinMaxFilter* out);
+
+  /// Returns the LLVM_CLASS_NAME for the given type.
+  static std::string GetLlvmClassName(PrimitiveType type);
+
+  /// Returns the IRFunction::Type for Insert() for the given type.
+  static IRFunction::Type GetInsertIRFunctionType(PrimitiveType type);
+};
+
+#define NUMERIC_MIN_MAX_FILTER(NAME, TYPE)                                    \
+  class NAME##MinMaxFilter : public MinMaxFilter {                            \
+   public:                                                                    \
+    NAME##MinMaxFilter() {                                                    \
+      min_ = std::numeric_limits<TYPE>::max();                                \
+      max_ = std::numeric_limits<TYPE>::lowest();                             \
+    }                                                                         \
+    NAME##MinMaxFilter(const TMinMaxFilter& thrift);                          \
+    virtual ~NAME##MinMaxFilter() {}                                          \
+    virtual void* GetMin() override { return &min_; }                         \
+    virtual void* GetMax() override { return &max_; }                         \
+    virtual bool GetCastIntMinMax(                                            \
+        const ColumnType& type, int64_t* out_min, int64_t* out_max) override; \
+    virtual PrimitiveType type() override;                                    \
+    virtual void Insert(void* val) override;                                  \
+    virtual bool AlwaysTrue() const override { return false; }                \
+    virtual bool AlwaysFalse() const override {                               \
+      return min_ == std::numeric_limits<TYPE>::max()                         \
+          && max_ == std::numeric_limits<TYPE>::lowest();                     \
+    }                                                                         \
+    virtual void ToThrift(TMinMaxFilter* thrift) const override;              \
+    virtual std::string DebugString() const override;                         \
+    static void Or(const TMinMaxFilter& in, TMinMaxFilter* out);              \
+    static void Copy(const TMinMaxFilter& in, TMinMaxFilter* out);            \
+    static const char* LLVM_CLASS_NAME;                                       \
+                                                                              \
+   private:                                                                   \
+    TYPE min_;                                                                \
+    TYPE max_;                                                                \
+  };
+
+NUMERIC_MIN_MAX_FILTER(Bool, bool);
+NUMERIC_MIN_MAX_FILTER(TinyInt, int8_t);
+NUMERIC_MIN_MAX_FILTER(SmallInt, int16_t);
+NUMERIC_MIN_MAX_FILTER(Int, int32_t);
+NUMERIC_MIN_MAX_FILTER(BigInt, int64_t);
+NUMERIC_MIN_MAX_FILTER(Float, float);
+NUMERIC_MIN_MAX_FILTER(Double, double);
+
+class StringMinMaxFilter : public MinMaxFilter {
+ public:
+  StringMinMaxFilter(MemPool* mem_pool)
+    : min_buffer_(mem_pool),
+      max_buffer_(mem_pool),
+      always_false_(true),
+      always_true_(false) {}
+  StringMinMaxFilter(const TMinMaxFilter& thrift, MemPool* mem_pool);
+  virtual ~StringMinMaxFilter() {}
+
+  virtual void* GetMin() override { return &min_; }
+  virtual void* GetMax() override { return &max_; }
+  virtual PrimitiveType type() override;
+
+  virtual void Insert(void* val) override;
+  virtual bool AlwaysTrue() const override { return always_true_; }
+  virtual bool AlwaysFalse() const override { return always_false_; }
+
+  /// Copies the values pointed to by 'min_'/'max_' into 'min_buffer_'/'max_buffer_',
+  /// truncating them if necessary.
+  virtual void MaterializeValues() override;
+
+  virtual void ToThrift(TMinMaxFilter* thrift) const override;
+  virtual std::string DebugString() const override;
+
+  static void Or(const TMinMaxFilter& in, TMinMaxFilter* out);
+  static void Copy(const TMinMaxFilter& in, TMinMaxFilter* out);
+
+  /// Struct name in LLVM IR.
+  static const char* LLVM_CLASS_NAME;
+
+ private:
+  /// Copies the contents of 'value' into 'buffer', up to 'len', and reassigns 'value' to
+  /// point to 'buffer'. If an OOM is hit, disables the filter by setting 'always_true_'
+  /// to true.
+  void CopyToBuffer(StringBuffer* buffer, StringValue* value, int64_t len);
+
+  /// Sets 'always_true_' to true and clears the values of 'min_', 'max_', 'min_buffer_',
+  /// and 'max_buffer_'.
+  void SetAlwaysTrue();
+
+  /// The maximum length of string to store in 'min_buffer_'/'max_buffer_'. Strings
+  /// inserted into this filter that are longer than this will be truncated.
+  static const int MAX_BOUND_LENGTH;
+
+  /// The min/max values. After a call to MaterializeValues() these will point to
+  /// 'min_buffer_'/'max_buffer_'.
+  StringValue min_;
+  StringValue max_;
+
+  /// Local buffers to copy min/max data into. If Insert() was called and 'min_'/'max_'
+  /// was updated, these will be empty until MaterializeValues() is called.
+  StringBuffer min_buffer_;
+  StringBuffer max_buffer_;
+
+  /// True if no rows have been inserted.
+  bool always_false_;
+
+  /// True if the filter has been disabled, e.g. because a max bound could not be
+  /// truncated to a valid value or because copying a bound into a local buffer failed.
+  /// If true, all rows pass.
+  bool always_true_;
+};
+
+class TimestampMinMaxFilter : public MinMaxFilter {
+ public:
+  TimestampMinMaxFilter() { always_false_ = true; }
+  TimestampMinMaxFilter(const TMinMaxFilter& thrift);
+  virtual ~TimestampMinMaxFilter() {}
+
+  virtual void* GetMin() override { return &min_; }
+  virtual void* GetMax() override { return &max_; }
+  virtual PrimitiveType type() override;
+
+  virtual void Insert(void* val) override;
+  virtual bool AlwaysTrue() const override { return false; }
+  virtual bool AlwaysFalse() const override { return always_false_; }
+  virtual void ToThrift(TMinMaxFilter* thrift) const override;
+  virtual std::string DebugString() const override;
+
+  static void Or(const TMinMaxFilter& in, TMinMaxFilter* out);
+  static void Copy(const TMinMaxFilter& in, TMinMaxFilter* out);
+
+  /// Struct name in LLVM IR.
+  static const char* LLVM_CLASS_NAME;
+
+ private:
+  TimestampValue min_;
+  TimestampValue max_;
+
+  /// True if no rows have been inserted.
+  bool always_false_;
+};
+} // namespace impala
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/common/thrift/Data.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/Data.thrift b/common/thrift/Data.thrift
index 61d1988..6361e7e 100644
--- a/common/thrift/Data.thrift
+++ b/common/thrift/Data.thrift
@@ -28,6 +28,7 @@ struct TColumnValue {
   4: optional double double_val
   5: optional string string_val
   8: optional binary binary_val
+  9: optional binary timestamp_val
 }
 
 struct TResultRow {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/common/thrift/ImpalaInternalService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift
index 1f261f4..1fd9c25 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -33,6 +33,7 @@ include "DataSinks.thrift"
 include "Results.thrift"
 include "RuntimeProfile.thrift"
 include "ImpalaService.thrift"
+include "Data.thrift"
 
 // constants for TQueryOptions.num_nodes
 const i32 NUM_NODES_ALL = 0
@@ -191,14 +192,14 @@ struct TQueryOptions {
   // be rounded up to the nearest power of two.
   38: optional i32 runtime_bloom_filter_size = 1048576
 
-  // Time in ms to wait until partition filters are delivered. If 0, the default defined
+  // Time in ms to wait until runtime filters are delivered. If 0, the default defined
   // by the startup flag of the same name is used.
   39: optional i32 runtime_filter_wait_time_ms = 0
 
   // If true, per-row runtime filtering is disabled
   40: optional bool disable_row_runtime_filtering = false
 
-  // Maximum number of runtime filters allowed per query
+  // Maximum number of bloom runtime filters allowed per query
   41: optional i32 max_num_runtime_filters = 10
 
   // If true, use UTF-8 annotation for string columns. Note that char and varchar columns
@@ -226,10 +227,10 @@ struct TQueryOptions {
   // the files there.
   45: optional bool s3_skip_insert_staging = true
 
-  // Minimum runtime filter size, in bytes
+  // Minimum runtime bloom filter size, in bytes
   46: optional i32 runtime_filter_min_size = 1048576
 
-  // Maximum runtime filter size, in bytes
+  // Maximum runtime bloom filter size, in bytes
   47: optional i32 runtime_filter_max_size = 16777216
 
   // Prefetching behavior during hash tables' building and probing.
@@ -771,6 +772,16 @@ struct TBloomFilter {
   4: required bool always_false
 }
 
+struct TMinMaxFilter {
+  // If true, filter allows all elements to pass and 'min'/'max' will not be set.
+  1: required bool always_true
+
+  // If true, filter doesn't allow any elements to pass and 'min'/'max' will not be set.
+  2: required bool always_false
+
+  3: optional Data.TColumnValue min
+  4: optional Data.TColumnValue max
+}
 
 // UpdateFilter
 
@@ -787,6 +798,8 @@ struct TUpdateFilterParams {
 
   // required in V1
   4: optional TBloomFilter bloom_filter
+
+  5: optional TMinMaxFilter min_max_filter
 }
 
 struct TUpdateFilterResult {
@@ -812,6 +825,8 @@ struct TPublishFilterParams {
   // Actual bloom_filter payload
   // required in V1
   5: optional TBloomFilter bloom_filter
+
+  6: optional TMinMaxFilter min_max_filter
 }
 
 struct TPublishFilterResult {

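
As a concrete (and hedged) illustration of how a backend might fill in the new
struct: standard Thrift C++ codegen provides __set_* setters for these fields,
so packaging an i64 bound pair could look like the sketch below.
MakeIntMinMaxFilter is hypothetical glue, and it assumes TColumnValue's i64
field, which is not shown in the hunk above, is named 'long_val':

    #include <cstdint>

    #include "gen-cpp/Data_types.h"
    #include "gen-cpp/ImpalaInternalService_types.h"

    impala::TMinMaxFilter MakeIntMinMaxFilter(int64_t min_val, int64_t max_val,
                                              bool saw_rows) {
      impala::TMinMaxFilter f;
      f.__set_always_true(false);
      // A filter that saw no rows rejects everything; min/max stay unset.
      f.__set_always_false(!saw_rows);
      if (saw_rows) {
        impala::TColumnValue min_cv, max_cv;
        min_cv.__set_long_val(min_val);  // assumed field name
        max_cv.__set_long_val(max_val);
        f.__set_min(min_cv);
        f.__set_max(max_cv);
      }
      return f;
    }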
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/common/thrift/ImpalaService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift
index b8a073e..061da00 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -199,13 +199,13 @@ enum TImpalaQueryOptions {
   // two.
   RUNTIME_BLOOM_FILTER_SIZE,
 
-  // Time (in ms) to wait in scans for partition filters to arrive.
+  // Time (in ms) to wait in scans for runtime filters to arrive.
   RUNTIME_FILTER_WAIT_TIME_MS,
 
   // If true, disable application of runtime filters to individual rows.
   DISABLE_ROW_RUNTIME_FILTERING,
 
-  // Maximum number of runtime filters allowed per query.
+  // Maximum number of bloom runtime filters allowed per query.
   MAX_NUM_RUNTIME_FILTERS,
 
   // If true, use UTF-8 annotation for string columns. Note that char and varchar columns
@@ -227,10 +227,10 @@ enum TImpalaQueryOptions {
   // TODO: Find a way to get this working for INSERT OVERWRITEs too.
   S3_SKIP_INSERT_STAGING,
 
-  // Maximum runtime filter size, in bytes.
+  // Maximum runtime bloom filter size, in bytes.
   RUNTIME_FILTER_MAX_SIZE,
 
-  // Minimum runtime filter size, in bytes.
+  // Minimum runtime bloom filter size, in bytes.
   RUNTIME_FILTER_MIN_SIZE,
 
   // Prefetching behavior during hash tables' building and probing.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/common/thrift/PlanNodes.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/PlanNodes.thrift b/common/thrift/PlanNodes.thrift
index c04d08a..97ef1b3 100644
--- a/common/thrift/PlanNodes.thrift
+++ b/common/thrift/PlanNodes.thrift
@@ -98,6 +98,16 @@ struct TRuntimeFilterTargetDesc {
   // Indicates if this target is on the same fragment as the join that
   // produced the runtime filter
   5: required bool is_local_target
+
+  // If the target node is a Kudu scan node, the name of the targeted column, with the
+  // capitalization it has in Kudu, and the column's type.
+  6: optional string kudu_col_name
+  7: optional Types.TColumnType kudu_col_type
+}
+
+enum TRuntimeFilterType {
+  BLOOM,
+  MIN_MAX
 }
 
 // Specification of a runtime filter.
@@ -132,6 +142,9 @@ struct TRuntimeFilterDesc {
   // The estimated number of distinct values that the planner expects the filter to hold.
   // Used to compute the size of the filter.
   9: optional i64 ndv_estimate
+
+  // The type of runtime filter to build.
+  10: required TRuntimeFilterType type
 }
 
 // The information contained in subclasses of ScanNode captured in two separate

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java b/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
index 917d918..d04a15e 100644
--- a/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
@@ -189,7 +189,7 @@ public class HashJoinNode extends JoinNode {
       }
       if (!runtimeFilters_.isEmpty()) {
         output.append(detailPrefix + "runtime filters: ");
-        output.append(getRuntimeFilterExplainString(true));
+        output.append(getRuntimeFilterExplainString(true, detailLevel));
       }
     }
     return output.toString();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index ad8501a..44f58eb 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -978,7 +978,7 @@ public class HdfsScanNode extends ScanNode {
       }
       if (!runtimeFilters_.isEmpty()) {
         output.append(detailPrefix + "runtime filters: ");
-        output.append(getRuntimeFilterExplainString(false));
+        output.append(getRuntimeFilterExplainString(false, detailLevel));
       }
     }
     if (detailLevel.ordinal() >= TExplainLevel.EXTENDED.ordinal()) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
index cbc132b..390592e 100644
--- a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
@@ -295,6 +295,10 @@ public class KuduScanNode extends ScanNode {
           result.append(detailPrefix + "kudu predicates: " + getExplainString(
               kuduConjuncts_) + "\n");
         }
+        if (!runtimeFilters_.isEmpty()) {
+          result.append(detailPrefix + "runtime filters: ");
+          result.append(getRuntimeFilterExplainString(false, detailLevel));
+        }
       }
     }
     return result.toString();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/fe/src/main/java/org/apache/impala/planner/PlanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/PlanNode.java b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
index c149b5c..a14c89a 100644
--- a/fe/src/main/java/org/apache/impala/planner/PlanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
@@ -705,25 +705,28 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
 
   protected Collection<RuntimeFilter> getRuntimeFilters() { return runtimeFilters_; }
 
-  protected String getRuntimeFilterExplainString(boolean isBuildNode) {
+  protected String getRuntimeFilterExplainString(
+      boolean isBuildNode, TExplainLevel detailLevel) {
     if (runtimeFilters_.isEmpty()) return "";
-    final String applyNodeFilterFormat = "%s -> %s";
-    final String buildNodeFilterFormat = "%s <- %s";
-    String format = isBuildNode ? buildNodeFilterFormat : applyNodeFilterFormat;
-    StringBuilder output = new StringBuilder();
     List<String> filtersStr = Lists.newArrayList();
     for (RuntimeFilter filter: runtimeFilters_) {
-      Expr expr = null;
+      StringBuilder filterStr = new StringBuilder();
+      filterStr.append(filter.getFilterId());
+      if (detailLevel.ordinal() >= TExplainLevel.EXTENDED.ordinal()) {
+        filterStr.append("[");
+        filterStr.append(filter.getType().toString().toLowerCase());
+        filterStr.append("]");
+      }
       if (isBuildNode) {
-        expr = filter.getSrcExpr();
+        filterStr.append(" <- ");
+        filterStr.append(filter.getSrcExpr().toSql());
       } else {
-        expr = filter.getTargetExpr(getId());
+        filterStr.append(" -> ");
+        filterStr.append(filter.getTargetExpr(getId()).toSql());
       }
-      Preconditions.checkNotNull(expr);
-      filtersStr.add(String.format(format, filter.getFilterId(), expr.toSql()));
+      filtersStr.add(filterStr.toString());
     }
-    output.append(Joiner.on(", ").join(filtersStr) + "\n");
-    return output.toString();
+    return Joiner.on(", ").join(filtersStr) + "\n";
   }
 
   /**

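
The effect of threading 'detailLevel' through is that the filter type is shown
only at EXTENDED level and above, while STANDARD output keeps the old id-only
form. Using RF000 from the fk/pk planner tests below as an example:

    runtime filters: RF000 <- c_customer_sk          (STANDARD, build side)
    runtime filters: RF000[bloom] <- c_customer_sk   (EXTENDED, build side)
    runtime filters: RF000[bloom] -> ss_customer_sk  (EXTENDED, probe side)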
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java b/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java
index 98365e6..758e79d 100644
--- a/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java
+++ b/fe/src/main/java/org/apache/impala/planner/RuntimeFilterGenerator.java
@@ -26,6 +26,8 @@ import java.util.Set;
 
 import org.apache.impala.analysis.Analyzer;
 import org.apache.impala.analysis.BinaryPredicate;
+import org.apache.impala.analysis.BinaryPredicate.Operator;
+import org.apache.impala.analysis.CastExpr;
 import org.apache.impala.analysis.Expr;
 import org.apache.impala.analysis.ExprSubstitutionMap;
 import org.apache.impala.analysis.Predicate;
@@ -35,6 +37,7 @@ import org.apache.impala.analysis.SlotRef;
 import org.apache.impala.analysis.TupleDescriptor;
 import org.apache.impala.analysis.TupleId;
 import org.apache.impala.analysis.TupleIsNullPredicate;
+import org.apache.impala.catalog.KuduColumn;
 import org.apache.impala.catalog.Table;
 import org.apache.impala.catalog.Type;
 import org.apache.impala.common.AnalysisException;
@@ -44,6 +47,7 @@ import org.apache.impala.planner.PlanNode;
 import org.apache.impala.thrift.TRuntimeFilterDesc;
 import org.apache.impala.thrift.TRuntimeFilterMode;
 import org.apache.impala.thrift.TRuntimeFilterTargetDesc;
+import org.apache.impala.thrift.TRuntimeFilterType;
 
 import com.google.common.base.Joiner;
 import com.google.common.base.Preconditions;
@@ -107,6 +111,8 @@ public final class RuntimeFilterGenerator {
     private final Expr srcExpr_;
     // Expr (lhs of join predicate) from which the targetExprs_ are generated.
     private final Expr origTargetExpr_;
+    // The operator comparing 'srcExpr_' and 'origTargetExpr_'.
+    private final Operator exprCmpOp_;
     // Runtime filter targets
     private final List<RuntimeFilterTarget> targets_ = Lists.newArrayList();
     // Slots from base table tuples that have value transfer from the slots
@@ -131,6 +137,8 @@ public final class RuntimeFilterGenerator {
     // If set, indicates that the filter can't be assigned to another scan node.
     // Once set, it can't be unset.
     private boolean finalized_ = false;
+    // The type of filter to build.
+    private TRuntimeFilterType type_;
 
     /**
      * Internal representation of a runtime filter target.
@@ -165,6 +173,14 @@ public final class RuntimeFilterGenerator {
         tFilterTarget.setTarget_expr_slotids(tSlotIds);
         tFilterTarget.setIs_bound_by_partition_columns(isBoundByPartitionColumns);
         tFilterTarget.setIs_local_target(isLocalTarget);
+        if (node instanceof KuduScanNode) {
+          // assignRuntimeFilters() only assigns KuduScanNode targets if the target expr
+          // is a slot ref, possibly with an implicit cast, pointing to a column.
+          SlotRef slotRef = expr.unwrapSlotRef(true);
+          KuduColumn col = (KuduColumn) slotRef.getDesc().getColumn();
+          tFilterTarget.setKudu_col_name(col.getKuduName());
+          tFilterTarget.setKudu_col_type(col.getType().toThrift());
+        }
         return tFilterTarget;
       }
 
@@ -179,13 +195,16 @@ public final class RuntimeFilterGenerator {
       }
     }
 
-    private RuntimeFilter(RuntimeFilterId filterId, JoinNode filterSrcNode,
-        Expr srcExpr, Expr origTargetExpr, Map<TupleId, List<SlotId>> targetSlots) {
+    private RuntimeFilter(RuntimeFilterId filterId, JoinNode filterSrcNode, Expr srcExpr,
+        Expr origTargetExpr, Operator exprCmpOp, Map<TupleId, List<SlotId>> targetSlots,
+        TRuntimeFilterType type) {
       id_ = filterId;
       src_ = filterSrcNode;
       srcExpr_ = srcExpr;
       origTargetExpr_ = origTargetExpr;
+      exprCmpOp_ = exprCmpOp;
       targetSlotsByTid_ = targetSlots;
+      type_ = type;
       computeNdvEstimate();
     }
 
@@ -221,6 +240,7 @@ public final class RuntimeFilterGenerator {
             appliedOnPartitionColumns && target.isBoundByPartitionColumns;
       }
       tFilter.setApplied_on_partition_columns(appliedOnPartitionColumns);
+      tFilter.setType(type_);
       return tFilter;
     }
 
@@ -230,7 +250,8 @@ public final class RuntimeFilterGenerator {
      * or null if a runtime filter cannot be generated from the specified predicate.
      */
     public static RuntimeFilter create(IdGenerator<RuntimeFilterId> idGen,
-        Analyzer analyzer, Expr joinPredicate, JoinNode filterSrcNode) {
+        Analyzer analyzer, Expr joinPredicate, JoinNode filterSrcNode,
+        TRuntimeFilterType type) {
       Preconditions.checkNotNull(idGen);
       Preconditions.checkNotNull(joinPredicate);
       Preconditions.checkNotNull(filterSrcNode);
@@ -256,8 +277,8 @@ public final class RuntimeFilterGenerator {
       if (LOG.isTraceEnabled()) {
         LOG.trace("Generating runtime filter from predicate " + joinPredicate);
       }
-      return new RuntimeFilter(idGen.getNextId(), filterSrcNode,
-          srcExpr, targetExpr, targetSlots);
+      return new RuntimeFilter(idGen.getNextId(), filterSrcNode, srcExpr, targetExpr,
+          normalizedJoinConjunct.getOp(), targetSlots, type);
     }
 
     /**
@@ -337,6 +358,8 @@ public final class RuntimeFilterGenerator {
     public Expr getOrigTargetExpr() { return origTargetExpr_; }
     public Map<TupleId, List<SlotId>> getTargetSlots() { return targetSlotsByTid_; }
     public RuntimeFilterId getFilterId() { return id_; }
+    public TRuntimeFilterType getType() { return type_; }
+    public Operator getExprCompOp() { return exprCmpOp_; }
 
     /**
      * Estimates the selectivity of a runtime filter as the cardinality of the
@@ -394,14 +417,14 @@ public final class RuntimeFilterGenerator {
   public static void generateRuntimeFilters(PlannerContext ctx, PlanNode plan) {
     Preconditions.checkNotNull(ctx);
     Preconditions.checkNotNull(ctx.getQueryOptions());
-    int maxNumFilters = ctx.getQueryOptions().getMax_num_runtime_filters();
-    Preconditions.checkState(maxNumFilters >= 0);
+    int maxNumBloomFilters = ctx.getQueryOptions().getMax_num_runtime_filters();
+    Preconditions.checkState(maxNumBloomFilters >= 0);
     RuntimeFilterGenerator filterGenerator = new RuntimeFilterGenerator();
     filterGenerator.generateFilters(ctx, plan);
     List<RuntimeFilter> filters = Lists.newArrayList(filterGenerator.getRuntimeFilters());
-    if (filters.size() > maxNumFilters) {
-      // If more than 'maxNumFilters' were generated, sort them by increasing selectivity
-      // and keep the 'maxNumFilters' most selective.
+    if (filters.size() > maxNumBloomFilters) {
+      // If more than 'maxNumBloomFilters' were generated, sort them by increasing
+      // selectivity and keep the 'maxNumBloomFilters' most selective bloom filters.
       Collections.sort(filters, new Comparator<RuntimeFilter>() {
           public int compare(RuntimeFilter a, RuntimeFilter b) {
             double aSelectivity =
@@ -413,8 +436,14 @@ public final class RuntimeFilterGenerator {
         }
       );
     }
-    for (RuntimeFilter filter:
-         filters.subList(0, Math.min(filters.size(), maxNumFilters))) {
+    // We only enforce a limit on the number of bloom filters as they are much more
+    // heavyweight than the other filter types.
+    int numBloomFilters = 0;
+    for (RuntimeFilter filter : filters) {
+      if (filter.getType() == TRuntimeFilterType.BLOOM) {
+        if (numBloomFilters >= maxNumBloomFilters) continue;
+        ++numBloomFilters;
+      }
       filter.setIsBroadcast(
           filter.src_.getDistributionMode() == DistributionMode.BROADCAST);
       filter.computeHasLocalTargets();
@@ -462,12 +491,14 @@ public final class RuntimeFilterGenerator {
       }
       joinConjuncts.addAll(joinNode.getConjuncts());
       List<RuntimeFilter> filters = Lists.newArrayList();
-      for (Expr conjunct: joinConjuncts) {
-        RuntimeFilter filter = RuntimeFilter.create(filterIdGenerator,
-            ctx.getRootAnalyzer(), conjunct, joinNode);
-        if (filter == null) continue;
-        registerRuntimeFilter(filter);
-        filters.add(filter);
+      for (TRuntimeFilterType type : TRuntimeFilterType.values()) {
+        for (Expr conjunct : joinConjuncts) {
+          RuntimeFilter filter = RuntimeFilter.create(
+              filterIdGenerator, ctx.getRootAnalyzer(), conjunct, joinNode, type);
+          if (filter == null) continue;
+          registerRuntimeFilter(filter);
+          filters.add(filter);
+        }
       }
       generateFilters(ctx, root.getChild(0));
       // Finalize every runtime filter of that join. This is to ensure that we don't
@@ -538,11 +569,14 @@ public final class RuntimeFilterGenerator {
    * 2. If the RUNTIME_FILTER_MODE query option is set to LOCAL, a filter is only assigned
    *    to 'scanNode' if the filter is produced within the same fragment that contains the
    *    scan node.
+   * 3. Only Hdfs and Kudu scan nodes are supported:
+   *     a. If the target is an HdfsScanNode, the filter must be of type BLOOM.
+   *     b. If the target is a KuduScanNode, the filter must be of type MIN_MAX, the
+   *        target must be a slot ref on a column, and the comparison op cannot be
+   *        'not distinct'.
    * A scan node may be used as a destination node for multiple runtime filters.
-   * Currently, runtime filters can only be assigned to HdfsScanNodes.
    */
   private void assignRuntimeFilters(PlannerContext ctx, ScanNode scanNode) {
-    if (!(scanNode instanceof HdfsScanNode)) return;
+    if (!(scanNode instanceof HdfsScanNode || scanNode instanceof KuduScanNode)) return;
     TupleId tid = scanNode.getTupleIds().get(0);
     if (!runtimeFiltersByTid_.containsKey(tid)) return;
     Analyzer analyzer = ctx.getRootAnalyzer();
@@ -558,6 +592,26 @@ public final class RuntimeFilterGenerator {
       if (disableRowRuntimeFiltering && !isBoundByPartitionColumns) continue;
       boolean isLocalTarget = isLocalTarget(filter, scanNode);
       if (runtimeFilterMode == TRuntimeFilterMode.LOCAL && !isLocalTarget) continue;
+
+      // Check that the scan node supports applying filters of this type and targetExpr.
+      if (scanNode instanceof HdfsScanNode
+          && filter.getType() != TRuntimeFilterType.BLOOM) {
+        continue;
+      } else if (scanNode instanceof KuduScanNode) {
+        if (filter.getType() != TRuntimeFilterType.MIN_MAX) continue;
+        SlotRef slotRef = targetExpr.unwrapSlotRef(true);
+        // Kudu only supports targeting a single column, not general exprs, so the target
+        // must be a SlotRef pointing to a column. We can allow implicit integer casts
+        // by casting the min/max values before sending them to Kudu.
+        // Kudu also cannot currently return nulls if a filter is applied, so it does not
+        // work with "is not distinct".
+        if (slotRef == null || slotRef.getDesc().getColumn() == null
+            || (targetExpr instanceof CastExpr && !targetExpr.getType().isIntegerType())
+            || filter.getExprCompOp() == Operator.NOT_DISTINCT) {
+          continue;
+        }
+      }
+
       RuntimeFilter.RuntimeFilterTarget target = new RuntimeFilter.RuntimeFilterTarget(
           scanNode, targetExpr, isBoundByPartitionColumns, isLocalTarget);
       filter.addTarget(target);

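
The SlotRef-only restriction follows from how a min/max pair is applied in
Kudu: as two per-column comparison predicates on the scan. The sketch below
shows that mapping with the public Kudu C++ client API
(KuduTable::NewComparisonPredicate, KuduScanner::AddConjunctPredicate and
KuduValue::FromInt are real client calls; ApplyMinMax and its int64 arguments
are hypothetical glue, not the patch's scanner code):

    #include <cstdint>
    #include <string>

    #include <kudu/client/client.h>

    using kudu::Status;
    using kudu::client::KuduPredicate;
    using kudu::client::KuduScanner;
    using kudu::client::KuduTable;
    using kudu::client::KuduValue;

    // Turns a [min_val, max_val] bound into two conjunct scan predicates.
    // This is also why implicit integer casts are tolerable: the bounds can
    // be cast to the Kudu column's integer type before being handed over.
    Status ApplyMinMax(KuduTable* table, KuduScanner* scanner,
                       const std::string& kudu_col_name,
                       int64_t min_val, int64_t max_val) {
      // AddConjunctPredicate() takes ownership of the predicate.
      Status s = scanner->AddConjunctPredicate(table->NewComparisonPredicate(
          kudu_col_name, KuduPredicate::GREATER_EQUAL, KuduValue::FromInt(min_val)));
      if (!s.ok()) return s;
      return scanner->AddConjunctPredicate(table->NewComparisonPredicate(
          kudu_col_name, KuduPredicate::LESS_EQUAL, KuduValue::FromInt(max_val)));
    }

Because a comparison predicate never passes NULLs, this mapping cannot express
'is not distinct' semantics (which must keep NULL = NULL matches), matching the
exclusion of Operator.NOT_DISTINCT above.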
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
index f49e39c..760334d 100644
--- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
@@ -497,4 +497,11 @@ public class PlannerTest extends PlannerTestBase {
     requestWithDisableSpillOn = frontend_.createExecRequest(queryCtx, explainBuilder);
     Assert.assertNotNull(requestWithDisableSpillOn);
   }
+
+  @Test
+  public void testMinMaxRuntimeFilters() {
+    TQueryOptions options = defaultQueryOptions();
+    options.setExplain_level(TExplainLevel.EXTENDED);
+    runPlannerTestFile("min-max-runtime-filters", options);
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test b/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test
index 15db74d..cf5a78b 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test
@@ -1245,7 +1245,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey, l_returnflag = o_clerk
-|  runtime filters: RF002 <- o_orderkey, RF003 <- o_clerk
+|  runtime filters: RF004 <- o_orderkey, RF005 <- o_clerk
 |
 |--07:EXCHANGE [HASH(o_orderkey,o_clerk)]
 |  |
@@ -1257,7 +1257,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch_parquet.lineitem]
    partitions=1/1 files=3 size=193.92MB
-   runtime filters: RF002 -> l_orderkey, RF003 -> l_returnflag
+   runtime filters: RF004 -> l_orderkey, RF005 -> l_returnflag
 ====
 # IMPALA-4263: Grouping agg needs a merge step because the grouping exprs reference a
 # tuple that is made nullable in the join fragment.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test b/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
index 9dc9f22..5571b21 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
@@ -12,7 +12,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_customer_sk = c_customer_sk
 |  fk/pk conjuncts: ss_customer_sk = c_customer_sk
-|  runtime filters: RF000 <- c_customer_sk
+|  runtime filters: RF000[bloom] <- c_customer_sk
 |  mem-estimate=8.50MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=0,1 row-size=355B cardinality=529700
 |
@@ -28,7 +28,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_customer_sk
+   runtime filters: RF000[bloom] -> ss_customer_sk
    stats-rows=2880404 extrapolated-rows=disabled
    table stats: rows=2880404 size=326.32MB
    column stats: all
@@ -87,7 +87,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [RIGHT OUTER JOIN]
 |  hash predicates: ss_customer_sk = c_customer_sk
 |  fk/pk conjuncts: ss_customer_sk = c_customer_sk
-|  runtime filters: RF000 <- c_customer_sk
+|  runtime filters: RF000[bloom] <- c_customer_sk
 |  mem-estimate=8.50MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=0N,1 row-size=355B cardinality=529700
 |
@@ -103,7 +103,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_customer_sk
+   runtime filters: RF000[bloom] -> ss_customer_sk
    stats-rows=2880404 extrapolated-rows=disabled
    table stats: rows=2880404 size=326.32MB
    column stats: all
@@ -124,7 +124,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_item_sk = sr_item_sk, ss_ticket_number = sr_ticket_number
 |  fk/pk conjuncts: ss_item_sk = sr_item_sk, ss_ticket_number = sr_ticket_number
-|  runtime filters: RF000 <- sr_item_sk, RF001 <- sr_ticket_number
+|  runtime filters: RF000[bloom] <- sr_item_sk, RF001[bloom] <- sr_ticket_number
 |  mem-estimate=4.75MB mem-reservation=4.75MB spill-buffer=256.00KB
 |  tuple-ids=0,1 row-size=188B cardinality=211838
 |
@@ -140,7 +140,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_item_sk, RF001 -> ss_ticket_number
+   runtime filters: RF000[bloom] -> ss_item_sk, RF001[bloom] -> ss_ticket_number
    stats-rows=2880404 extrapolated-rows=disabled
    table stats: rows=2880404 size=326.32MB
    column stats: all
@@ -160,7 +160,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_time_sk = ws_sold_time_sk
 |  fk/pk conjuncts: none
-|  runtime filters: RF000 <- ws_sold_time_sk
+|  runtime filters: RF000[bloom] <- ws_sold_time_sk
 |  mem-estimate=108.67MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=244B cardinality=44136418
 |
@@ -174,7 +174,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_sold_time_sk
+   runtime filters: RF000[bloom] -> ss_sold_time_sk
    stats-rows=2880404 extrapolated-rows=disabled
    table stats: rows=2880404 size=326.32MB
    column stats: all
@@ -195,7 +195,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: b.d_date_sk = a.d_date_sk
 |  fk/pk conjuncts: b.d_date_sk = a.d_date_sk
-|  runtime filters: RF000 <- a.d_date_sk
+|  runtime filters: RF000[bloom] <- a.d_date_sk
 |  mem-estimate=17.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=1,0 row-size=606B cardinality=36525
 |
@@ -211,7 +211,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.date_dim b]
    partitions=1/1 files=1 size=9.84MB
-   runtime filters: RF000 -> b.d_date_sk
+   runtime filters: RF000[bloom] -> b.d_date_sk
    stats-rows=73049 extrapolated-rows=disabled
    table stats: rows=73049 size=9.84MB
    column stats: all
@@ -236,7 +236,7 @@ PLAN-ROOT SINK
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_addr_sk = c_current_addr_sk
 |  fk/pk conjuncts: none
-|  runtime filters: RF000 <- c_current_addr_sk
+|  runtime filters: RF000[bloom] <- c_current_addr_sk
 |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  tuple-ids=1,0,3,4,2 row-size=60B cardinality=19358
 |
@@ -251,7 +251,7 @@ PLAN-ROOT SINK
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: sr_returned_date_sk = d2.d_date_sk
 |  fk/pk conjuncts: sr_returned_date_sk = d2.d_date_sk
-|  runtime filters: RF001 <- d2.d_date_sk
+|  runtime filters: RF002[bloom] <- d2.d_date_sk
 |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  tuple-ids=1,0,3,4 row-size=56B cardinality=8131
 |
@@ -266,14 +266,14 @@ PLAN-ROOT SINK
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: sr_item_sk = ss_item_sk, sr_ticket_number = ss_ticket_number
 |  fk/pk conjuncts: sr_item_sk = ss_item_sk, sr_ticket_number = ss_ticket_number
-|  runtime filters: RF002 <- ss_item_sk, RF003 <- ss_ticket_number
+|  runtime filters: RF004[bloom] <- ss_item_sk, RF005[bloom] <- ss_ticket_number
 |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  tuple-ids=1,0,3 row-size=52B cardinality=8131
 |
 |--05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: ss_sold_date_sk = d1.d_date_sk
 |  |  fk/pk conjuncts: ss_sold_date_sk = d1.d_date_sk
-|  |  runtime filters: RF004 <- d1.d_date_sk
+|  |  runtime filters: RF008[bloom] <- d1.d_date_sk
 |  |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  |  tuple-ids=0,3 row-size=32B cardinality=11055
 |  |
@@ -289,7 +289,7 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpcds.store_sales]
 |     partitions=1824/1824 files=1824 size=326.32MB
-|     runtime filters: RF000 -> ss_addr_sk, RF004 -> ss_sold_date_sk
+|     runtime filters: RF000[bloom] -> ss_addr_sk, RF008[bloom] -> ss_sold_date_sk
 |     stats-rows=2880404 extrapolated-rows=disabled
 |     table stats: rows=2880404 size=326.32MB
 |     column stats: all
@@ -298,7 +298,7 @@ PLAN-ROOT SINK
 |
 01:SCAN HDFS [tpcds.store_returns]
    partitions=1/1 files=1 size=31.19MB
-   runtime filters: RF001 -> sr_returned_date_sk, RF002 -> sr_item_sk, RF003 -> sr_ticket_number
+   runtime filters: RF002[bloom] -> sr_returned_date_sk, RF004[bloom] -> sr_item_sk, RF005[bloom] -> sr_ticket_number
    stats-rows=287514 extrapolated-rows=disabled
    table stats: rows=287514 size=31.19MB
    column stats: all
@@ -318,7 +318,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_customer_sk % 10 = c_customer_sk / 100
 |  fk/pk conjuncts: assumed fk/pk
-|  runtime filters: RF000 <- c_customer_sk / 100
+|  runtime filters: RF000[bloom] <- c_customer_sk / 100
 |  mem-estimate=34.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=355B cardinality=2880404
 |
@@ -332,7 +332,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_customer_sk % 10
+   runtime filters: RF000[bloom] -> ss_customer_sk % 10
    stats-rows=2880404 extrapolated-rows=disabled
    table stats: rows=2880404 size=326.32MB
    column stats: all
@@ -353,7 +353,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_customer_sk = c_customer_sk
 |  fk/pk conjuncts: assumed fk/pk
-|  runtime filters: RF000 <- c_customer_sk
+|  runtime filters: RF000[bloom] <- c_customer_sk
 |  mem-estimate=2.00GB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=8B cardinality=2880404
 |
@@ -367,7 +367,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_customer_sk
+   runtime filters: RF000[bloom] -> ss_customer_sk
    stats-rows=2880404 extrapolated-rows=disabled
    table stats: rows=2880404 size=326.32MB
    column stats: all
@@ -387,7 +387,7 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_customer_sk = c_customer_sk
 |  fk/pk conjuncts: assumed fk/pk
-|  runtime filters: RF000 <- c_customer_sk
+|  runtime filters: RF000[bloom] <- c_customer_sk
 |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  tuple-ids=0,1 row-size=8B cardinality=unavailable
 |
@@ -401,7 +401,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds_seq_snap.store_sales]
    partitions=1824/1824 files=1824 size=207.85MB
-   runtime filters: RF000 -> ss_customer_sk
+   runtime filters: RF000[bloom] -> ss_customer_sk
    stats-rows=unavailable extrapolated-rows=disabled
    table stats: rows=unavailable size=unavailable
    column stats: unavailable
@@ -423,7 +423,7 @@ PLAN-ROOT SINK
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: ss_sold_time_sk = ws_sold_time_sk
 |  fk/pk conjuncts: none
-|  runtime filters: RF000 <- ws_sold_time_sk
+|  runtime filters: RF000[bloom] <- ws_sold_time_sk
 |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB
 |  tuple-ids=0,2 row-size=104B cardinality=2440073
 |
@@ -442,7 +442,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpcds.store_sales]
    partitions=1824/1824 files=1824 size=326.32MB
-   runtime filters: RF000 -> ss_sold_time_sk
+   runtime filters: RF000[bloom] -> ss_sold_time_sk
    stats-rows=2880404 extrapolated-rows=disabled
    table stats: rows=2880404 size=326.32MB
    column stats: all

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/implicit-joins.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/implicit-joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/implicit-joins.test
index e5aa5e8..184fd1c 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/implicit-joins.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/implicit-joins.test
@@ -193,14 +193,14 @@ PLAN-ROOT SINK
 |  |
 |  |--04:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: b.id = c.id
-|  |  |  runtime filters: RF001 <- c.id
+|  |  |  runtime filters: RF002 <- c.id
 |  |  |
 |  |  |--02:SCAN HDFS [functional.alltypestiny c]
 |  |  |     partitions=4/4 files=4 size=460B
 |  |  |
 |  |  01:SCAN HDFS [functional.alltypes b]
 |  |     partitions=24/24 files=24 size=478.45KB
-|  |     runtime filters: RF001 -> b.id
+|  |     runtime filters: RF002 -> b.id
 |  |
 |  00:SCAN HDFS [functional.alltypestiny a]
 |     partitions=4/4 files=4 size=460B

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test
index 556ba65..c75c02d 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/inline-view-limit.test
@@ -391,7 +391,7 @@ PLAN-ROOT SINK
 |
 |--03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.id = functional.alltypestiny.id
-|  |  runtime filters: RF001 <- functional.alltypestiny.id
+|  |  runtime filters: RF002 <- functional.alltypestiny.id
 |  |  limit: 10
 |  |
 |  |--02:SCAN HDFS [functional.alltypestiny]
@@ -399,7 +399,7 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [functional.alltypessmall a]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF001 -> a.id
+|     runtime filters: RF002 -> a.id
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
@@ -426,7 +426,7 @@ PLAN-ROOT SINK
 |  |
 |  03:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: a.id = functional.alltypestiny.id
-|  |  runtime filters: RF001 <- functional.alltypestiny.id
+|  |  runtime filters: RF002 <- functional.alltypestiny.id
 |  |  limit: 10
 |  |
 |  |--06:EXCHANGE [BROADCAST]
@@ -436,7 +436,7 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [functional.alltypessmall a]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF001 -> a.id
+|     runtime filters: RF002 -> a.id
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
@@ -470,14 +470,14 @@ PLAN-ROOT SINK
 |  |
 |  03:HASH JOIN [INNER JOIN]
 |  |  hash predicates: a.id = functional.alltypestiny.id
-|  |  runtime filters: RF001 <- functional.alltypestiny.id
+|  |  runtime filters: RF002 <- functional.alltypestiny.id
 |  |
 |  |--02:SCAN HDFS [functional.alltypestiny]
 |  |     partitions=4/4 files=4 size=460B
 |  |
 |  01:SCAN HDFS [functional.alltypessmall a]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF001 -> a.id
+|     runtime filters: RF002 -> a.id
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
@@ -508,7 +508,7 @@ PLAN-ROOT SINK
 |  |
 |  03:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: a.id = functional.alltypestiny.id
-|  |  runtime filters: RF001 <- functional.alltypestiny.id
+|  |  runtime filters: RF002 <- functional.alltypestiny.id
 |  |
 |  |--07:EXCHANGE [BROADCAST]
 |  |  |
@@ -517,7 +517,7 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [functional.alltypessmall a]
 |     partitions=4/4 files=4 size=6.32KB
-|     runtime filters: RF001 -> a.id
+|     runtime filters: RF002 -> a.id
 |
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB



[02/16] incubator-impala git commit: IMPALA-4252: Min-max runtime filters for Kudu

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-all.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-all.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-all.test
index f36751d..da2c745 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-all.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-all.test
@@ -137,7 +137,7 @@ PLAN-ROOT SINK
 |
 |--16:HASH JOIN [INNER JOIN]
 |  |  hash predicates: n_regionkey = r_regionkey
-|  |  runtime filters: RF005 <- r_regionkey
+|  |  runtime filters: RF010 <- r_regionkey
 |  |
 |  |--04:SCAN HDFS [tpch.region]
 |  |     partitions=1/1 files=1 size=384B
@@ -145,19 +145,19 @@ PLAN-ROOT SINK
 |  |
 |  15:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF006 <- n_nationkey
+|  |  runtime filters: RF012 <- n_nationkey
 |  |
 |  |--03:SCAN HDFS [tpch.nation]
 |  |     partitions=1/1 files=1 size=2.15KB
-|  |     runtime filters: RF005 -> n_regionkey
+|  |     runtime filters: RF010 -> n_regionkey
 |  |
 |  14:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_suppkey = ps_suppkey
-|  |  runtime filters: RF007 <- ps_suppkey
+|  |  runtime filters: RF014 <- ps_suppkey
 |  |
 |  |--13:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: ps_partkey = p_partkey
-|  |  |  runtime filters: RF008 <- p_partkey
+|  |  |  runtime filters: RF016 <- p_partkey
 |  |  |
 |  |  |--00:SCAN HDFS [tpch.part]
 |  |  |     partitions=1/1 files=1 size=22.83MB
@@ -165,11 +165,11 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpch.partsupp]
 |  |     partitions=1/1 files=1 size=112.71MB
-|  |     runtime filters: RF008 -> ps_partkey
+|  |     runtime filters: RF016 -> ps_partkey
 |  |
 |  01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF006 -> s_nationkey, RF007 -> s_suppkey
+|     runtime filters: RF012 -> s_nationkey, RF014 -> s_suppkey
 |
 12:AGGREGATE [FINALIZE]
 |  output: min(ps_supplycost)
@@ -177,7 +177,7 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [INNER JOIN]
 |  hash predicates: n_regionkey = r_regionkey
-|  runtime filters: RF002 <- r_regionkey
+|  runtime filters: RF004 <- r_regionkey
 |
 |--08:SCAN HDFS [tpch.region]
 |     partitions=1/1 files=1 size=384B
@@ -185,23 +185,23 @@ PLAN-ROOT SINK
 |
 10:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF003 <- n_nationkey
+|  runtime filters: RF006 <- n_nationkey
 |
 |--07:SCAN HDFS [tpch.nation]
 |     partitions=1/1 files=1 size=2.15KB
-|     runtime filters: RF002 -> n_regionkey
+|     runtime filters: RF004 -> n_regionkey
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: ps_suppkey = s_suppkey
-|  runtime filters: RF004 <- s_suppkey
+|  runtime filters: RF008 <- s_suppkey
 |
 |--06:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF003 -> s_nationkey
+|     runtime filters: RF006 -> s_nationkey
 |
 05:SCAN HDFS [tpch.partsupp]
    partitions=1/1 files=1 size=112.71MB
-   runtime filters: RF000 -> tpch.partsupp.ps_partkey, RF004 -> ps_suppkey
+   runtime filters: RF000 -> tpch.partsupp.ps_partkey, RF008 -> ps_suppkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -220,7 +220,7 @@ PLAN-ROOT SINK
 |  |
 |  16:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: n_regionkey = r_regionkey
-|  |  runtime filters: RF005 <- r_regionkey
+|  |  runtime filters: RF010 <- r_regionkey
 |  |
 |  |--27:EXCHANGE [BROADCAST]
 |  |  |
@@ -230,23 +230,23 @@ PLAN-ROOT SINK
 |  |
 |  15:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF006 <- n_nationkey
+|  |  runtime filters: RF012 <- n_nationkey
 |  |
 |  |--26:EXCHANGE [BROADCAST]
 |  |  |
 |  |  03:SCAN HDFS [tpch.nation]
 |  |     partitions=1/1 files=1 size=2.15KB
-|  |     runtime filters: RF005 -> n_regionkey
+|  |     runtime filters: RF010 -> n_regionkey
 |  |
 |  14:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_suppkey = ps_suppkey
-|  |  runtime filters: RF007 <- ps_suppkey
+|  |  runtime filters: RF014 <- ps_suppkey
 |  |
 |  |--25:EXCHANGE [BROADCAST]
 |  |  |
 |  |  13:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  |  hash predicates: ps_partkey = p_partkey
-|  |  |  runtime filters: RF008 <- p_partkey
+|  |  |  runtime filters: RF016 <- p_partkey
 |  |  |
 |  |  |--24:EXCHANGE [BROADCAST]
 |  |  |  |
@@ -256,11 +256,11 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpch.partsupp]
 |  |     partitions=1/1 files=1 size=112.71MB
-|  |     runtime filters: RF008 -> ps_partkey
+|  |     runtime filters: RF016 -> ps_partkey
 |  |
 |  01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF006 -> s_nationkey, RF007 -> s_suppkey
+|     runtime filters: RF012 -> s_nationkey, RF014 -> s_suppkey
 |
 28:EXCHANGE [HASH(ps_partkey,min(ps_supplycost))]
 |
@@ -276,7 +276,7 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: n_regionkey = r_regionkey
-|  runtime filters: RF002 <- r_regionkey
+|  runtime filters: RF004 <- r_regionkey
 |
 |--21:EXCHANGE [BROADCAST]
 |  |
@@ -286,27 +286,27 @@ PLAN-ROOT SINK
 |
 10:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF003 <- n_nationkey
+|  runtime filters: RF006 <- n_nationkey
 |
 |--20:EXCHANGE [BROADCAST]
 |  |
 |  07:SCAN HDFS [tpch.nation]
 |     partitions=1/1 files=1 size=2.15KB
-|     runtime filters: RF002 -> n_regionkey
+|     runtime filters: RF004 -> n_regionkey
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ps_suppkey = s_suppkey
-|  runtime filters: RF004 <- s_suppkey
+|  runtime filters: RF008 <- s_suppkey
 |
 |--19:EXCHANGE [BROADCAST]
 |  |
 |  06:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF003 -> s_nationkey
+|     runtime filters: RF006 -> s_nationkey
 |
 05:SCAN HDFS [tpch.partsupp]
    partitions=1/1 files=1 size=112.71MB
-   runtime filters: RF000 -> tpch.partsupp.ps_partkey, RF004 -> ps_suppkey
+   runtime filters: RF000 -> tpch.partsupp.ps_partkey, RF008 -> ps_suppkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -329,7 +329,7 @@ PLAN-ROOT SINK
 |  |
 |  16:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: n_regionkey = r_regionkey
-|  |  runtime filters: RF005 <- r_regionkey
+|  |  runtime filters: RF010 <- r_regionkey
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=01 plan-id=02 cohort-id=02
@@ -343,7 +343,7 @@ PLAN-ROOT SINK
 |  |
 |  15:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF006 <- n_nationkey
+|  |  runtime filters: RF012 <- n_nationkey
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=02 plan-id=03 cohort-id=02
@@ -353,11 +353,11 @@ PLAN-ROOT SINK
 |  |  |
 |  |  03:SCAN HDFS [tpch.nation]
 |  |     partitions=1/1 files=1 size=2.15KB
-|  |     runtime filters: RF005 -> n_regionkey
+|  |     runtime filters: RF010 -> n_regionkey
 |  |
 |  14:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_suppkey = ps_suppkey
-|  |  runtime filters: RF007 <- ps_suppkey
+|  |  runtime filters: RF014 <- ps_suppkey
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=03 plan-id=04 cohort-id=02
@@ -367,7 +367,7 @@ PLAN-ROOT SINK
 |  |  |
 |  |  13:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  |  hash predicates: ps_partkey = p_partkey
-|  |  |  runtime filters: RF008 <- p_partkey
+|  |  |  runtime filters: RF016 <- p_partkey
 |  |  |
 |  |  |--JOIN BUILD
 |  |  |  |  join-table-id=04 plan-id=05 cohort-id=03
@@ -381,11 +381,11 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpch.partsupp]
 |  |     partitions=1/1 files=1 size=112.71MB
-|  |     runtime filters: RF008 -> ps_partkey
+|  |     runtime filters: RF016 -> ps_partkey
 |  |
 |  01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF006 -> s_nationkey, RF007 -> s_suppkey
+|     runtime filters: RF012 -> s_nationkey, RF014 -> s_suppkey
 |
 28:EXCHANGE [HASH(ps_partkey,min(ps_supplycost))]
 |
@@ -401,7 +401,7 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: n_regionkey = r_regionkey
-|  runtime filters: RF002 <- r_regionkey
+|  runtime filters: RF004 <- r_regionkey
 |
 |--JOIN BUILD
 |  |  join-table-id=05 plan-id=06 cohort-id=01
@@ -415,7 +415,7 @@ PLAN-ROOT SINK
 |
 10:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF003 <- n_nationkey
+|  runtime filters: RF006 <- n_nationkey
 |
 |--JOIN BUILD
 |  |  join-table-id=06 plan-id=07 cohort-id=01
@@ -425,11 +425,11 @@ PLAN-ROOT SINK
 |  |
 |  07:SCAN HDFS [tpch.nation]
 |     partitions=1/1 files=1 size=2.15KB
-|     runtime filters: RF002 -> n_regionkey
+|     runtime filters: RF004 -> n_regionkey
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ps_suppkey = s_suppkey
-|  runtime filters: RF004 <- s_suppkey
+|  runtime filters: RF008 <- s_suppkey
 |
 |--JOIN BUILD
 |  |  join-table-id=07 plan-id=08 cohort-id=01
@@ -439,11 +439,11 @@ PLAN-ROOT SINK
 |  |
 |  06:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF003 -> s_nationkey
+|     runtime filters: RF006 -> s_nationkey
 |
 05:SCAN HDFS [tpch.partsupp]
    partitions=1/1 files=1 size=112.71MB
-   runtime filters: RF000 -> tpch.partsupp.ps_partkey, RF004 -> ps_suppkey
+   runtime filters: RF000 -> tpch.partsupp.ps_partkey, RF008 -> ps_suppkey
 ====
 # TPCH-Q3
 # Q3 - Shipping Priority Query
@@ -490,7 +490,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF001 <- o_orderkey
+|  runtime filters: RF002 <- o_orderkey
 |
 |--01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
@@ -500,7 +500,7 @@ PLAN-ROOT SINK
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate > '1995-03-15'
-   runtime filters: RF001 -> l_orderkey
+   runtime filters: RF002 -> l_orderkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -533,7 +533,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF001 <- o_orderkey
+|  runtime filters: RF002 <- o_orderkey
 |
 |--07:EXCHANGE [BROADCAST]
 |  |
@@ -545,7 +545,7 @@ PLAN-ROOT SINK
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate > '1995-03-15'
-   runtime filters: RF001 -> l_orderkey
+   runtime filters: RF002 -> l_orderkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -582,7 +582,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF001 <- o_orderkey
+|  runtime filters: RF002 <- o_orderkey
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -598,7 +598,7 @@ PLAN-ROOT SINK
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate > '1995-03-15'
-   runtime filters: RF001 -> l_orderkey
+   runtime filters: RF002 -> l_orderkey
 ====
 # TPCH-Q4
 # Q4 - Order Priority Checking Query
@@ -766,7 +766,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF001 <- n_nationkey
+|  runtime filters: RF002 <- n_nationkey
 |
 |--04:SCAN HDFS [tpch.nation]
 |     partitions=1/1 files=1 size=2.15KB
@@ -774,32 +774,32 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: c_nationkey = s_nationkey, l_suppkey = s_suppkey
-|  runtime filters: RF002 <- s_nationkey, RF003 <- s_suppkey
+|  runtime filters: RF004 <- s_nationkey, RF005 <- s_suppkey
 |
 |--03:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF004 <- c_custkey
+|  runtime filters: RF008 <- c_custkey
 |
 |--00:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
-|     runtime filters: RF001 -> tpch.customer.c_nationkey, RF002 -> c_nationkey
+|     runtime filters: RF002 -> tpch.customer.c_nationkey, RF004 -> c_nationkey
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF005 <- o_orderkey
+|  runtime filters: RF010 <- o_orderkey
 |
 |--01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate < '1995-01-01', o_orderdate >= '1994-01-01'
-|     runtime filters: RF004 -> o_custkey
+|     runtime filters: RF008 -> o_custkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF003 -> l_suppkey, RF005 -> l_orderkey
+   runtime filters: RF005 -> l_suppkey, RF010 -> l_orderkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -831,7 +831,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF001 <- n_nationkey
+|  runtime filters: RF002 <- n_nationkey
 |
 |--16:EXCHANGE [BROADCAST]
 |  |
@@ -841,38 +841,38 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c_nationkey = s_nationkey, l_suppkey = s_suppkey
-|  runtime filters: RF002 <- s_nationkey, RF003 <- s_suppkey
+|  runtime filters: RF004 <- s_nationkey, RF005 <- s_suppkey
 |
 |--15:EXCHANGE [BROADCAST]
 |  |
 |  03:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF004 <- c_custkey
+|  runtime filters: RF008 <- c_custkey
 |
 |--14:EXCHANGE [BROADCAST]
 |  |
 |  00:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
-|     runtime filters: RF001 -> tpch.customer.c_nationkey, RF002 -> c_nationkey
+|     runtime filters: RF002 -> tpch.customer.c_nationkey, RF004 -> c_nationkey
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF005 <- o_orderkey
+|  runtime filters: RF010 <- o_orderkey
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
 |  01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate < '1995-01-01', o_orderdate >= '1994-01-01'
-|     runtime filters: RF004 -> o_custkey
+|     runtime filters: RF008 -> o_custkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF003 -> l_suppkey, RF005 -> l_orderkey
+   runtime filters: RF005 -> l_suppkey, RF010 -> l_orderkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -908,7 +908,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF001 <- n_nationkey
+|  runtime filters: RF002 <- n_nationkey
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -922,7 +922,7 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c_nationkey = s_nationkey, l_suppkey = s_suppkey
-|  runtime filters: RF002 <- s_nationkey, RF003 <- s_suppkey
+|  runtime filters: RF004 <- s_nationkey, RF005 <- s_suppkey
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -932,11 +932,11 @@ PLAN-ROOT SINK
 |  |
 |  03:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF004 <- c_custkey
+|  runtime filters: RF008 <- c_custkey
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -946,11 +946,11 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
-|     runtime filters: RF001 -> tpch.customer.c_nationkey, RF002 -> c_nationkey
+|     runtime filters: RF002 -> tpch.customer.c_nationkey, RF004 -> c_nationkey
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF005 <- o_orderkey
+|  runtime filters: RF010 <- o_orderkey
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -961,11 +961,11 @@ PLAN-ROOT SINK
 |  01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate < '1995-01-01', o_orderdate >= '1994-01-01'
-|     runtime filters: RF004 -> o_custkey
+|     runtime filters: RF008 -> o_custkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF003 -> l_suppkey, RF005 -> l_orderkey
+   runtime filters: RF005 -> l_suppkey, RF010 -> l_orderkey
 ====
 # TPCH-Q6
 # Q6 - Forecasting Revenue Change Query
@@ -1076,14 +1076,14 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n1.n_nationkey
-|  runtime filters: RF001 <- n1.n_nationkey
+|  runtime filters: RF002 <- n1.n_nationkey
 |
 |--04:SCAN HDFS [tpch.nation n1]
 |     partitions=1/1 files=1 size=2.15KB
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF002 <- c_custkey
+|  runtime filters: RF004 <- c_custkey
 |
 |--03:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
@@ -1091,24 +1091,24 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: l_suppkey = s_suppkey
-|  runtime filters: RF003 <- s_suppkey
+|  runtime filters: RF006 <- s_suppkey
 |
 |--00:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF004 <- o_orderkey
+|  runtime filters: RF008 <- o_orderkey
 |
 |--02:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF002 -> o_custkey
+|     runtime filters: RF004 -> o_custkey
 |
 01:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate <= '1996-12-31', l_shipdate >= '1995-01-01'
-   runtime filters: RF003 -> l_suppkey, RF004 -> l_orderkey
+   runtime filters: RF006 -> l_suppkey, RF008 -> l_orderkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1140,7 +1140,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s_nationkey = n1.n_nationkey
-|  runtime filters: RF001 <- n1.n_nationkey
+|  runtime filters: RF002 <- n1.n_nationkey
 |
 |--17:EXCHANGE [BROADCAST]
 |  |
@@ -1149,7 +1149,7 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF002 <- c_custkey
+|  runtime filters: RF004 <- c_custkey
 |
 |--16:EXCHANGE [BROADCAST]
 |  |
@@ -1159,30 +1159,30 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_suppkey = s_suppkey
-|  runtime filters: RF003 <- s_suppkey
+|  runtime filters: RF006 <- s_suppkey
 |
 |--15:EXCHANGE [BROADCAST]
 |  |
 |  00:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 06:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF004 <- o_orderkey
+|  runtime filters: RF008 <- o_orderkey
 |
 |--14:EXCHANGE [HASH(o_orderkey)]
 |  |
 |  02:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF002 -> o_custkey
+|     runtime filters: RF004 -> o_custkey
 |
 13:EXCHANGE [HASH(l_orderkey)]
 |
 01:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate <= '1996-12-31', l_shipdate >= '1995-01-01'
-   runtime filters: RF003 -> l_suppkey, RF004 -> l_orderkey
+   runtime filters: RF006 -> l_suppkey, RF008 -> l_orderkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -1218,7 +1218,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s_nationkey = n1.n_nationkey
-|  runtime filters: RF001 <- n1.n_nationkey
+|  runtime filters: RF002 <- n1.n_nationkey
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -1231,7 +1231,7 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF002 <- c_custkey
+|  runtime filters: RF004 <- c_custkey
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -1245,7 +1245,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_suppkey = s_suppkey
-|  runtime filters: RF003 <- s_suppkey
+|  runtime filters: RF006 <- s_suppkey
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -1255,11 +1255,11 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 06:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF004 <- o_orderkey
+|  runtime filters: RF008 <- o_orderkey
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -1269,14 +1269,14 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF002 -> o_custkey
+|     runtime filters: RF004 -> o_custkey
 |
 13:EXCHANGE [HASH(l_orderkey)]
 |
 01:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate <= '1996-12-31', l_shipdate >= '1995-01-01'
-   runtime filters: RF003 -> l_suppkey, RF004 -> l_orderkey
+   runtime filters: RF006 -> l_suppkey, RF008 -> l_orderkey
 ====
 # TPCH-Q8
 # Q8 - National Market Share Query
@@ -1336,7 +1336,7 @@ PLAN-ROOT SINK
 |
 13:HASH JOIN [INNER JOIN]
 |  hash predicates: n1.n_regionkey = r_regionkey
-|  runtime filters: RF001 <- r_regionkey
+|  runtime filters: RF002 <- r_regionkey
 |
 |--07:SCAN HDFS [tpch.region]
 |     partitions=1/1 files=1 size=384B
@@ -1344,19 +1344,19 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN]
 |  hash predicates: c_nationkey = n1.n_nationkey
-|  runtime filters: RF002 <- n1.n_nationkey
+|  runtime filters: RF004 <- n1.n_nationkey
 |
 |--05:SCAN HDFS [tpch.nation n1]
 |     partitions=1/1 files=1 size=2.15KB
-|     runtime filters: RF001 -> n1.n_regionkey
+|     runtime filters: RF002 -> n1.n_regionkey
 |
 11:HASH JOIN [INNER JOIN]
 |  hash predicates: c_custkey = o_custkey
-|  runtime filters: RF003 <- o_custkey
+|  runtime filters: RF006 <- o_custkey
 |
 |--10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: l_suppkey = s_suppkey
-|  |  runtime filters: RF004 <- s_suppkey
+|  |  runtime filters: RF008 <- s_suppkey
 |  |
 |  |--01:SCAN HDFS [tpch.supplier]
 |  |     partitions=1/1 files=1 size=1.33MB
@@ -1364,11 +1364,11 @@ PLAN-ROOT SINK
 |  |
 |  09:HASH JOIN [INNER JOIN]
 |  |  hash predicates: o_orderkey = l_orderkey
-|  |  runtime filters: RF005 <- l_orderkey
+|  |  runtime filters: RF010 <- l_orderkey
 |  |
 |  |--08:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: l_partkey = p_partkey
-|  |  |  runtime filters: RF006 <- p_partkey
+|  |  |  runtime filters: RF012 <- p_partkey
 |  |  |
 |  |  |--00:SCAN HDFS [tpch.part]
 |  |  |     partitions=1/1 files=1 size=22.83MB
@@ -1376,16 +1376,16 @@ PLAN-ROOT SINK
 |  |  |
 |  |  02:SCAN HDFS [tpch.lineitem]
 |  |     partitions=1/1 files=1 size=718.94MB
-|  |     runtime filters: RF004 -> l_suppkey, RF006 -> l_partkey
+|  |     runtime filters: RF008 -> l_suppkey, RF012 -> l_partkey
 |  |
 |  03:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate <= '1996-12-31', o_orderdate >= '1995-01-01'
-|     runtime filters: RF005 -> o_orderkey
+|     runtime filters: RF010 -> o_orderkey
 |
 04:SCAN HDFS [tpch.customer]
    partitions=1/1 files=1 size=23.08MB
-   runtime filters: RF002 -> c_nationkey, RF003 -> c_custkey
+   runtime filters: RF004 -> c_nationkey, RF006 -> c_custkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1416,7 +1416,7 @@ PLAN-ROOT SINK
 |
 13:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: n1.n_regionkey = r_regionkey
-|  runtime filters: RF001 <- r_regionkey
+|  runtime filters: RF002 <- r_regionkey
 |
 |--25:EXCHANGE [BROADCAST]
 |  |
@@ -1426,29 +1426,29 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c_nationkey = n1.n_nationkey
-|  runtime filters: RF002 <- n1.n_nationkey
+|  runtime filters: RF004 <- n1.n_nationkey
 |
 |--24:EXCHANGE [BROADCAST]
 |  |
 |  05:SCAN HDFS [tpch.nation n1]
 |     partitions=1/1 files=1 size=2.15KB
-|     runtime filters: RF001 -> n1.n_regionkey
+|     runtime filters: RF002 -> n1.n_regionkey
 |
 11:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF003 <- c_custkey
+|  runtime filters: RF006 <- c_custkey
 |
 |--23:EXCHANGE [HASH(c_custkey)]
 |  |
 |  04:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
-|     runtime filters: RF002 -> c_nationkey
+|     runtime filters: RF004 -> c_nationkey
 |
 22:EXCHANGE [HASH(o_custkey)]
 |
 10:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_suppkey = s_suppkey
-|  runtime filters: RF004 <- s_suppkey
+|  runtime filters: RF008 <- s_suppkey
 |
 |--21:EXCHANGE [HASH(s_suppkey)]
 |  |
@@ -1460,20 +1460,20 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF005 <- o_orderkey
+|  runtime filters: RF010 <- o_orderkey
 |
 |--19:EXCHANGE [HASH(o_orderkey)]
 |  |
 |  03:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate <= '1996-12-31', o_orderdate >= '1995-01-01'
-|     runtime filters: RF003 -> o_custkey
+|     runtime filters: RF006 -> o_custkey
 |
 18:EXCHANGE [HASH(l_orderkey)]
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_partkey = p_partkey
-|  runtime filters: RF006 <- p_partkey
+|  runtime filters: RF012 <- p_partkey
 |
 |--17:EXCHANGE [BROADCAST]
 |  |
@@ -1483,7 +1483,7 @@ PLAN-ROOT SINK
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF004 -> l_suppkey, RF005 -> l_orderkey, RF006 -> l_partkey
+   runtime filters: RF008 -> l_suppkey, RF010 -> l_orderkey, RF012 -> l_partkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -1518,7 +1518,7 @@ PLAN-ROOT SINK
 |
 13:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: n1.n_regionkey = r_regionkey
-|  runtime filters: RF001 <- r_regionkey
+|  runtime filters: RF002 <- r_regionkey
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -1532,7 +1532,7 @@ PLAN-ROOT SINK
 |
 12:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c_nationkey = n1.n_nationkey
-|  runtime filters: RF002 <- n1.n_nationkey
+|  runtime filters: RF004 <- n1.n_nationkey
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -1542,11 +1542,11 @@ PLAN-ROOT SINK
 |  |
 |  05:SCAN HDFS [tpch.nation n1]
 |     partitions=1/1 files=1 size=2.15KB
-|     runtime filters: RF001 -> n1.n_regionkey
+|     runtime filters: RF002 -> n1.n_regionkey
 |
 11:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF003 <- c_custkey
+|  runtime filters: RF006 <- c_custkey
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -1556,13 +1556,13 @@ PLAN-ROOT SINK
 |  |
 |  04:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
-|     runtime filters: RF002 -> c_nationkey
+|     runtime filters: RF004 -> c_nationkey
 |
 22:EXCHANGE [HASH(o_custkey)]
 |
 10:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_suppkey = s_suppkey
-|  runtime filters: RF004 <- s_suppkey
+|  runtime filters: RF008 <- s_suppkey
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -1578,7 +1578,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF005 <- o_orderkey
+|  runtime filters: RF010 <- o_orderkey
 |
 |--JOIN BUILD
 |  |  join-table-id=05 plan-id=06 cohort-id=01
@@ -1589,13 +1589,13 @@ PLAN-ROOT SINK
 |  03:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate <= '1996-12-31', o_orderdate >= '1995-01-01'
-|     runtime filters: RF003 -> o_custkey
+|     runtime filters: RF006 -> o_custkey
 |
 18:EXCHANGE [HASH(l_orderkey)]
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_partkey = p_partkey
-|  runtime filters: RF006 <- p_partkey
+|  runtime filters: RF012 <- p_partkey
 |
 |--JOIN BUILD
 |  |  join-table-id=06 plan-id=07 cohort-id=01
@@ -1609,7 +1609,7 @@ PLAN-ROOT SINK
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF004 -> l_suppkey, RF005 -> l_orderkey, RF006 -> l_partkey
+   runtime filters: RF008 -> l_suppkey, RF010 -> l_orderkey, RF012 -> l_partkey
 ====
 # TPCH-Q9
 # Q9 - Product Type Measure Query
@@ -1663,38 +1663,38 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: l_partkey = ps_partkey, l_suppkey = ps_suppkey
-|  runtime filters: RF001 <- ps_partkey, RF002 <- ps_suppkey
+|  runtime filters: RF002 <- ps_partkey, RF003 <- ps_suppkey
 |
 |--03:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: l_suppkey = s_suppkey
-|  runtime filters: RF003 <- s_suppkey
+|  runtime filters: RF006 <- s_suppkey
 |
 |--01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF000 -> s_nationkey, RF002 -> tpch.supplier.s_suppkey
+|     runtime filters: RF000 -> s_nationkey, RF003 -> tpch.supplier.s_suppkey
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF004 <- o_orderkey
+|  runtime filters: RF008 <- o_orderkey
 |
 |--04:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: l_partkey = p_partkey
-|  runtime filters: RF005 <- p_partkey
+|  runtime filters: RF010 <- p_partkey
 |
 |--00:SCAN HDFS [tpch.part]
 |     partitions=1/1 files=1 size=22.83MB
 |     predicates: p_name LIKE '%green%'
-|     runtime filters: RF001 -> tpch.part.p_partkey
+|     runtime filters: RF002 -> tpch.part.p_partkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF001 -> l_partkey, RF002 -> l_suppkey, RF003 -> l_suppkey, RF004 -> l_orderkey, RF005 -> l_partkey
+   runtime filters: RF002 -> l_partkey, RF003 -> l_suppkey, RF006 -> l_suppkey, RF008 -> l_orderkey, RF010 -> l_partkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1725,7 +1725,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_partkey = ps_partkey, l_suppkey = ps_suppkey
-|  runtime filters: RF001 <- ps_partkey, RF002 <- ps_suppkey
+|  runtime filters: RF002 <- ps_partkey, RF003 <- ps_suppkey
 |
 |--17:EXCHANGE [BROADCAST]
 |  |
@@ -1734,17 +1734,17 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_suppkey = s_suppkey
-|  runtime filters: RF003 <- s_suppkey
+|  runtime filters: RF006 <- s_suppkey
 |
 |--16:EXCHANGE [BROADCAST]
 |  |
 |  01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF000 -> s_nationkey, RF002 -> tpch.supplier.s_suppkey
+|     runtime filters: RF000 -> s_nationkey, RF003 -> tpch.supplier.s_suppkey
 |
 07:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF004 <- o_orderkey
+|  runtime filters: RF008 <- o_orderkey
 |
 |--15:EXCHANGE [HASH(o_orderkey)]
 |  |
@@ -1755,18 +1755,18 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_partkey = p_partkey
-|  runtime filters: RF005 <- p_partkey
+|  runtime filters: RF010 <- p_partkey
 |
 |--13:EXCHANGE [BROADCAST]
 |  |
 |  00:SCAN HDFS [tpch.part]
 |     partitions=1/1 files=1 size=22.83MB
 |     predicates: p_name LIKE '%green%'
-|     runtime filters: RF001 -> tpch.part.p_partkey
+|     runtime filters: RF002 -> tpch.part.p_partkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF001 -> l_partkey, RF002 -> l_suppkey, RF003 -> l_suppkey, RF004 -> l_orderkey, RF005 -> l_partkey
+   runtime filters: RF002 -> l_partkey, RF003 -> l_suppkey, RF006 -> l_suppkey, RF008 -> l_orderkey, RF010 -> l_partkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -1801,7 +1801,7 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_partkey = ps_partkey, l_suppkey = ps_suppkey
-|  runtime filters: RF001 <- ps_partkey, RF002 <- ps_suppkey
+|  runtime filters: RF002 <- ps_partkey, RF003 <- ps_suppkey
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -1814,7 +1814,7 @@ PLAN-ROOT SINK
 |
 08:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_suppkey = s_suppkey
-|  runtime filters: RF003 <- s_suppkey
+|  runtime filters: RF006 <- s_suppkey
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -1824,11 +1824,11 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF000 -> s_nationkey, RF002 -> tpch.supplier.s_suppkey
+|     runtime filters: RF000 -> s_nationkey, RF003 -> tpch.supplier.s_suppkey
 |
 07:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF004 <- o_orderkey
+|  runtime filters: RF008 <- o_orderkey
 |
 |--JOIN BUILD
 |  |  join-table-id=03 plan-id=04 cohort-id=01
@@ -1843,7 +1843,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_partkey = p_partkey
-|  runtime filters: RF005 <- p_partkey
+|  runtime filters: RF010 <- p_partkey
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -1854,11 +1854,11 @@ PLAN-ROOT SINK
 |  00:SCAN HDFS [tpch.part]
 |     partitions=1/1 files=1 size=22.83MB
 |     predicates: p_name LIKE '%green%'
-|     runtime filters: RF001 -> tpch.part.p_partkey
+|     runtime filters: RF002 -> tpch.part.p_partkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF001 -> l_partkey, RF002 -> l_suppkey, RF003 -> l_suppkey, RF004 -> l_orderkey, RF005 -> l_partkey
+   runtime filters: RF002 -> l_partkey, RF003 -> l_suppkey, RF006 -> l_suppkey, RF008 -> l_orderkey, RF010 -> l_partkey
 ====
 # TPCH-Q10
 # Q10 - Returned Item Reporting Query
@@ -1914,11 +1914,11 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: c_custkey = o_custkey
-|  runtime filters: RF001 <- o_custkey
+|  runtime filters: RF002 <- o_custkey
 |
 |--04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: l_orderkey = o_orderkey
-|  |  runtime filters: RF002 <- o_orderkey
+|  |  runtime filters: RF004 <- o_orderkey
 |  |
 |  |--01:SCAN HDFS [tpch.orders]
 |  |     partitions=1/1 files=1 size=162.56MB
@@ -1927,11 +1927,11 @@ PLAN-ROOT SINK
 |  02:SCAN HDFS [tpch.lineitem]
 |     partitions=1/1 files=1 size=718.94MB
 |     predicates: l_returnflag = 'R'
-|     runtime filters: RF002 -> l_orderkey
+|     runtime filters: RF004 -> l_orderkey
 |
 00:SCAN HDFS [tpch.customer]
    partitions=1/1 files=1 size=23.08MB
-   runtime filters: RF000 -> c_nationkey, RF001 -> c_custkey
+   runtime filters: RF000 -> c_nationkey, RF002 -> c_custkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1963,7 +1963,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002 <- c_custkey
 |
 |--11:EXCHANGE [HASH(c_custkey)]
 |  |
@@ -1975,19 +1975,19 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF002 <- o_orderkey
+|  runtime filters: RF004 <- o_orderkey
 |
 |--09:EXCHANGE [BROADCAST]
 |  |
 |  01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate < '1994-01-01', o_orderdate >= '1993-10-01'
-|     runtime filters: RF001 -> o_custkey
+|     runtime filters: RF002 -> o_custkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_returnflag = 'R'
-   runtime filters: RF002 -> l_orderkey
+   runtime filters: RF004 -> l_orderkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -2023,7 +2023,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002 <- c_custkey
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -2039,7 +2039,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF002 <- o_orderkey
+|  runtime filters: RF004 <- o_orderkey
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -2050,12 +2050,12 @@ PLAN-ROOT SINK
 |  01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
 |     predicates: o_orderdate < '1994-01-01', o_orderdate >= '1993-10-01'
-|     runtime filters: RF001 -> o_custkey
+|     runtime filters: RF002 -> o_custkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_returnflag = 'R'
-   runtime filters: RF002 -> l_orderkey
+   runtime filters: RF004 -> l_orderkey
 ====
 # TPCH-Q11
 # Q11 - Important Stock Identification
@@ -2106,7 +2106,7 @@ PLAN-ROOT SINK
 |  |
 |  10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF002 <- n_nationkey
+|  |  runtime filters: RF004 <- n_nationkey
 |  |
 |  |--08:SCAN HDFS [tpch.nation]
 |  |     partitions=1/1 files=1 size=2.15KB
@@ -2114,15 +2114,15 @@ PLAN-ROOT SINK
 |  |
 |  09:HASH JOIN [INNER JOIN]
 |  |  hash predicates: ps_suppkey = s_suppkey
-|  |  runtime filters: RF003 <- s_suppkey
+|  |  runtime filters: RF006 <- s_suppkey
 |  |
 |  |--07:SCAN HDFS [tpch.supplier]
 |  |     partitions=1/1 files=1 size=1.33MB
-|  |     runtime filters: RF002 -> s_nationkey
+|  |     runtime filters: RF004 -> s_nationkey
 |  |
 |  06:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF003 -> ps_suppkey
+|     runtime filters: RF006 -> ps_suppkey
 |
 05:AGGREGATE [FINALIZE]
 |  output: sum(ps_supplycost * ps_availqty)
@@ -2138,7 +2138,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: ps_suppkey = s_suppkey
-|  runtime filters: RF001 <- s_suppkey
+|  runtime filters: RF002 <- s_suppkey
 |
 |--01:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
@@ -2146,7 +2146,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch.partsupp]
    partitions=1/1 files=1 size=112.71MB
-   runtime filters: RF001 -> ps_suppkey
+   runtime filters: RF002 -> ps_suppkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -2171,7 +2171,7 @@ PLAN-ROOT SINK
 |  |
 |  10:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF002 <- n_nationkey
+|  |  runtime filters: RF004 <- n_nationkey
 |  |
 |  |--19:EXCHANGE [BROADCAST]
 |  |  |
@@ -2181,17 +2181,17 @@ PLAN-ROOT SINK
 |  |
 |  09:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: ps_suppkey = s_suppkey
-|  |  runtime filters: RF003 <- s_suppkey
+|  |  runtime filters: RF006 <- s_suppkey
 |  |
 |  |--18:EXCHANGE [BROADCAST]
 |  |  |
 |  |  07:SCAN HDFS [tpch.supplier]
 |  |     partitions=1/1 files=1 size=1.33MB
-|  |     runtime filters: RF002 -> s_nationkey
+|  |     runtime filters: RF004 -> s_nationkey
 |  |
 |  06:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF003 -> ps_suppkey
+|     runtime filters: RF006 -> ps_suppkey
 |
 17:AGGREGATE [FINALIZE]
 |  output: sum:merge(ps_supplycost * ps_availqty)
@@ -2215,7 +2215,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ps_suppkey = s_suppkey
-|  runtime filters: RF001 <- s_suppkey
+|  runtime filters: RF002 <- s_suppkey
 |
 |--14:EXCHANGE [BROADCAST]
 |  |
@@ -2225,7 +2225,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch.partsupp]
    partitions=1/1 files=1 size=112.71MB
-   runtime filters: RF001 -> ps_suppkey
+   runtime filters: RF002 -> ps_suppkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -2254,7 +2254,7 @@ PLAN-ROOT SINK
 |  |
 |  10:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF002 <- n_nationkey
+|  |  runtime filters: RF004 <- n_nationkey
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=01 plan-id=02 cohort-id=02
@@ -2268,7 +2268,7 @@ PLAN-ROOT SINK
 |  |
 |  09:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: ps_suppkey = s_suppkey
-|  |  runtime filters: RF003 <- s_suppkey
+|  |  runtime filters: RF006 <- s_suppkey
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=02 plan-id=03 cohort-id=02
@@ -2278,11 +2278,11 @@ PLAN-ROOT SINK
 |  |  |
 |  |  07:SCAN HDFS [tpch.supplier]
 |  |     partitions=1/1 files=1 size=1.33MB
-|  |     runtime filters: RF002 -> s_nationkey
+|  |     runtime filters: RF004 -> s_nationkey
 |  |
 |  06:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF003 -> ps_suppkey
+|     runtime filters: RF006 -> ps_suppkey
 |
 17:AGGREGATE [FINALIZE]
 |  output: sum:merge(ps_supplycost * ps_availqty)
@@ -2310,7 +2310,7 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: ps_suppkey = s_suppkey
-|  runtime filters: RF001 <- s_suppkey
+|  runtime filters: RF002 <- s_suppkey
 |
 |--JOIN BUILD
 |  |  join-table-id=04 plan-id=05 cohort-id=01
@@ -2324,7 +2324,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch.partsupp]
    partitions=1/1 files=1 size=112.71MB
-   runtime filters: RF001 -> ps_suppkey
+   runtime filters: RF002 -> ps_suppkey
 ====
 # TPCH-Q12
 # Q12 - Shipping Mode and Order Priority Query
@@ -3063,7 +3063,7 @@ PLAN-ROOT SINK
 |
 |--04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: l_partkey = p_partkey
-|  |  runtime filters: RF001 <- p_partkey
+|  |  runtime filters: RF002 <- p_partkey
 |  |
 |  |--01:SCAN HDFS [tpch.part]
 |  |     partitions=1/1 files=1 size=22.83MB
@@ -3071,7 +3071,7 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.lineitem]
 |     partitions=1/1 files=1 size=718.94MB
-|     runtime filters: RF001 -> l_partkey
+|     runtime filters: RF002 -> l_partkey
 |
 03:AGGREGATE [FINALIZE]
 |  output: avg(l_quantity)
@@ -3100,7 +3100,7 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: l_partkey = p_partkey
-|  |  runtime filters: RF001 <- p_partkey
+|  |  runtime filters: RF002 <- p_partkey
 |  |
 |  |--09:EXCHANGE [BROADCAST]
 |  |  |
@@ -3110,7 +3110,7 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.lineitem]
 |     partitions=1/1 files=1 size=718.94MB
-|     runtime filters: RF001 -> l_partkey
+|     runtime filters: RF002 -> l_partkey
 |
 08:AGGREGATE [FINALIZE]
 |  output: avg:merge(l_quantity)
@@ -3149,7 +3149,7 @@ PLAN-ROOT SINK
 |  |
 |  04:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: l_partkey = p_partkey
-|  |  runtime filters: RF001 <- p_partkey
+|  |  runtime filters: RF002 <- p_partkey
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=01 plan-id=02 cohort-id=02
@@ -3163,7 +3163,7 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.lineitem]
 |     partitions=1/1 files=1 size=718.94MB
-|     runtime filters: RF001 -> l_partkey
+|     runtime filters: RF002 -> l_partkey
 |
 08:AGGREGATE [FINALIZE]
 |  output: avg:merge(l_quantity)
@@ -3239,22 +3239,22 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002 <- c_custkey
 |
 |--00:SCAN HDFS [tpch.customer]
 |     partitions=1/1 files=1 size=23.08MB
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF002 <- o_orderkey
+|  runtime filters: RF004 <- o_orderkey
 |
 |--01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF000 -> o_orderkey, RF001 -> o_custkey
+|     runtime filters: RF000 -> o_orderkey, RF002 -> o_custkey
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF002 -> l_orderkey
+   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF004 -> l_orderkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -3295,7 +3295,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002 <- c_custkey
 |
 |--12:EXCHANGE [BROADCAST]
 |  |
@@ -3304,19 +3304,19 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF002 <- o_orderkey
+|  runtime filters: RF004 <- o_orderkey
 |
 |--11:EXCHANGE [HASH(o_orderkey)]
 |  |
 |  01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF000 -> o_orderkey, RF001 -> o_custkey
+|     runtime filters: RF000 -> o_orderkey, RF002 -> o_custkey
 |
 10:EXCHANGE [HASH(l_orderkey)]
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF002 -> l_orderkey
+   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF004 -> l_orderkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -3361,7 +3361,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
-|  runtime filters: RF001 <- c_custkey
+|  runtime filters: RF002 <- c_custkey
 |
 |--JOIN BUILD
 |  |  join-table-id=01 plan-id=02 cohort-id=01
@@ -3374,7 +3374,7 @@ PLAN-ROOT SINK
 |
 05:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
-|  runtime filters: RF002 <- o_orderkey
+|  runtime filters: RF004 <- o_orderkey
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -3384,13 +3384,13 @@ PLAN-ROOT SINK
 |  |
 |  01:SCAN HDFS [tpch.orders]
 |     partitions=1/1 files=1 size=162.56MB
-|     runtime filters: RF000 -> o_orderkey, RF001 -> o_custkey
+|     runtime filters: RF000 -> o_orderkey, RF002 -> o_custkey
 |
 10:EXCHANGE [HASH(l_orderkey)]
 |
 02:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
-   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF002 -> l_orderkey
+   runtime filters: RF000 -> tpch.lineitem.l_orderkey, RF004 -> l_orderkey
 ====
 # TPCH-Q19
 # Q19 - Discounted Revenue Query
@@ -3556,7 +3556,7 @@ PLAN-ROOT SINK
 |
 |--08:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF004 <- n_nationkey
+|  |  runtime filters: RF008 <- n_nationkey
 |  |
 |  |--01:SCAN HDFS [tpch.nation]
 |  |     partitions=1/1 files=1 size=2.15KB
@@ -3564,16 +3564,16 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF004 -> s_nationkey
+|     runtime filters: RF008 -> s_nationkey
 |
 07:HASH JOIN [RIGHT SEMI JOIN]
 |  hash predicates: l_partkey = ps_partkey, l_suppkey = ps_suppkey
 |  other join predicates: ps_availqty > 0.5 * sum(l_quantity)
-|  runtime filters: RF001 <- ps_partkey, RF002 <- ps_suppkey
+|  runtime filters: RF002 <- ps_partkey, RF003 <- ps_suppkey
 |
 |--06:HASH JOIN [LEFT SEMI JOIN]
 |  |  hash predicates: ps_partkey = p_partkey
-|  |  runtime filters: RF003 <- p_partkey
+|  |  runtime filters: RF006 <- p_partkey
 |  |
 |  |--03:SCAN HDFS [tpch.part]
 |  |     partitions=1/1 files=1 size=22.83MB
@@ -3581,7 +3581,7 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF000 -> ps_suppkey, RF003 -> ps_partkey
+|     runtime filters: RF000 -> ps_suppkey, RF006 -> ps_partkey
 |
 05:AGGREGATE [FINALIZE]
 |  output: sum(l_quantity)
@@ -3590,7 +3590,7 @@ PLAN-ROOT SINK
 04:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate < '1995-01-01', l_shipdate >= '1994-01-01'
-   runtime filters: RF000 -> tpch.lineitem.l_suppkey, RF001 -> tpch.lineitem.l_partkey, RF002 -> tpch.lineitem.l_suppkey
+   runtime filters: RF000 -> tpch.lineitem.l_suppkey, RF002 -> tpch.lineitem.l_partkey, RF003 -> tpch.lineitem.l_suppkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -3608,7 +3608,7 @@ PLAN-ROOT SINK
 |  |
 |  08:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF004 <- n_nationkey
+|  |  runtime filters: RF008 <- n_nationkey
 |  |
 |  |--15:EXCHANGE [BROADCAST]
 |  |  |
@@ -3618,20 +3618,20 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF004 -> s_nationkey
+|     runtime filters: RF008 -> s_nationkey
 |
 16:EXCHANGE [HASH(ps_suppkey)]
 |
 07:HASH JOIN [RIGHT SEMI JOIN, PARTITIONED]
 |  hash predicates: l_partkey = ps_partkey, l_suppkey = ps_suppkey
 |  other join predicates: ps_availqty > 0.5 * sum(l_quantity)
-|  runtime filters: RF001 <- ps_partkey, RF002 <- ps_suppkey
+|  runtime filters: RF002 <- ps_partkey, RF003 <- ps_suppkey
 |
 |--14:EXCHANGE [HASH(ps_partkey,ps_suppkey)]
 |  |
 |  06:HASH JOIN [LEFT SEMI JOIN, BROADCAST]
 |  |  hash predicates: ps_partkey = p_partkey
-|  |  runtime filters: RF003 <- p_partkey
+|  |  runtime filters: RF006 <- p_partkey
 |  |
 |  |--13:EXCHANGE [BROADCAST]
 |  |  |
@@ -3641,7 +3641,7 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF000 -> ps_suppkey, RF003 -> ps_partkey
+|     runtime filters: RF000 -> ps_suppkey, RF006 -> ps_partkey
 |
 12:AGGREGATE [FINALIZE]
 |  output: sum:merge(l_quantity)
@@ -3656,7 +3656,7 @@ PLAN-ROOT SINK
 04:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate < '1995-01-01', l_shipdate >= '1994-01-01'
-   runtime filters: RF000 -> tpch.lineitem.l_suppkey, RF001 -> tpch.lineitem.l_partkey, RF002 -> tpch.lineitem.l_suppkey
+   runtime filters: RF000 -> tpch.lineitem.l_suppkey, RF002 -> tpch.lineitem.l_partkey, RF003 -> tpch.lineitem.l_suppkey
 ---- PARALLELPLANS
 PLAN-ROOT SINK
 |
@@ -3678,7 +3678,7 @@ PLAN-ROOT SINK
 |  |
 |  08:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF004 <- n_nationkey
+|  |  runtime filters: RF008 <- n_nationkey
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=01 plan-id=02 cohort-id=02
@@ -3692,14 +3692,14 @@ PLAN-ROOT SINK
 |  |
 |  00:SCAN HDFS [tpch.supplier]
 |     partitions=1/1 files=1 size=1.33MB
-|     runtime filters: RF004 -> s_nationkey
+|     runtime filters: RF008 -> s_nationkey
 |
 16:EXCHANGE [HASH(ps_suppkey)]
 |
 07:HASH JOIN [RIGHT SEMI JOIN, PARTITIONED]
 |  hash predicates: l_partkey = ps_partkey, l_suppkey = ps_suppkey
 |  other join predicates: ps_availqty > 0.5 * sum(l_quantity)
-|  runtime filters: RF001 <- ps_partkey, RF002 <- ps_suppkey
+|  runtime filters: RF002 <- ps_partkey, RF003 <- ps_suppkey
 |
 |--JOIN BUILD
 |  |  join-table-id=02 plan-id=03 cohort-id=01
@@ -3709,7 +3709,7 @@ PLAN-ROOT SINK
 |  |
 |  06:HASH JOIN [LEFT SEMI JOIN, BROADCAST]
 |  |  hash predicates: ps_partkey = p_partkey
-|  |  runtime filters: RF003 <- p_partkey
+|  |  runtime filters: RF006 <- p_partkey
 |  |
 |  |--JOIN BUILD
 |  |  |  join-table-id=03 plan-id=04 cohort-id=03
@@ -3723,7 +3723,7 @@ PLAN-ROOT SINK
 |  |
 |  02:SCAN HDFS [tpch.partsupp]
 |     partitions=1/1 files=1 size=112.71MB
-|     runtime filters: RF000 -> ps_suppkey, RF003 -> ps_partkey
+|     runtime filters: RF000 -> ps_suppkey, RF006 -> ps_partkey
 |
 12:AGGREGATE [FINALIZE]
 |  output: sum:merge(l_quantity)
@@ -3738,7 +3738,7 @@ PLAN-ROOT SINK
 04:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    predicates: l_shipdate < '1995-01-01', l_shipdate >= '1994-01-01'
-   runtime filters: RF000 -> tpch.lineitem.l_suppkey, RF001 -> tpch.lineitem.l_partkey, RF002 -> tpch.lineitem.l_suppkey
+   runtime filters: RF000 -> tpch.lineitem.l_suppkey, RF002 -> tpch.lineitem.l_partkey, RF003 -> tpch.lineitem.l_suppkey
 ====
 # TPCH-Q21
 # Q21 - Suppliers Who Kept Orders Waiting Query
@@ -3803,7 +3803,7 @@ PLAN-ROOT SINK
 |  |
 |  |--08:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: s_nationkey = n_nationkey
-|  |  |  runtime filters: RF001 <- n_nationkey
+|  |  |  runtime filters: RF002 <- n_nationkey
 |  |  |
 |  |  |--03:SCAN HDFS [tpch.nation]
 |  |  |     partitions=1/1 files=1 size=2.15KB
@@ -3811,15 +3811,15 @@ PLAN-ROOT SINK
 |  |  |
 |  |  07:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: l1.l_suppkey = s_suppkey
-|  |  |  runtime filters: RF002 <- s_suppkey
+|  |  |  runtime filters: RF004 <- s_suppkey
 |  |  |
 |  |  |--00:SCAN HDFS [tpch.supplier]
 |  |  |     partitions=1/1 files=1 size=1.33MB
-|  |  |     runtime filters: RF001 -> s_nationkey
+|  |  |     runtime filters: RF002 -> s_nationkey
 |  |  |
 |  |  06:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: l1.l_orderkey = o_orderkey
-|  |  |  runtime filters: RF003 <- o_orderkey
+|  |  |  runtime filters: RF006 <- o_orderkey
 |  |  |
 |  |  |--02:SCAN HDFS [tpch.orders]
 |  |  |     partitions=1/1 files=1 size=162.56MB
@@ -3828,7 +3828,7 @@ PLAN-ROOT SINK
 |  |  01:SCAN HDFS [tpch.lineitem l1]
 |  |     partitions=1/1 files=1 size=718.94MB
 |  |     predicates: l1.l_receiptdate > l1.l_commitdate
-|  |     runtime filters: RF002 -> l1.l_suppkey, RF003 -> l1.l_orderkey
+|  |     runtime filters: RF004 -> l1.l_suppkey, RF006 -> l1.l_orderkey
 |  |
 |  04:SCAN HDFS [tpch.lineitem l2]
 |     partitions=1/1 files=1 size=718.94MB
@@ -3868,7 +3868,7 @@ PLAN-ROOT SINK
 |  |
 |  |--08:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  |  hash predicates: s_nationkey = n_nationkey
-|  |  |  runtime filters: RF001 <- n_nationkey
+|  |  |  runtime filters: RF002 <- n_nationkey
 |  |  |
 |  |  |--16:EXCHANGE [BROADCAST]
 |  |  |  |
@@ -3878,17 +3878,17 @@ PLAN-ROOT SINK
 |  |  |
 |  |  07:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  |  hash predicates: l1.l_suppkey = s_suppkey
-|  |  |  runtime filters: RF002 <- s_suppkey
+|  |  |  runtime filters: RF004 <- s_suppkey
 |  |  |
 |  |  |--15:EXCHANGE [BROADCAST]
 |  |  |  |
 |  |  |  00:SCAN HDFS [tpch.supplier]
 |  |  |     partitions=1/1 files=1 size=1.33MB
-|  |  |     runtime filters: RF001 -> s_nationkey
+|  |  |     runtime filters: RF002 -> s_nationkey
 |  |  |
 |  |  06:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  |  hash predicates: l1.l_orderkey = o_orderkey
-|  |  |  runtime filters: RF003 <- o_orderkey
+|  |  |  runtime filters: RF006 <- o_orderkey
 |  |  |
 |  |  |--14:EXCHANGE [HASH(o_orderkey)]
 |  |  |  |
@@ -3901,7 +3901,7 @@ PLAN-ROOT SINK
 |  |  01:SCAN HDFS [tpch.lineitem l1]
 |  |     partitions=1/1 files=1 size=718.94MB
 |  |     predicates: l1.l_receiptdate > l1.l_commitdate
-|  |     runtime filters: RF002 -> l1.l_suppkey, RF003 -> l1.l_orderkey
+|  |     runtime filters: RF004 -> l1.l_suppkey, RF006 -> l1.l_orderkey
 |  |
 |  17:EXCHANGE [HASH(l2.l_orderkey)]
 |  |
@@ -3953,7 +3953,7 @@ PLAN-ROOT SINK
 |  |  |
 |  |  08:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  |  hash predicates: s_nationkey = n_nationkey
-|  |  |  runtime filters: RF001 <- n_nationkey
+|  |  |  runtime filters: RF002 <- n_nationkey
 |  |  |
 |  |  |--JOIN BUILD
 |  |  |  |  join-table-id=02 plan-id=03 cohort-id=03
@@ -3967,7 +3967,7 @@ PLAN-ROOT SINK
 |  |  |
 |  |  07:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  |  hash predicates: l1.l_suppkey = s_suppkey
-|  |  |  runtime filters: RF002 <- s_suppkey
+|  |  |  runtime filters: RF004 <- s_suppkey
 |  |  |
 |  |  |--JOIN BUILD
 |  |  |  |  join-table-id=03 plan-id=04 cohort-id=03
@@ -3977,11 +3977,11 @@ PLAN-ROOT SINK
 |  |  |  |
 |  |  |  00:SCAN HDFS [tpch.supplier]
 |  |  |     partitions=1/1 files=1 size=1.33MB
-|  |  |     runtime filters: RF001 -> s_nationkey
+|  |  |     runtime filters: RF002 -> s_nationkey
 |  |  |
 |  |  06:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  |  hash predicates: l1.l_orderkey = o_orderkey
-|  |  |  runtime filters: RF003 <- o_orderkey
+|  |  |  runtime filters: RF006 <- o_orderkey
 |  |  |
 |  |  |--JOIN BUILD
 |  |  |  |  join-table-id=04 plan-id=05 cohort-id=03
@@ -3998,7 +3998,7 @@ PLAN-ROOT SINK
 |  |  01:SCAN HDFS [tpch.lineitem l1]
 |  |     partitions=1/1 files=1 size=718.94MB
 |  |     predicates: l1.l_receiptdate > l1.l_commitdate
-|  |     runtime filters: RF002 -> l1.l_suppkey, RF003 -> l1.l_orderkey
+|  |     runtime filters: RF004 -> l1.l_suppkey, RF006 -> l1.l_orderkey
 |  |
 |  17:EXCHANGE [HASH(l2.l_orderkey)]
 |  |

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-kudu.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-kudu.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-kudu.test
index 2dde552..9c4da19 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-kudu.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-kudu.test
@@ -86,31 +86,39 @@ PLAN-ROOT SINK
 |
 17:HASH JOIN [RIGHT SEMI JOIN]
 |  hash predicates: ps_partkey = p_partkey, min(ps_supplycost) = ps_supplycost
+|  runtime filters: RF002 <- p_partkey
 |
 |--16:HASH JOIN [INNER JOIN]
 |  |  hash predicates: n_regionkey = r_regionkey
+|  |  runtime filters: RF011 <- r_regionkey
 |  |
 |  |--04:SCAN KUDU [tpch_kudu.region]
 |  |     kudu predicates: r_name = 'EUROPE'
 |  |
 |  15:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_nationkey = n_nationkey
+|  |  runtime filters: RF013 <- n_nationkey
 |  |
 |  |--03:SCAN KUDU [tpch_kudu.nation]
+|  |     runtime filters: RF011 -> n_regionkey
 |  |
 |  14:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_suppkey = ps_suppkey
+|  |  runtime filters: RF015 <- ps_suppkey
 |  |
 |  |--13:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: ps_partkey = p_partkey
+|  |  |  runtime filters: RF017 <- p_partkey
 |  |  |
 |  |  |--00:SCAN KUDU [tpch_kudu.part]
 |  |  |     predicates: p_type LIKE '%BRASS'
 |  |  |     kudu predicates: p_size = 15
 |  |  |
 |  |  02:SCAN KUDU [tpch_kudu.partsupp]
+|  |     runtime filters: RF017 -> ps_partkey
 |  |
 |  01:SCAN KUDU [tpch_kudu.supplier]
+|     runtime filters: RF013 -> s_nationkey, RF015 -> s_suppkey
 |
 12:AGGREGATE [FINALIZE]
 |  output: min(ps_supplycost)
@@ -118,21 +126,27 @@ PLAN-ROOT SINK
 |
 11:HASH JOIN [INNER JOIN]
 |  hash predicates: n_regionkey = r_regionkey
+|  runtime filters: RF005 <- r_regionkey
 |
 |--08:SCAN KUDU [tpch_kudu.region]
 |     kudu predicates: r_name = 'EUROPE'
 |
 10:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n_nationkey
+|  runtime filters: RF007 <- n_nationkey
 |
 |--07:SCAN KUDU [tpch_kudu.nation]
+|     runtime filters: RF005 -> n_regionkey
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: ps_suppkey = s_suppkey
+|  runtime filters: RF009 <- s_suppkey
 |
 |--06:SCAN KUDU [tpch_kudu.supplier]
+|     runtime filters: RF007 -> s_nationkey
 |
 05:SCAN KUDU [tpch_kudu.partsupp]
+   runtime filters: RF002 -> tpch_kudu.partsupp.ps_partkey, RF009 -> ps_suppkey
 ====
 # Q3 - Shipping Priority Query
 select
@@ -170,18 +184,22 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: o_custkey = c_custkey
+|  runtime filters: RF001 <- c_custkey
 |
 |--00:SCAN KUDU [tpch_kudu.customer]
 |     kudu predicates: c_mktsegment = 'BUILDING'
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
+|  runtime filters: RF003 <- o_orderkey
 |
 |--01:SCAN KUDU [tpch_kudu.orders]
 |     kudu predicates: o_orderdate < '1995-03-15'
+|     runtime filters: RF001 -> o_custkey
 |
 02:SCAN KUDU [tpch_kudu.lineitem]
    kudu predicates: l_shipdate > '1995-03-15'
+   runtime filters: RF003 -> l_orderkey
 ====
 # Q4 - Order Priority Checking Query
 select
@@ -217,12 +235,14 @@ PLAN-ROOT SINK
 |
 02:HASH JOIN [RIGHT SEMI JOIN]
 |  hash predicates: l_orderkey = o_orderkey
+|  runtime filters: RF001 <- o_orderkey
 |
 |--00:SCAN KUDU [tpch_kudu.orders]
 |     kudu predicates: o_orderdate < '1993-10-01', o_orderdate >= '1993-07-01'
 |
 01:SCAN KUDU [tpch_kudu.lineitem]
    predicates: l_commitdate < l_receiptdate
+   runtime filters: RF001 -> l_orderkey
 ====
 # Q5 - Local Supplier Volume Query
 select
@@ -261,32 +281,42 @@ PLAN-ROOT SINK
 |
 10:HASH JOIN [INNER JOIN]
 |  hash predicates: n_regionkey = r_regionkey
+|  runtime filters: RF001 <- r_regionkey
 |
 |--05:SCAN KUDU [tpch_kudu.region]
 |     kudu predicates: r_name = 'ASIA'
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n_nationkey
+|  runtime filters: RF003 <- n_nationkey
 |
 |--04:SCAN KUDU [tpch_kudu.nation]
+|     runtime filters: RF001 -> n_regionkey
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: c_nationkey = s_nationkey, l_suppkey = s_suppkey
+|  runtime filters: RF006 <- s_nationkey, RF007 <- s_suppkey
 |
 |--03:SCAN KUDU [tpch_kudu.supplier]
+|     runtime filters: RF003 -> s_nationkey
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: o_custkey = c_custkey
+|  runtime filters: RF009 <- c_custkey
 |
 |--00:SCAN KUDU [tpch_kudu.customer]
+|     runtime filters: RF003 -> tpch_kudu.customer.c_nationkey, RF006 -> c_nationkey
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
+|  runtime filters: RF011 <- o_orderkey
 |
 |--01:SCAN KUDU [tpch_kudu.orders]
 |     kudu predicates: o_orderdate < '1995-01-01', o_orderdate >= '1994-01-01'
+|     runtime filters: RF009 -> o_custkey
 |
 02:SCAN KUDU [tpch_kudu.lineitem]
+   runtime filters: RF007 -> l_suppkey, RF011 -> l_orderkey
 ====
 # Q6 - Forecasting Revenue Change Query
 select
@@ -359,31 +389,40 @@ PLAN-ROOT SINK
 10:HASH JOIN [INNER JOIN]
 |  hash predicates: c_nationkey = n2.n_nationkey
 |  other predicates: ((n1.n_name = 'FRANCE' AND n2.n_name = 'GERMANY') OR (n1.n_name = 'GERMANY' AND n2.n_name = 'FRANCE'))
+|  runtime filters: RF001 <- n2.n_nationkey
 |
 |--05:SCAN KUDU [tpch_kudu.nation n2]
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n1.n_nationkey
+|  runtime filters: RF003 <- n1.n_nationkey
 |
 |--04:SCAN KUDU [tpch_kudu.nation n1]
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: o_custkey = c_custkey
+|  runtime filters: RF005 <- c_custkey
 |
 |--03:SCAN KUDU [tpch_kudu.customer]
+|     runtime filters: RF001 -> c_nationkey
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: l_suppkey = s_suppkey
+|  runtime filters: RF007 <- s_suppkey
 |
 |--00:SCAN KUDU [tpch_kudu.supplier]
+|     runtime filters: RF003 -> s_nationkey
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
+|  runtime filters: RF009 <- o_orderkey
 |
 |--02:SCAN KUDU [tpch_kudu.orders]
+|     runtime filters: RF005 -> o_custkey
 |
 01:SCAN KUDU [tpch_kudu.lineitem]
    kudu predicates: l_shipdate <= '1996-12-31', l_shipdate >= '1995-01-01'
+   runtime filters: RF007 -> l_suppkey, RF009 -> l_orderkey
 ====
 # Q8 - National Market Share Query
 select
@@ -435,43 +474,55 @@ PLAN-ROOT SINK
 |
 14:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n2.n_nationkey
+|  runtime filters: RF001 <- n2.n_nationkey
 |
 |--06:SCAN KUDU [tpch_kudu.nation n2]
 |
 13:HASH JOIN [INNER JOIN]
 |  hash predicates: n1.n_regionkey = r_regionkey
+|  runtime filters: RF003 <- r_regionkey
 |
 |--07:SCAN KUDU [tpch_kudu.region]
 |     kudu predicates: r_name = 'AMERICA'
 |
 12:HASH JOIN [INNER JOIN]
 |  hash predicates: c_nationkey = n1.n_nationkey
+|  runtime filters: RF005 <- n1.n_nationkey
 |
 |--05:SCAN KUDU [tpch_kudu.nation n1]
+|     runtime filters: RF003 -> n1.n_regionkey
 |
 11:HASH JOIN [INNER JOIN]
 |  hash predicates: c_custkey = o_custkey
+|  runtime filters: RF007 <- o_custkey
 |
 |--10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: l_suppkey = s_suppkey
+|  |  runtime filters: RF009 <- s_suppkey
 |  |
 |  |--01:SCAN KUDU [tpch_kudu.supplier]
+|  |     runtime filters: RF001 -> s_nationkey
 |  |
 |  09:HASH JOIN [INNER JOIN]
 |  |  hash predicates: o_orderkey = l_orderkey
+|  |  runtime filters: RF011 <- l_orderkey
 |  |
 |  |--08:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: l_partkey = p_partkey
+|  |  |  runtime filters: RF013 <- p_partkey
 |  |  |
 |  |  |--00:SCAN KUDU [tpch_kudu.part]
 |  |  |     kudu predicates: p_type = 'ECONOMY ANODIZED STEEL'
 |  |  |
 |  |  02:SCAN KUDU [tpch_kudu.lineitem]
+|  |     runtime filters: RF009 -> l_suppkey, RF013 -> l_partkey
 |  |
 |  03:SCAN KUDU [tpch_kudu.orders]
 |     kudu predicates: o_orderdate <= '1996-12-31', o_orderdate >= '1995-01-01'
+|     runtime filters: RF011 -> o_orderkey
 |
 04:SCAN KUDU [tpch_kudu.customer]
+   runtime filters: RF005 -> c_nationkey, RF007 -> c_custkey
 ====
 # Q9 - Product Type Measure Query
 select
@@ -517,31 +568,39 @@ PLAN-ROOT SINK
 |
 10:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n_nationkey
+|  runtime filters: RF001 <- n_nationkey
 |
 |--05:SCAN KUDU [tpch_kudu.nation]
 |
 09:HASH JOIN [INNER JOIN]
 |  hash predicates: l_partkey = ps_partkey, l_suppkey = ps_suppkey
+|  runtime filters: RF004 <- ps_partkey, RF005 <- ps_suppkey
 |
 |--03:SCAN KUDU [tpch_kudu.partsupp]
 |
 08:HASH JOIN [INNER JOIN]
 |  hash predicates: l_suppkey = s_suppkey
+|  runtime filters: RF007 <- s_suppkey
 |
 |--01:SCAN KUDU [tpch_kudu.supplier]
+|     runtime filters: RF001 -> s_nationkey, RF005 -> tpch_kudu.supplier.s_suppkey
 |
 07:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
+|  runtime filters: RF009 <- o_orderkey
 |
 |--04:SCAN KUDU [tpch_kudu.orders]
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: l_partkey = p_partkey
+|  runtime filters: RF011 <- p_partkey
 |
 |--00:SCAN KUDU [tpch_kudu.part]
 |     predicates: p_name LIKE '%green%'
+|     runtime filters: RF004 -> tpch_kudu.part.p_partkey
 |
 02:SCAN KUDU [tpch_kudu.lineitem]
+   runtime filters: RF004 -> l_partkey, RF005 -> l_suppkey, RF007 -> l_suppkey, RF009 -> l_orderkey, RF011 -> l_partkey
 ====
 # Q10 - Returned Item Reporting Query
 # Converted select from multiple tables to joins
@@ -589,22 +648,27 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: c_nationkey = n_nationkey
+|  runtime filters: RF001 <- n_nationkey
 |
 |--03:SCAN KUDU [tpch_kudu.nation]
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: c_custkey = o_custkey
+|  runtime filters: RF003 <- o_custkey
 |
 |--04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: l_orderkey = o_orderkey
+|  |  runtime filters: RF005 <- o_orderkey
 |  |
 |  |--01:SCAN KUDU [tpch_kudu.orders]
 |  |     kudu predicates: o_orderdate < '1994-01-01', o_orderdate >= '1993-10-01'
 |  |
 |  02:SCAN KUDU [tpch_kudu.lineitem]
 |     kudu predicates: l_returnflag = 'R'
+|     runtime filters: RF005 -> l_orderkey
 |
 00:SCAN KUDU [tpch_kudu.customer]
+   runtime filters: RF001 -> c_nationkey, RF003 -> c_custkey
 ====
 # Q11 - Important Stock Identification
 # Modifications: query was rewritten to not have a subquery in the having clause
@@ -654,16 +718,20 @@ PLAN-ROOT SINK
 |  |
 |  10:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_nationkey = n_nationkey
+|  |  runtime filters: RF005 <- n_nationkey
 |  |
 |  |--08:SCAN KUDU [tpch_kudu.nation]
 |  |     kudu predicates: n_name = 'GERMANY'
 |  |
 |  09:HASH JOIN [INNER JOIN]
 |  |  hash predicates: ps_suppkey = s_suppkey
+|  |  runtime filters: RF007 <- s_suppkey
 |  |
 |  |--07:SCAN KUDU [tpch_kudu.supplier]
+|  |     runtime filters: RF005 -> s_nationkey
 |  |
 |  06:SCAN KUDU [tpch_kudu.partsupp]
+|     runtime filters: RF007 -> ps_suppkey
 |
 05:AGGREGATE [FINALIZE]
 |  output: sum(ps_supplycost * ps_availqty)
@@ -671,16 +739,20 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n_nationkey
+|  runtime filters: RF001 <- n_nationkey
 |
 |--02:SCAN KUDU [tpch_kudu.nation]
 |     kudu predicates: n_name = 'GERMANY'
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: ps_suppkey = s_suppkey
+|  runtime filters: RF003 <- s_suppkey
 |
 |--01:SCAN KUDU [tpch_kudu.supplier]
+|     runtime filters: RF001 -> s_nationkey
 |
 00:SCAN KUDU [tpch_kudu.partsupp]
+   runtime filters: RF003 -> ps_suppkey
 ====
 # Q12 - Shipping Mode and Order Priority Query
 select
@@ -723,12 +795,14 @@ PLAN-ROOT SINK
 |
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: o_orderkey = l_orderkey
+|  runtime filters: RF001 <- l_orderkey
 |
 |--01:SCAN KUDU [tpch_kudu.lineitem]
 |     predicates: l_commitdate < l_receiptdate, l_shipdate < l_commitdate
 |     kudu predicates: l_shipmode IN ('MAIL', 'SHIP'), l_receiptdate < '1995-01-01', l_receiptdate >= '1994-01-01'
 |
 00:SCAN KUDU [tpch_kudu.orders]
+   runtime filters: RF001 -> o_orderkey
 ====
 # Q13 - Customer Distribution Query
 select
@@ -767,11 +841,13 @@ PLAN-ROOT SINK
 |
 02:HASH JOIN [RIGHT OUTER JOIN]
 |  hash predicates: o_custkey = c_custkey
+|  runtime filters: RF001 <- c_custkey
 |
 |--00:SCAN KUDU [tpch_kudu.customer]
 |
 01:SCAN KUDU [tpch_kudu.orders]
    predicates: NOT o_comment LIKE '%special%requests%'
+   runtime filters: RF001 -> o_custkey
 ====
 # Q14 - Promotion Effect
 select
@@ -795,11 +871,13 @@ PLAN-ROOT SINK
 |
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: l_partkey = p_partkey
+|  runtime filters: RF001 <- p_partkey
 |
 |--01:SCAN KUDU [tpch_kudu.part]
 |
 00:SCAN KUDU [tpch_kudu.lineitem]
    kudu predicates: l_shipdate < '1995-10-01', l_shipdate >= '1995-09-01'
+   runtime filters: RF001 -> l_partkey
 ====
 # Q15 - Top Supplier Query
 with revenue_view as (
@@ -853,6 +931,7 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: s_suppkey = l_suppkey
+|  runtime filters: RF001 <- l_suppkey
 |
 |--02:AGGREGATE [FINALIZE]
 |  |  output: sum(l_extendedprice * (1 - l_discount))
@@ -862,6 +941,7 @@ PLAN-ROOT SINK
 |     kudu predicates: l_shipdate < '1996-04-01', l_shipdate >= '1996-01-01'
 |
 00:SCAN KUDU [tpch_kudu.supplier]
+   runtime filters: RF001 -> s_suppkey
 ====
 # Q16 - Parts/Supplier Relation Query
 select
@@ -915,12 +995,14 @@ PLAN-ROOT SINK
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: ps_partkey = p_partkey
+|  runtime filters: RF001 <- p_partkey
 |
 |--01:SCAN KUDU [tpch_kudu.part]
 |     predicates: p_brand != 'Brand#45', NOT p_type LIKE 'MEDIUM POLISHED%'
 |     kudu predicates: p_size IN (49, 14, 23, 45, 19, 3, 36, 9)
 |
 00:SCAN KUDU [tpch_kudu.partsupp]
+   runtime filters: RF001 -> ps_partkey
 ====
 # Q17 - Small-Quantity-Order Revenue Query
 select
@@ -949,20 +1031,24 @@ PLAN-ROOT SINK
 05:HASH JOIN [RIGHT SEMI JOIN]
 |  hash predicates: l_partkey = p_partkey
 |  other join predicates: l_quantity < round(0.2 * avg(l_quantity), 2)
+|  runtime filters: RF001 <- p_partkey
 |
 |--04:HASH JOIN [INNER JOIN]
 |  |  hash predicates: l_partkey = p_partkey
+|  |  runtime filters: RF003 <- p_partkey
 |  |
 |  |--01:SCAN KUDU [tpch_kudu.part]
 |  |     kudu predicates: p_container = 'MED BOX', p_brand = 'Brand#23'
 |  |
 |  00:SCAN KUDU [tpch_kudu.lineitem]
+|     runtime filters: RF003 -> l_partkey
 |
 03:AGGREGATE [FINALIZE]
 |  output: avg(l_quantity)
 |  group by: l_partkey
 |
 02:SCAN KUDU [tpch_kudu.lineitem]
+   runtime filters: RF001 -> tpch_kudu.lineitem.l_partkey
 ====
 # Q18 - Large Value tpch_kudu.customer Query
 select
@@ -1011,6 +1097,7 @@ PLAN-ROOT SINK
 |
 07:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: o_orderkey = l_orderkey
+|  runtime filters: RF001 <- l_orderkey
 |
 |--04:AGGREGATE [FINALIZE]
 |  |  output: sum(l_quantity)
@@ -1021,15 +1108,19 @@ PLAN-ROOT SINK
 |
 06:HASH JOIN [INNER JOIN]
 |  hash predicates: o_custkey = c_custkey
+|  runtime filters: RF003 <- c_custkey
 |
 |--00:SCAN KUDU [tpch_kudu.customer]
 |
 05:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
+|  runtime filters: RF005 <- o_orderkey
 |
 |--01:SCAN KUDU [tpch_kudu.orders]
+|     runtime filters: RF001 -> o_orderkey, RF003 -> o_custkey
 |
 02:SCAN KUDU [tpch_kudu.lineitem]
+   runtime filters: RF001 -> tpch_kudu.lineitem.l_orderkey, RF005 -> l_orderkey
 ====
 # Q19 - Discounted Revenue Query
 select
@@ -1076,12 +1167,14 @@ PLAN-ROOT SINK
 02:HASH JOIN [INNER JOIN]
 |  hash predicates: l_partkey = p_partkey
 |  other predicates: ((p_brand = 'Brand#12' AND p_container IN ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') AND l_quantity >= 1 AND l_quantity <= 11 AND p_size <= 5) OR (p_brand = 'Brand#23' AND p_container IN ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') AND l_quantity >= 10 AND l_quantity <= 20 AND p_size <= 10) OR (p_brand = 'Brand#34' AND p_container IN ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') AND l_quantity >= 20 AND l_quantity <= 30 AND p_size <= 15))
+|  runtime filters: RF001 <- p_partkey
 |
 |--01:SCAN KUDU [tpch_kudu.part]
 |     kudu predicates: p_size >= 1
 |
 00:SCAN KUDU [tpch_kudu.lineitem]
    kudu predicates: l_shipmode IN ('AIR', 'AIR REG'), l_shipinstruct = 'DELIVER IN PERSON'
+   runtime filters: RF001 -> l_partkey
 ====
 # Q20 - Potential Part Promotion Query
 select
@@ -1128,26 +1221,32 @@ PLAN-ROOT SINK
 |
 09:HASH JOIN [RIGHT SEMI JOIN]
 |  hash predicates: ps_suppkey = s_suppkey
+|  runtime filters: RF001 <- s_suppkey
 |
 |--08:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_nationkey = n_nationkey
+|  |  runtime filters: RF009 <- n_nationkey
 |  |
 |  |--01:SCAN KUDU [tpch_kudu.nation]
 |  |     kudu predicates: n_name = 'CANADA'
 |  |
 |  00:SCAN KUDU [tpch_kudu.supplier]
+|     runtime filters: RF009 -> s_nationkey
 |
 07:HASH JOIN [RIGHT SEMI JOIN]
 |  hash predicates: l_partkey = ps_partkey, l_suppkey = ps_suppkey
 |  other join predicates: ps_availqty > 0.5 * sum(l_quantity)
+|  runtime filters: RF004 <- ps_partkey, RF005 <- ps_suppkey
 |
 |--06:HASH JOIN [LEFT SEMI JOIN]
 |  |  hash predicates: ps_partkey = p_partkey
+|  |  runtime filters: RF007 <- p_partkey
 |  |
 |  |--03:SCAN KUDU [tpch_kudu.part]
 |  |     predicates: p_name LIKE 'forest%'
 |  |
 |  02:SCAN KUDU [tpch_kudu.partsupp]
+|     runtime filters: RF001 -> ps_suppkey, RF007 -> ps_partkey
 |
 05:AGGREGATE [FINALIZE]
 |  output: sum(l_quantity)
@@ -1155,6 +1254,7 @@ PLAN-ROOT SINK
 |
 04:SCAN KUDU [tpch_kudu.lineitem]
    kudu predicates: l_shipdate < '1995-01-01', l_shipdate >= '1994-01-01'
+   runtime filters: RF001 -> tpch_kudu.lineitem.l_suppkey, RF004 -> tpch_kudu.lineitem.l_partkey, RF005 -> tpch_kudu.lineitem.l_suppkey
 ====
 # Q21 - Suppliers Who Kept Orders Waiting Query
 select
@@ -1214,28 +1314,35 @@ PLAN-ROOT SINK
 |--09:HASH JOIN [RIGHT SEMI JOIN]
 |  |  hash predicates: l2.l_orderkey = l1.l_orderkey
 |  |  other join predicates: l2.l_suppkey != l1.l_suppkey
+|  |  runtime filters: RF001 <- l1.l_orderkey
 |  |
 |  |--08:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: s_nationkey = n_nationkey
+|  |  |  runtime filters: RF003 <- n_nationkey
 |  |  |
 |  |  |--03:SCAN KUDU [tpch_kudu.nation]
 |  |  |     kudu predicates: n_name = 'SAUDI ARABIA'
 |  |  |
 |  |  07:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: l1.l_suppkey = s_suppkey
+|  |  |  runtime filters: RF005 <- s_suppkey
 |  |  |
 |  |  |--00:SCAN KUDU [tpch_kudu.supplier]
+|  |  |     runtime filters: RF003 -> s_nationkey
 |  |  |
 |  |  06:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: l1.l_orderkey = o_orderkey
+|  |  |  runtime filters: RF007 <- o_orderkey
 |  |  |
 |  |  |--02:SCAN KUDU [tpch_kudu.orders]
 |  |  |     kudu predicates: o_orderstatus = 'F'
 |  |  |
 |  |  01:SCAN KUDU [tpch_kudu.lineitem l1]
 |  |     predicates: l1.l_receiptdate > l1.l_commitdate
+|  |     runtime filters: RF005 -> l1.l_suppkey, RF007 -> l1.l_orderkey
 |  |
 |  04:SCAN KUDU [tpch_kudu.lineitem l2]
+|     runtime filters: RF001 -> l2.l_orderkey
 |
 05:SCAN KUDU [tpch_kudu.lineitem l3]
    predicates: l3.l_receiptdate > l3.l_commitdate

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test
index aad75e0..1aaabce 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tpch-nested.test
@@ -115,7 +115,7 @@ PLAN-ROOT SINK
 |  |
 |  21:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF004 <- n_nationkey
+|  |  runtime filters: RF008 <- n_nationkey
 |  |
 |  |--17:SUBPLAN
 |  |  |
@@ -140,11 +140,11 @@ PLAN-ROOT SINK
 |  11:SCAN HDFS [tpch_nested_parquet.supplier s]
 |     partitions=1/1 files=1 size=43.00MB
 |     predicates: !empty(s.s_partsupps)
-|     runtime filters: RF004 -> s_nationkey
+|     runtime filters: RF008 -> s_nationkey
 |
 24:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF002 <- n_nationkey
+|  runtime filters: RF004 <- n_nationkey
 |
 |--07:SUBPLAN
 |  |
@@ -177,7 +177,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [tpch_nested_parquet.supplier s]
    partitions=1/1 files=1 size=43.00MB
    predicates: !empty(s.s_partsupps)
-   runtime filters: RF002 -> s_nationkey
+   runtime filters: RF004 -> s_nationkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -206,7 +206,7 @@ PLAN-ROOT SINK
 |  |
 |  21:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF004 <- n_nationkey
+|  |  runtime filters: RF008 <- n_nationkey
 |  |
 |  |--29:EXCHANGE [BROADCAST]
 |  |  |
@@ -233,11 +233,11 @@ PLAN-ROOT SINK
 |  11:SCAN HDFS [tpch_nested_parquet.supplier s]
 |     partitions=1/1 files=1 size=43.00MB
 |     predicates: !empty(s.s_partsupps)
-|     runtime filters: RF004 -> s_nationkey
+|     runtime filters: RF008 -> s_nationkey
 |
 24:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s_nationkey = n_nationkey
-|  runtime filters: RF002 <- n_nationkey
+|  runtime filters: RF004 <- n_nationkey
 |
 |--28:EXCHANGE [BROADCAST]
 |  |
@@ -274,7 +274,7 @@ PLAN-ROOT SINK
 00:SCAN HDFS [tpch_nested_parquet.supplier s]
    partitions=1/1 files=1 size=43.00MB
    predicates: !empty(s.s_partsupps)
-   runtime filters: RF002 -> s_nationkey
+   runtime filters: RF004 -> s_nationkey
 ====
 # TPCH-Q3
 # Q3 - Shipping Priority Query
@@ -509,7 +509,7 @@ PLAN-ROOT SINK
 |
 15:HASH JOIN [INNER JOIN]
 |  hash predicates: c.c_nationkey = n.n_nationkey
-|  runtime filters: RF002 <- n.n_nationkey
+|  runtime filters: RF004 <- n.n_nationkey
 |
 |--11:SUBPLAN
 |  |
@@ -543,7 +543,7 @@ PLAN-ROOT SINK
    partitions=1/1 files=4 size=292.36MB
    predicates: !empty(c.c_orders)
    predicates on o: !empty(o.o_lineitems), o_orderdate >= '1994-01-01', o_orderdate < '1995-01-01'
-   runtime filters: RF000 -> c_nationkey, RF002 -> c.c_nationkey
+   runtime filters: RF000 -> c_nationkey, RF004 -> c.c_nationkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -574,7 +574,7 @@ PLAN-ROOT SINK
 |
 15:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c.c_nationkey = n.n_nationkey
-|  runtime filters: RF002 <- n.n_nationkey
+|  runtime filters: RF004 <- n.n_nationkey
 |
 |--19:EXCHANGE [BROADCAST]
 |  |
@@ -610,7 +610,7 @@ PLAN-ROOT SINK
    partitions=1/1 files=4 size=292.36MB
    predicates: !empty(c.c_orders)
    predicates on o: !empty(o.o_lineitems), o_orderdate >= '1994-01-01', o_orderdate < '1995-01-01'
-   runtime filters: RF000 -> c_nationkey, RF002 -> c.c_nationkey
+   runtime filters: RF000 -> c_nationkey, RF004 -> c.c_nationkey
 ====
 # TPCH-Q6
 # Q6 - Forecasting Revenue Change Query
@@ -705,7 +705,7 @@ PLAN-ROOT SINK
 |
 13:HASH JOIN [INNER JOIN]
 |  hash predicates: s_nationkey = n1.n_nationkey
-|  runtime filters: RF001 <- n1.n_nationkey
+|  runtime filters: RF002 <- n1.n_nationkey
 |
 |--10:SCAN HDFS [tpch_nested_parquet.region.r_nations n1]
 |     partitions=1/1 files=1 size=3.24KB
@@ -715,7 +715,7 @@ PLAN-ROOT SINK
 |
 |--09:SCAN HDFS [tpch_nested_parquet.supplier s]
 |     partitions=1/1 files=1 size=43.00MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 01:SUBPLAN
 |
@@ -770,7 +770,7 @@ PLAN-ROOT SINK
 |
 13:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: s_nationkey = n1.n_nationkey
-|  runtime filters: RF001 <- n1.n_nationkey
+|  runtime filters: RF002 <- n1.n_nationkey
 |
 |--18:EXCHANGE [BROADCAST]
 |  |
@@ -784,7 +784,7 @@ PLAN-ROOT SINK
 |  |
 |  09:SCAN HDFS [tpch_nested_parquet.supplier s]
 |     partitions=1/1 files=1 size=43.00MB
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 01:SUBPLAN
 |
@@ -864,7 +864,7 @@ PLAN-ROOT SINK
 |
 19:HASH JOIN [INNER JOIN]
 |  hash predicates: c_nationkey = n1.n_nationkey
-|  runtime filters: RF001 <- n1.n_nationkey
+|  runtime filters: RF002 <- n1.n_nationkey
 |
 |--12:SUBPLAN
 |  |
@@ -912,7 +912,7 @@ PLAN-ROOT SINK
    partitions=1/1 files=4 size=292.36MB
    predicates: !empty(c.c_orders)
    predicates on o: !empty(o.o_lineitems), o_orderdate >= '1995-01-01', o_orderdate <= '1996-12-31'
-   runtime filters: RF001 -> c_nationkey
+   runtime filters: RF002 -> c_nationkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -943,7 +943,7 @@ PLAN-ROOT SINK
 |
 19:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: c_nationkey = n1.n_nationkey
-|  runtime filters: RF001 <- n1.n_nationkey
+|  runtime filters: RF002 <- n1.n_nationkey
 |
 |--25:EXCHANGE [BROADCAST]
 |  |
@@ -997,7 +997,7 @@ PLAN-ROOT SINK
    partitions=1/1 files=4 size=292.36MB
    predicates: !empty(c.c_orders)
    predicates on o: !empty(o.o_lineitems), o_orderdate >= '1995-01-01', o_orderdate <= '1996-12-31'
-   runtime filters: RF001 -> c_nationkey
+   runtime filters: RF002 -> c_nationkey
 ====
 # TPCH-Q9
 # Q9 - Product Type Measure Query
@@ -1319,7 +1319,7 @@ PLAN-ROOT SINK
 |  |
 |  14:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF001 <- n_nationkey
+|  |  runtime filters: RF002 <- n_nationkey
 |  |
 |  |--13:SCAN HDFS [tpch_nested_parquet.region.r_nations n]
 |  |     partitions=1/1 files=1 size=3.24KB
@@ -1336,7 +1336,7 @@ PLAN-ROOT SINK
 |  08:SCAN HDFS [tpch_nested_parquet.supplier s]
 |     partitions=1/1 files=1 size=43.00MB
 |     predicates: !empty(s.s_partsupps)
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 07:AGGREGATE [FINALIZE]
 |  output: sum(ps_supplycost * ps_availqty)
@@ -1386,7 +1386,7 @@ PLAN-ROOT SINK
 |  |
 |  14:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF001 <- n_nationkey
+|  |  runtime filters: RF002 <- n_nationkey
 |  |
 |  |--21:EXCHANGE [BROADCAST]
 |  |  |
@@ -1405,7 +1405,7 @@ PLAN-ROOT SINK
 |  08:SCAN HDFS [tpch_nested_parquet.supplier s]
 |     partitions=1/1 files=1 size=43.00MB
 |     predicates: !empty(s.s_partsupps)
-|     runtime filters: RF001 -> s_nationkey
+|     runtime filters: RF002 -> s_nationkey
 |
 20:AGGREGATE [FINALIZE]
 |  output: sum:merge(ps_supplycost * ps_availqty)
@@ -1938,7 +1938,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN]
 |  hash predicates: l_partkey = p_partkey
-|  runtime filters: RF001 <- p_partkey
+|  runtime filters: RF002 <- p_partkey
 |
 |--01:SCAN HDFS [tpch_nested_parquet.part p]
 |     partitions=1/1 files=1 size=6.24MB
@@ -1947,7 +1947,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l]
    partitions=1/1 files=4 size=292.36MB
-   runtime filters: RF000 -> l.l_partkey, RF001 -> l_partkey
+   runtime filters: RF000 -> l.l_partkey, RF002 -> l_partkey
 ---- DISTRIBUTEDPLAN
 PLAN-ROOT SINK
 |
@@ -1981,7 +1981,7 @@ PLAN-ROOT SINK
 |
 04:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_partkey = p_partkey
-|  runtime filters: RF001 <- p_partkey
+|  runtime filters: RF002 <- p_partkey
 |
 |--07:EXCHANGE [BROADCAST]
 |  |
@@ -1992,7 +1992,7 @@ PLAN-ROOT SINK
 |
 00:SCAN HDFS [tpch_nested_parquet.customer.c_orders.o_lineitems l]
    partitions=1/1 files=4 size=292.36MB
-   runtime filters: RF000 -> l.l_partkey, RF001 -> l_partkey
+   runtime filters: RF000 -> l.l_partkey, RF002 -> l_partkey
 ====
 # TPCH-Q18
 # Q18 - Large Value Customer Query
@@ -2216,7 +2216,7 @@ PLAN-ROOT SINK
 |  |
 |  09:HASH JOIN [INNER JOIN]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF003 <- n_nationkey
+|  |  runtime filters: RF006 <- n_nationkey
 |  |
 |  |--05:SCAN HDFS [tpch_nested_parquet.region.r_nations n]
 |  |     partitions=1/1 files=1 size=3.24KB
@@ -2233,7 +2233,7 @@ PLAN-ROOT SINK
 |  00:SCAN HDFS [tpch_nested_parquet.supplier s]
 |     partitions=1/1 files=1 size=43.00MB
 |     predicates: !empty(s.s_partsupps)
-|     runtime filters: RF003 -> s_nationkey
+|     runtime filters: RF006 -> s_nationkey
 |
 08:AGGREGATE [FINALIZE]
 |  output: sum(l_quantity)
@@ -2278,7 +2278,7 @@ PLAN-ROOT SINK
 |  |
 |  09:HASH JOIN [INNER JOIN, BROADCAST]
 |  |  hash predicates: s_nationkey = n_nationkey
-|  |  runtime filters: RF003 <- n_nationkey
+|  |  runtime filters: RF006 <- n_nationkey
 |  |
 |  |--16:EXCHANGE [BROADCAST]
 |  |  |
@@ -2297,7 +2297,7 @@ PLAN-ROOT SINK
 |  00:SCAN HDFS [tpch_nested_parquet.supplier s]
 |     partitions=1/1 files=1 size=43.00MB
 |     predicates: !empty(s.s_partsupps)
-|     runtime filters: RF003 -> s_nationkey
+|     runtime filters: RF006 -> s_nationkey
 |
 15:AGGREGATE [FINALIZE]
 |  output: sum:merge(l_quantity)


[13/16] incubator-impala git commit: IMPALA-4835 (prep only): create io subfolder and namespace

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr.cc b/be/src/runtime/disk-io-mgr.cc
deleted file mode 100644
index d614ac7..0000000
--- a/be/src/runtime/disk-io-mgr.cc
+++ /dev/null
@@ -1,1190 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "common/global-flags.h"
-#include "runtime/disk-io-mgr.h"
-#include "runtime/disk-io-mgr-handle-cache.inline.h"
-#include "runtime/disk-io-mgr-internal.h"
-
-#include <boost/algorithm/string.hpp>
-
-#include "gutil/strings/substitute.h"
-#include "util/bit-util.h"
-#include "util/hdfs-util.h"
-#include "util/time.h"
-
-DECLARE_bool(disable_mem_pools);
-#ifndef NDEBUG
-DECLARE_int32(stress_scratch_write_delay_ms);
-#endif
-
-#include "common/names.h"
-
-using namespace impala;
-using namespace strings;
-
-// Control the number of disks on the machine.  If 0, this comes from the system
-// settings.
-DEFINE_int32(num_disks, 0, "Number of disks on data node.");
-// Default IoMgr configs:
-// The maximum number of the threads per disk is also the max queue depth per disk.
-DEFINE_int32(num_threads_per_disk, 0, "Number of I/O threads per disk");
-
-// Rotational disks should have 1 thread per disk to minimize seeks.  Non-rotational
-// disks don't have this penalty and benefit from multiple concurrent IO requests.
-static const int THREADS_PER_ROTATIONAL_DISK = 1;
-static const int THREADS_PER_SOLID_STATE_DISK = 8;
-
-// The maximum number of the threads per rotational disk is also the max queue depth per
-// rotational disk.
-static const string num_io_threads_per_rotational_disk_help_msg = Substitute("Number of "
-    "I/O threads per rotational disk. Has priority over num_threads_per_disk. If neither"
-    " is set, defaults to $0 thread(s) per rotational disk", THREADS_PER_ROTATIONAL_DISK);
-DEFINE_int32(num_io_threads_per_rotational_disk, 0,
-    num_io_threads_per_rotational_disk_help_msg.c_str());
-// The maximum number of the threads per solid state disk is also the max queue depth per
-// solid state disk.
-static const string num_io_threads_per_solid_state_disk_help_msg = Substitute("Number of"
-    " I/O threads per solid state disk. Has priority over num_threads_per_disk. If "
-    "neither is set, defaults to $0 thread(s) per solid state disk",
-    THREADS_PER_SOLID_STATE_DISK);
-DEFINE_int32(num_io_threads_per_solid_state_disk, 0,
-    num_io_threads_per_solid_state_disk_help_msg.c_str());
-// The maximum number of remote HDFS I/O threads.  HDFS accesses that are expected to be
-// remote are placed on a separate remote disk queue.  This is the queue depth for that
-// queue.  If 0, then the remote queue is not used and instead ranges are round-robined
-// across the local disk queues.
-DEFINE_int32(num_remote_hdfs_io_threads, 8, "Number of remote HDFS I/O threads");
-// The maximum number of S3 I/O threads. The default value of 16 was chosen empirically
-// to maximize S3 throughput. Maximum throughput is achieved with multiple connections
-// open to S3 and use of multiple CPU cores since S3 reads are relatively compute
-// expensive (SSL and JNI buffer overheads).
-DEFINE_int32(num_s3_io_threads, 16, "Number of S3 I/O threads");
-// The maximum number of ADLS I/O threads. This number is a good default to have for
-// clusters that may vary widely in size, due to an undocumented concurrency limit
-// enforced by ADLS for a cluster, which spans between 500-700. For smaller clusters
-// (~10 nodes), 64 threads would be more ideal.
-DEFINE_int32(num_adls_io_threads, 16, "Number of ADLS I/O threads");
-
-DECLARE_int64(min_buffer_size);
-
-// With 1024B through 8MB buffers, this is up to ~2GB of buffers.
-DEFINE_int32(max_free_io_buffers, 128,
-    "For each io buffer size, the maximum number of buffers the IoMgr will hold onto");
-
-// The number of cached file handles defines how much memory can be used per backend for
-// caching frequently used file handles. Measurements indicate that a single file handle
-// uses about 6kB of memory. 20k file handles will thus reserve ~120MB of memory.
-// The actual amount of memory that is associated with a file handle can be larger
-// or smaller, depending on the replication factor for this file or the path name.
-DEFINE_uint64(max_cached_file_handles, 20000, "Maximum number of HDFS file handles "
-    "that will be cached. Disabled if set to 0.");
-
-// The unused file handle timeout specifies how long a file handle will remain in the
-// cache if it is not being used. Aging out unused handles ensures that the cache is not
-// wasting memory on handles that aren't useful. This allows users to specify a larger
-// cache size, as the system will only use the memory on useful file handles.
-// Additionally, cached file handles keep an open file descriptor for local files.
-// If a file is deleted through HDFS, this open file descriptor can keep the disk space
-// from being freed. When the metadata sees that a file has been deleted, the file handle
-// will no longer be used by future queries. Aging out this file handle allows the
-// disk space to be freed in an appropriate period of time.
-DEFINE_uint64(unused_file_handle_timeout_sec, 21600, "Maximum time, in seconds, that an "
-    "unused HDFS file handle will remain in the file handle cache. Disabled if set "
-    "to 0.");
-
-// The IoMgr is able to run with a wide range of memory usage. If a query has memory
-// remaining less than this value, the IoMgr will stop all buffering regardless of the
-// current queue size.
-static const int LOW_MEMORY = 64 * 1024 * 1024;
-
-const int DiskIoMgr::SCAN_RANGE_READY_BUFFER_LIMIT;
-
-AtomicInt32 DiskIoMgr::next_disk_id_;
-
-namespace detail {
-// Indicates if file handle caching should be used
-static inline bool is_file_handle_caching_enabled() {
-  return FLAGS_max_cached_file_handles > 0;
-}
-}
-
-string DiskIoMgr::DebugString() {
-  stringstream ss;
-  ss << "Disks: " << endl;
-  for (int i = 0; i < disk_queues_.size(); ++i) {
-    unique_lock<mutex> lock(disk_queues_[i]->lock);
-    ss << "  " << (void*) disk_queues_[i] << ":" ;
-    if (!disk_queues_[i]->request_contexts.empty()) {
-      ss << " Readers: ";
-      for (DiskIoRequestContext* req_context: disk_queues_[i]->request_contexts) {
-        ss << (void*)req_context;
-      }
-    }
-    ss << endl;
-  }
-  return ss.str();
-}
-
-DiskIoMgr::BufferDescriptor::BufferDescriptor(DiskIoMgr* io_mgr,
-    DiskIoRequestContext* reader, ScanRange* scan_range, uint8_t* buffer,
-    int64_t buffer_len, MemTracker* mem_tracker)
-  : io_mgr_(io_mgr),
-    reader_(reader),
-    mem_tracker_(mem_tracker),
-    scan_range_(scan_range),
-    buffer_(buffer),
-    buffer_len_(buffer_len) {
-  DCHECK(io_mgr != nullptr);
-  DCHECK(scan_range != nullptr);
-  DCHECK(buffer != nullptr);
-  DCHECK_GE(buffer_len, 0);
-  DCHECK_NE(scan_range->external_buffer_tag_ == ScanRange::ExternalBufferTag::NO_BUFFER,
-      mem_tracker == nullptr);
-}
-
-void DiskIoMgr::BufferDescriptor::TransferOwnership(MemTracker* dst) {
-  DCHECK(dst != nullptr);
-  DCHECK(!is_client_buffer());
-  // Memory of cached buffers is not tracked against a tracker.
-  if (is_cached()) return;
-  DCHECK(mem_tracker_ != nullptr);
-  dst->Consume(buffer_len_);
-  mem_tracker_->Release(buffer_len_);
-  mem_tracker_ = dst;
-}
-
-DiskIoMgr::WriteRange::WriteRange(
-    const string& file, int64_t file_offset, int disk_id, WriteDoneCallback callback)
-  : RequestRange(RequestType::WRITE), callback_(callback) {
-  SetRange(file, file_offset, disk_id);
-}
-
-void DiskIoMgr::WriteRange::SetRange(
-    const std::string& file, int64_t file_offset, int disk_id) {
-  file_ = file;
-  offset_ = file_offset;
-  disk_id_ = disk_id;
-}
-
-void DiskIoMgr::WriteRange::SetData(const uint8_t* buffer, int64_t len) {
-  data_ = buffer;
-  len_ = len;
-}
-
-static void CheckSseSupport() {
-  if (!CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
-    LOG(WARNING) << "This machine does not support sse4_2.  The default IO system "
-                    "configurations are suboptimal for this hardware.  Consider "
-                    "increasing the number of threads per disk by restarting impalad "
-                    "using the --num_threads_per_disk flag with a higher value";
-  }
-}
-
-// Utility function to select flag that is set (has a positive value) based on precedence
-static inline int GetFirstPositiveVal(const int first_val, const int second_val,
-    const int default_val) {
-  return first_val > 0 ? first_val : (second_val > 0 ? second_val : default_val);
-}
-
-DiskIoMgr::DiskIoMgr() :
-    num_io_threads_per_rotational_disk_(GetFirstPositiveVal(
-        FLAGS_num_io_threads_per_rotational_disk, FLAGS_num_threads_per_disk,
-        THREADS_PER_ROTATIONAL_DISK)),
-    num_io_threads_per_solid_state_disk_(GetFirstPositiveVal(
-        FLAGS_num_io_threads_per_solid_state_disk, FLAGS_num_threads_per_disk,
-        THREADS_PER_SOLID_STATE_DISK)),
-    max_buffer_size_(FLAGS_read_size),
-    min_buffer_size_(FLAGS_min_buffer_size),
-    shut_down_(false),
-    total_bytes_read_counter_(TUnit::BYTES),
-    read_timer_(TUnit::TIME_NS),
-    file_handle_cache_(min(FLAGS_max_cached_file_handles,
-        FileSystemUtil::MaxNumFileHandles()),
-        FLAGS_unused_file_handle_timeout_sec) {
-  DCHECK_LE(READ_SIZE_MIN_VALUE, FLAGS_read_size);
-  int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
-  free_buffers_.resize(BitUtil::Log2Ceiling64(max_buffer_size_scaled) + 1);
-  int num_local_disks = DiskInfo::num_disks();
-  if (FLAGS_num_disks < 0 || FLAGS_num_disks > DiskInfo::num_disks()) {
-    LOG(WARNING) << "Number of disks specified should be between 0 and the number of "
-        "logical disks on the system. Defaulting to system setting of " <<
-        DiskInfo::num_disks() << " disks";
-  } else if (FLAGS_num_disks > 0) {
-    num_local_disks = FLAGS_num_disks;
-  }
-  disk_queues_.resize(num_local_disks + REMOTE_NUM_DISKS);
-  CheckSseSupport();
-}
-
-DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_rotational_disk,
-    int threads_per_solid_state_disk, int min_buffer_size, int max_buffer_size) :
-    num_io_threads_per_rotational_disk_(threads_per_rotational_disk),
-    num_io_threads_per_solid_state_disk_(threads_per_solid_state_disk),
-    max_buffer_size_(max_buffer_size),
-    min_buffer_size_(min_buffer_size),
-    shut_down_(false),
-    total_bytes_read_counter_(TUnit::BYTES),
-    read_timer_(TUnit::TIME_NS),
-    file_handle_cache_(min(FLAGS_max_cached_file_handles,
-        FileSystemUtil::MaxNumFileHandles()),
-        FLAGS_unused_file_handle_timeout_sec) {
-  int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
-  free_buffers_.resize(BitUtil::Log2Ceiling64(max_buffer_size_scaled) + 1);
-  if (num_local_disks == 0) num_local_disks = DiskInfo::num_disks();
-  disk_queues_.resize(num_local_disks + REMOTE_NUM_DISKS);
-  CheckSseSupport();
-}
-
-DiskIoMgr::~DiskIoMgr() {
-  shut_down_ = true;
-  // Notify all worker threads and shut them down.
-  for (int i = 0; i < disk_queues_.size(); ++i) {
-    if (disk_queues_[i] == nullptr) continue;
-    {
-      // This lock is necessary to properly use the condition var to notify
-      // the disk worker threads.  The readers also grab this lock so updates
-      // to shut_down_ are protected.
-      unique_lock<mutex> disk_lock(disk_queues_[i]->lock);
-    }
-    disk_queues_[i]->work_available.NotifyAll();
-  }
-  disk_thread_group_.JoinAll();
-
-  for (int i = 0; i < disk_queues_.size(); ++i) {
-    if (disk_queues_[i] == nullptr) continue;
-    int disk_id = disk_queues_[i]->disk_id;
-    for (list<DiskIoRequestContext*>::iterator it = disk_queues_[i]->request_contexts.begin();
-        it != disk_queues_[i]->request_contexts.end(); ++it) {
-      DCHECK_EQ((*it)->disk_states_[disk_id].num_threads_in_op(), 0);
-      DCHECK((*it)->disk_states_[disk_id].done());
-      (*it)->DecrementDiskRefCount();
-    }
-  }
-
-  DCHECK_EQ(num_buffers_in_readers_.Load(), 0);
-
-  // Delete all allocated buffers
-  int num_free_buffers = 0;
-  for (int idx = 0; idx < free_buffers_.size(); ++idx) {
-    num_free_buffers += free_buffers_[idx].size();
-  }
-  DCHECK_EQ(num_allocated_buffers_.Load(), num_free_buffers);
-  GcIoBuffers();
-
-  for (int i = 0; i < disk_queues_.size(); ++i) {
-    delete disk_queues_[i];
-  }
-
-  if (free_buffer_mem_tracker_ != nullptr) free_buffer_mem_tracker_->Close();
-  if (cached_read_options_ != nullptr) hadoopRzOptionsFree(cached_read_options_);
-}
-
-Status DiskIoMgr::Init(MemTracker* process_mem_tracker) {
-  DCHECK(process_mem_tracker != nullptr);
-  free_buffer_mem_tracker_.reset(
-      new MemTracker(-1, "Free Disk IO Buffers", process_mem_tracker, false));
-
-  for (int i = 0; i < disk_queues_.size(); ++i) {
-    disk_queues_[i] = new DiskQueue(i);
-    int num_threads_per_disk;
-    if (i == RemoteDfsDiskId()) {
-      num_threads_per_disk = FLAGS_num_remote_hdfs_io_threads;
-    } else if (i == RemoteS3DiskId()) {
-      num_threads_per_disk = FLAGS_num_s3_io_threads;
-    } else if (i == RemoteAdlsDiskId()) {
-      num_threads_per_disk = FLAGS_num_adls_io_threads;
-    } else if (DiskInfo::is_rotational(i)) {
-      num_threads_per_disk = num_io_threads_per_rotational_disk_;
-    } else {
-      num_threads_per_disk = num_io_threads_per_solid_state_disk_;
-    }
-    for (int j = 0; j < num_threads_per_disk; ++j) {
-      stringstream ss;
-      ss << "work-loop(Disk: " << i << ", Thread: " << j << ")";
-      std::unique_ptr<Thread> t;
-      RETURN_IF_ERROR(Thread::Create("disk-io-mgr", ss.str(), &DiskIoMgr::WorkLoop,
-          this, disk_queues_[i], &t));
-      disk_thread_group_.AddThread(move(t));
-    }
-  }
-  RETURN_IF_ERROR(file_handle_cache_.Init());
-
-  cached_read_options_ = hadoopRzOptionsAlloc();
-  DCHECK(cached_read_options_ != nullptr);
-  // Disable checksumming for cached reads.
-  int ret = hadoopRzOptionsSetSkipChecksum(cached_read_options_, true);
-  DCHECK_EQ(ret, 0);
-  // Disable automatic fallback for cached reads.
-  ret = hadoopRzOptionsSetByteBufferPool(cached_read_options_, nullptr);
-  DCHECK_EQ(ret, 0);
-
-  return Status::OK();
-}
-
-unique_ptr<DiskIoRequestContext> DiskIoMgr::RegisterContext(MemTracker* mem_tracker) {
-  return unique_ptr<DiskIoRequestContext>(
-      new DiskIoRequestContext(this, num_total_disks(), mem_tracker));
-}
-
-void DiskIoMgr::UnregisterContext(DiskIoRequestContext* reader) {
-  reader->CancelAndMarkInactive();
-}
-
-// Cancellation requires coordination from multiple threads.  Each thread that currently
-// has a reference to the request context must notice the cancel and remove it from its
-// tracking structures.  The last thread to touch the context should deallocate (aka
-// recycle) the request context object.  Potential threads are:
-//  1. Disk threads that are currently reading for this reader.
-//  2. Caller threads that are waiting in GetNext.
-//
-// The steps are:
-// 1. Cancel will immediately set the context in the Cancelled state.  This prevents any
-// other thread from adding more ready buffers to the context (they all take a lock and
-// check the state before doing so), or any write ranges to the context.
-// 2. Cancel will call cancel on each ScanRange that is not yet complete, unblocking
-// any threads in GetNext(). The reader will see the cancelled Status returned. Cancel
-// also invokes the callback for the WriteRanges with the cancelled state.
-// 3. Disk threads notice the context is cancelled either when picking the next context
-// to process or when they try to enqueue a ready buffer.  Upon noticing the cancelled
-// state, removes the context from the disk queue.  The last thread per disk with an
-// outstanding reference to the context decrements the number of disk queues the context
-// is on.
-void DiskIoMgr::CancelContext(DiskIoRequestContext* context) {
-  context->Cancel(Status::CANCELLED);
-}
-
-void DiskIoMgr::set_read_timer(DiskIoRequestContext* r, RuntimeProfile::Counter* c) {
-  r->read_timer_ = c;
-}
-
-void DiskIoMgr::set_bytes_read_counter(DiskIoRequestContext* r, RuntimeProfile::Counter* c) {
-  r->bytes_read_counter_ = c;
-}
-
-void DiskIoMgr::set_active_read_thread_counter(DiskIoRequestContext* r,
-    RuntimeProfile::Counter* c) {
-  r->active_read_thread_counter_ = c;
-}
-
-void DiskIoMgr::set_disks_access_bitmap(DiskIoRequestContext* r,
-    RuntimeProfile::Counter* c) {
-  r->disks_accessed_bitmap_ = c;
-}
-
-int64_t DiskIoMgr::queue_size(DiskIoRequestContext* reader) const {
-  return reader->num_ready_buffers_.Load();
-}
-
-Status DiskIoMgr::context_status(DiskIoRequestContext* context) const {
-  unique_lock<mutex> lock(context->lock_);
-  return context->status_;
-}
-
-int64_t DiskIoMgr::bytes_read_local(DiskIoRequestContext* reader) const {
-  return reader->bytes_read_local_.Load();
-}
-
-int64_t DiskIoMgr::bytes_read_short_circuit(DiskIoRequestContext* reader) const {
-  return reader->bytes_read_short_circuit_.Load();
-}
-
-int64_t DiskIoMgr::bytes_read_dn_cache(DiskIoRequestContext* reader) const {
-  return reader->bytes_read_dn_cache_.Load();
-}
-
-int DiskIoMgr::num_remote_ranges(DiskIoRequestContext* reader) const {
-  return reader->num_remote_ranges_.Load();
-}
-
-int64_t DiskIoMgr::unexpected_remote_bytes(DiskIoRequestContext* reader) const {
-  return reader->unexpected_remote_bytes_.Load();
-}
-
-int DiskIoMgr::cached_file_handles_hit_count(DiskIoRequestContext* reader) const {
-  return reader->cached_file_handles_hit_count_.Load();
-}
-
-int DiskIoMgr::cached_file_handles_miss_count(DiskIoRequestContext* reader) const {
-  return reader->cached_file_handles_miss_count_.Load();
-}
-
-int64_t DiskIoMgr::GetReadThroughput() {
-  return RuntimeProfile::UnitsPerSecond(&total_bytes_read_counter_, &read_timer_);
-}
-
-Status DiskIoMgr::ValidateScanRange(ScanRange* range) {
-  int disk_id = range->disk_id_;
-  if (disk_id < 0 || disk_id >= disk_queues_.size()) {
-    return Status(TErrorCode::DISK_IO_ERROR,
-        Substitute("Invalid scan range.  Bad disk id: $0", disk_id));
-  }
-  if (range->offset_ < 0) {
-    return Status(TErrorCode::DISK_IO_ERROR,
-        Substitute("Invalid scan range. Negative offset $0", range->offset_));
-  }
-  if (range->len_ < 0) {
-    return Status(TErrorCode::DISK_IO_ERROR,
-        Substitute("Invalid scan range. Negative length $0", range->len_));
-  }
-  return Status::OK();
-}
-
-Status DiskIoMgr::AddScanRanges(DiskIoRequestContext* reader,
-    const vector<ScanRange*>& ranges, bool schedule_immediately) {
-  if (ranges.empty()) return Status::OK();
-
-  // Validate and initialize all ranges
-  for (int i = 0; i < ranges.size(); ++i) {
-    RETURN_IF_ERROR(ValidateScanRange(ranges[i]));
-    ranges[i]->InitInternal(this, reader);
-  }
-
-  // disks that this reader needs to be scheduled on.
-  unique_lock<mutex> reader_lock(reader->lock_);
-  DCHECK(reader->Validate()) << endl << reader->DebugString();
-
-  if (reader->state_ == DiskIoRequestContext::Cancelled) {
-    DCHECK(!reader->status_.ok());
-    return reader->status_;
-  }
-
-  // Add each range to the queue of the disk the range is on
-  for (int i = 0; i < ranges.size(); ++i) {
-    // Don't add empty ranges.
-    DCHECK_NE(ranges[i]->len(), 0);
-    ScanRange* range = ranges[i];
-
-    if (range->try_cache_) {
-      if (schedule_immediately) {
-        bool cached_read_succeeded;
-        RETURN_IF_ERROR(range->ReadFromCache(reader_lock, &cached_read_succeeded));
-        if (cached_read_succeeded) continue;
-        // Cached read failed, fall back to AddRequestRange() below.
-      } else {
-        reader->cached_ranges_.Enqueue(range);
-        continue;
-      }
-    }
-    reader->AddRequestRange(range, schedule_immediately);
-  }
-  DCHECK(reader->Validate()) << endl << reader->DebugString();
-
-  return Status::OK();
-}
-
-Status DiskIoMgr::AddScanRange(
-    DiskIoRequestContext* reader, ScanRange* range, bool schedule_immediately) {
-  return AddScanRanges(reader, vector<ScanRange*>({range}), schedule_immediately);
-}
-
-// This function returns the next scan range the reader should work on, checking
-// for eos and error cases. If there isn't already a cached scan range or a scan
-// range prepared by the disk threads, the caller waits on the disk threads.
-Status DiskIoMgr::GetNextRange(DiskIoRequestContext* reader, ScanRange** range) {
-  DCHECK(reader != nullptr);
-  DCHECK(range != nullptr);
-  *range = nullptr;
-  Status status = Status::OK();
-
-  unique_lock<mutex> reader_lock(reader->lock_);
-  DCHECK(reader->Validate()) << endl << reader->DebugString();
-
-  while (true) {
-    if (reader->state_ == DiskIoRequestContext::Cancelled) {
-      DCHECK(!reader->status_.ok());
-      status = reader->status_;
-      break;
-    }
-
-    if (reader->num_unstarted_scan_ranges_.Load() == 0 &&
-        reader->ready_to_start_ranges_.empty() && reader->cached_ranges_.empty()) {
-      // All ranges are done, just return.
-      break;
-    }
-
-    if (!reader->cached_ranges_.empty()) {
-      // We have a cached range.
-      *range = reader->cached_ranges_.Dequeue();
-      DCHECK((*range)->try_cache_);
-      bool cached_read_succeeded;
-      RETURN_IF_ERROR((*range)->ReadFromCache(reader_lock, &cached_read_succeeded));
-      if (cached_read_succeeded) return Status::OK();
-
-      // This range ended up not being cached. Loop again and pick up a new range.
-      reader->AddRequestRange(*range, false);
-      DCHECK(reader->Validate()) << endl << reader->DebugString();
-      *range = nullptr;
-      continue;
-    }
-
-    if (reader->ready_to_start_ranges_.empty()) {
-      reader->ready_to_start_ranges_cv_.Wait(reader_lock);
-    } else {
-      *range = reader->ready_to_start_ranges_.Dequeue();
-      DCHECK(*range != nullptr);
-      int disk_id = (*range)->disk_id();
-      DCHECK_EQ(*range, reader->disk_states_[disk_id].next_scan_range_to_start());
-      // Set this to nullptr, the next time this disk runs for this reader, it will
-      // get another range ready.
-      reader->disk_states_[disk_id].set_next_scan_range_to_start(nullptr);
-      reader->ScheduleScanRange(*range);
-      break;
-    }
-  }
-  return status;
-}
-
-Status DiskIoMgr::Read(DiskIoRequestContext* reader,
-    ScanRange* range, std::unique_ptr<BufferDescriptor>* buffer) {
-  DCHECK(range != nullptr);
-  DCHECK(buffer != nullptr);
-  *buffer = nullptr;
-
-  if (range->len() > max_buffer_size_
-      && range->external_buffer_tag_ != ScanRange::ExternalBufferTag::CLIENT_BUFFER) {
-    return Status(TErrorCode::DISK_IO_ERROR, Substitute("Internal error: cannot "
-        "perform sync read of '$0' bytes that is larger than the max read buffer size "
-        "'$1'.", range->len(), max_buffer_size_));
-  }
-
-  vector<DiskIoMgr::ScanRange*> ranges;
-  ranges.push_back(range);
-  RETURN_IF_ERROR(AddScanRanges(reader, ranges, true));
-  RETURN_IF_ERROR(range->GetNext(buffer));
-  DCHECK((*buffer) != nullptr);
-  DCHECK((*buffer)->eosr());
-  return Status::OK();
-}
-
-void DiskIoMgr::ReturnBuffer(unique_ptr<BufferDescriptor> buffer_desc) {
-  DCHECK(buffer_desc != nullptr);
-  if (!buffer_desc->status_.ok()) DCHECK(buffer_desc->buffer_ == nullptr);
-
-  DiskIoRequestContext* reader = buffer_desc->reader_;
-  if (buffer_desc->buffer_ != nullptr) {
-    if (!buffer_desc->is_cached() && !buffer_desc->is_client_buffer()) {
-      // Buffers that were not allocated by the DiskIoMgr don't need to be freed.
-      FreeBufferMemory(buffer_desc.get());
-    }
-    buffer_desc->buffer_ = nullptr;
-    num_buffers_in_readers_.Add(-1);
-    reader->num_buffers_in_reader_.Add(-1);
-  } else {
-    // A nullptr buffer means there was an error in which case there is no buffer
-    // to return.
-  }
-
-  if (buffer_desc->eosr_ || buffer_desc->scan_range_->is_cancelled_) {
-    // Need to close the scan range if returning the last buffer or the scan range
-    // has been cancelled (and the caller might never get the last buffer).
-    // Close() is idempotent so multiple cancelled buffers is okay.
-    buffer_desc->scan_range_->Close();
-  }
-}
-
-unique_ptr<DiskIoMgr::BufferDescriptor> DiskIoMgr::GetFreeBuffer(
-    DiskIoRequestContext* reader, ScanRange* range, int64_t buffer_size) {
-  DCHECK_LE(buffer_size, max_buffer_size_);
-  DCHECK_GT(buffer_size, 0);
-  buffer_size = min(static_cast<int64_t>(max_buffer_size_), buffer_size);
-  int idx = free_buffers_idx(buffer_size);
-  // Quantize buffer size to nearest power of 2 greater than the specified buffer size and
-  // convert to bytes
-  buffer_size = (1LL << idx) * min_buffer_size_;
-
-  // Track memory against the reader. This is checked the next time we start
-  // a read for the next reader in DiskIoMgr::GetNextScanRange().
-  DCHECK(reader->mem_tracker_ != nullptr);
-  reader->mem_tracker_->Consume(buffer_size);
-
-  uint8_t* buffer = nullptr;
-  {
-    unique_lock<mutex> lock(free_buffers_lock_);
-    if (free_buffers_[idx].empty()) {
-      num_allocated_buffers_.Add(1);
-      if (ImpaladMetrics::IO_MGR_NUM_BUFFERS != nullptr) {
-        ImpaladMetrics::IO_MGR_NUM_BUFFERS->Increment(1L);
-      }
-      if (ImpaladMetrics::IO_MGR_TOTAL_BYTES != nullptr) {
-        ImpaladMetrics::IO_MGR_TOTAL_BYTES->Increment(buffer_size);
-      }
-      // We already tracked this memory against the reader's MemTracker.
-      buffer = new uint8_t[buffer_size];
-    } else {
-      if (ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS != nullptr) {
-        ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS->Increment(-1L);
-      }
-      buffer = free_buffers_[idx].front();
-      free_buffers_[idx].pop_front();
-      free_buffer_mem_tracker_->Release(buffer_size);
-    }
-  }
-
-  // Validate more invariants.
-  DCHECK(range != nullptr);
-  DCHECK(reader != nullptr);
-  DCHECK(buffer != nullptr);
-  return unique_ptr<BufferDescriptor>(new BufferDescriptor(
-      this, reader, range, buffer, buffer_size, reader->mem_tracker_));
-}
-
-void DiskIoMgr::GcIoBuffers(int64_t bytes_to_free) {
-  unique_lock<mutex> lock(free_buffers_lock_);
-  int buffers_freed = 0;
-  int bytes_freed = 0;
-  // Free small-to-large to avoid retaining many small buffers and fragmenting memory.
-  for (int idx = 0; idx < free_buffers_.size(); ++idx) {
-    deque<uint8_t*>* free_buffers = &free_buffers_[idx];
-    while (
-        !free_buffers->empty() && (bytes_to_free == -1 || bytes_freed <= bytes_to_free)) {
-      uint8_t* buffer = free_buffers->front();
-      free_buffers->pop_front();
-      int64_t buffer_size = (1LL << idx) * min_buffer_size_;
-      delete[] buffer;
-      free_buffer_mem_tracker_->Release(buffer_size);
-      num_allocated_buffers_.Add(-1);
-
-      ++buffers_freed;
-      bytes_freed += buffer_size;
-    }
-    if (bytes_to_free != -1 && bytes_freed >= bytes_to_free) break;
-  }
-
-  if (ImpaladMetrics::IO_MGR_NUM_BUFFERS != nullptr) {
-    ImpaladMetrics::IO_MGR_NUM_BUFFERS->Increment(-buffers_freed);
-  }
-  if (ImpaladMetrics::IO_MGR_TOTAL_BYTES != nullptr) {
-    ImpaladMetrics::IO_MGR_TOTAL_BYTES->Increment(-bytes_freed);
-  }
-  if (ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS != nullptr) {
-    ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS->Increment(-buffers_freed);
-  }
-}
-
-void DiskIoMgr::FreeBufferMemory(BufferDescriptor* desc) {
-  DCHECK(!desc->is_cached());
-  DCHECK(!desc->is_client_buffer());
-  uint8_t* buffer = desc->buffer_;
-  int64_t buffer_size = desc->buffer_len_;
-  int idx = free_buffers_idx(buffer_size);
-  DCHECK_EQ(BitUtil::Ceil(buffer_size, min_buffer_size_) & ~(1LL << idx), 0)
-      << "buffer_size_ / min_buffer_size_ should be power of 2, got buffer_size = "
-      << buffer_size << ", min_buffer_size_ = " << min_buffer_size_;
-
-  {
-    unique_lock<mutex> lock(free_buffers_lock_);
-    if (!FLAGS_disable_mem_pools &&
-        free_buffers_[idx].size() < FLAGS_max_free_io_buffers) {
-      free_buffers_[idx].push_back(buffer);
-      if (ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS != nullptr) {
-        ImpaladMetrics::IO_MGR_NUM_UNUSED_BUFFERS->Increment(1L);
-      }
-      // This consume call needs to be protected by 'free_buffers_lock_' to avoid a race
-      // with a Release() call for the same buffer that could make consumption negative.
-      // Note: we can't use TryConsume(), which can indirectly call GcIoBuffers().
-      // TODO: after IMPALA-3200 is completed, we should be able to leverage the buffer
-      // pool's free lists, and remove these free lists.
-      free_buffer_mem_tracker_->Consume(buffer_size);
-    } else {
-      num_allocated_buffers_.Add(-1);
-      delete[] buffer;
-      if (ImpaladMetrics::IO_MGR_NUM_BUFFERS != nullptr) {
-        ImpaladMetrics::IO_MGR_NUM_BUFFERS->Increment(-1L);
-      }
-      if (ImpaladMetrics::IO_MGR_TOTAL_BYTES != nullptr) {
-        ImpaladMetrics::IO_MGR_TOTAL_BYTES->Increment(-buffer_size);
-      }
-    }
-  }
-
-  // We transferred the buffer ownership from the BufferDescriptor to the DiskIoMgr.
-  desc->mem_tracker_->Release(buffer_size);
-  desc->buffer_ = nullptr;
-}
-
-// This function gets the next RequestRange to work on for this disk. It checks for
-// cancellation and
-// a) Updates ready_to_start_ranges if there are no scan ranges queued for this disk.
-// b) Adds an unstarted write range to in_flight_ranges_. The write range is processed
-//    immediately if there are no preceding scan ranges in in_flight_ranges_
-// It blocks until work is available or the thread is shut down.
-// Work is available if there is a DiskIoRequestContext with
-//  - A ScanRange with a buffer available, or
-//  - A WriteRange in unstarted_write_ranges_.
-bool DiskIoMgr::GetNextRequestRange(DiskQueue* disk_queue, RequestRange** range,
-    DiskIoRequestContext** request_context) {
-  int disk_id = disk_queue->disk_id;
-  *range = nullptr;
-
-  // This loop returns either with work to do or when the disk IoMgr shuts down.
-  while (true) {
-    *request_context = nullptr;
-    DiskIoRequestContext::PerDiskState* request_disk_state = nullptr;
-    {
-      unique_lock<mutex> disk_lock(disk_queue->lock);
-
-      while (!shut_down_ && disk_queue->request_contexts.empty()) {
-        // wait if there are no readers on the queue
-        disk_queue->work_available.Wait(disk_lock);
-      }
-      if (shut_down_) break;
-      DCHECK(!disk_queue->request_contexts.empty());
-
-      // Get the next reader and remove the reader so that another disk thread
-      // can't pick it up.  It will be enqueued before issuing the read to HDFS
-      // so this is not a big deal (i.e. multiple disk threads can read for the
-      // same reader).
-      // TODO: revisit.
-      *request_context = disk_queue->request_contexts.front();
-      disk_queue->request_contexts.pop_front();
-      DCHECK(*request_context != nullptr);
-      request_disk_state = &((*request_context)->disk_states_[disk_id]);
-      request_disk_state->IncrementRequestThreadAndDequeue();
-    }
-
-    // NOTE: no locks were taken in between.  We need to be careful about what state
-    // could have changed to the reader and disk in between.
-    // There are some invariants here.  Only one disk thread can have the
-    // same reader here (the reader is removed from the queue).  There can be
-    // other disk threads operating on this reader in other functions though.
-
-    // We just picked a reader. Before we may allocate a buffer on its behalf, check that
-    // it has not exceeded any memory limits (e.g. the query or process limit).
-    // TODO: once IMPALA-3200 is fixed, we should be able to remove the free lists and
-    // move these memory limit checks to GetFreeBuffer().
-    // Note that calling AnyLimitExceeded() can result in a call to GcIoBuffers().
-    // TODO: IMPALA-3209: we should not force a reader over its memory limit by
-    // pushing more buffers to it. Most readers can make progress and operate within
-    // a fixed memory limit.
-    if ((*request_context)->mem_tracker_ != nullptr
-        && (*request_context)->mem_tracker_->AnyLimitExceeded()) {
-      (*request_context)->Cancel(Status::MemLimitExceeded());
-    }
-
-    unique_lock<mutex> request_lock((*request_context)->lock_);
-    VLOG_FILE << "Disk (id=" << disk_id << ") reading for "
-        << (*request_context)->DebugString();
-
-    // Check if reader has been cancelled
-    if ((*request_context)->state_ == DiskIoRequestContext::Cancelled) {
-      request_disk_state->DecrementRequestThreadAndCheckDone(*request_context);
-      continue;
-    }
-
-    DCHECK_EQ((*request_context)->state_, DiskIoRequestContext::Active)
-        << (*request_context)->DebugString();
-
-    if (request_disk_state->next_scan_range_to_start() == nullptr &&
-        !request_disk_state->unstarted_scan_ranges()->empty()) {
-      // We don't have a range queued for this disk for what the caller should
-      // read next. Populate that.  We want to have one range waiting to minimize
-      // wait time in GetNextRange.
-      ScanRange* new_range = request_disk_state->unstarted_scan_ranges()->Dequeue();
-      (*request_context)->num_unstarted_scan_ranges_.Add(-1);
-      (*request_context)->ready_to_start_ranges_.Enqueue(new_range);
-      request_disk_state->set_next_scan_range_to_start(new_range);
-
-      if ((*request_context)->num_unstarted_scan_ranges_.Load() == 0) {
-        // All the ranges have been started, notify everyone blocked on GetNextRange.
-        // Only one of them will get work so make sure to return nullptr to the other
-        // caller threads.
-        (*request_context)->ready_to_start_ranges_cv_.NotifyAll();
-      } else {
-        (*request_context)->ready_to_start_ranges_cv_.NotifyOne();
-      }
-    }
-
-    // Always enqueue a WriteRange to be processed into in_flight_ranges_.
-    // This is done so in_flight_ranges_ does not exclusively contain ScanRanges.
-    // For now, enqueuing a WriteRange on each invocation of GetNextRequestRange()
-    // does not flood in_flight_ranges() with WriteRanges because the entire
-    // WriteRange is processed and removed from the queue after GetNextRequestRange()
-    // returns. (A DCHECK is used to ensure that writes do not exceed 8MB).
-    if (!request_disk_state->unstarted_write_ranges()->empty()) {
-      WriteRange* write_range = request_disk_state->unstarted_write_ranges()->Dequeue();
-      request_disk_state->in_flight_ranges()->Enqueue(write_range);
-    }
-
-    // Get the next scan range to work on from the reader. Only in_flight_ranges
-    // are eligible since the disk threads do not start new ranges on their own.
-
-    // There are no inflight ranges, nothing to do.
-    if (request_disk_state->in_flight_ranges()->empty()) {
-      request_disk_state->DecrementRequestThread();
-      continue;
-    }
-    DCHECK_GT(request_disk_state->num_remaining_ranges(), 0);
-    *range = request_disk_state->in_flight_ranges()->Dequeue();
-    DCHECK(*range != nullptr);
-
-    // Now that we've picked a request range, put the context back on the queue so
-    // another thread can pick up another request range for this context.
-    request_disk_state->ScheduleContext(*request_context, disk_id);
-    DCHECK((*request_context)->Validate()) << endl << (*request_context)->DebugString();
-    return true;
-  }
-
-  DCHECK(shut_down_);
-  return false;
-}
-
-void DiskIoMgr::HandleWriteFinished(
-    DiskIoRequestContext* writer, WriteRange* write_range, const Status& write_status) {
-  // Copy disk_id before running callback: the callback may modify write_range.
-  int disk_id = write_range->disk_id_;
-
-  // Execute the callback before decrementing the thread count. Otherwise CancelContext()
-  // that waits for the disk ref count to be 0 will return, creating a race, e.g. see
-  // IMPALA-1890.
-  // The status of the write does not affect the status of the writer context.
-  write_range->callback_(write_status);
-  {
-    unique_lock<mutex> writer_lock(writer->lock_);
-    DCHECK(writer->Validate()) << endl << writer->DebugString();
-    DiskIoRequestContext::PerDiskState& state = writer->disk_states_[disk_id];
-    if (writer->state_ == DiskIoRequestContext::Cancelled) {
-      state.DecrementRequestThreadAndCheckDone(writer);
-    } else {
-      state.DecrementRequestThread();
-    }
-    --state.num_remaining_ranges();
-  }
-}
-
-void DiskIoMgr::HandleReadFinished(DiskQueue* disk_queue, DiskIoRequestContext* reader,
-    unique_ptr<BufferDescriptor> buffer) {
-  unique_lock<mutex> reader_lock(reader->lock_);
-
-  DiskIoRequestContext::PerDiskState& state = reader->disk_states_[disk_queue->disk_id];
-  DCHECK(reader->Validate()) << endl << reader->DebugString();
-  DCHECK_GT(state.num_threads_in_op(), 0);
-  DCHECK(buffer->buffer_ != nullptr);
-
-  if (reader->state_ == DiskIoRequestContext::Cancelled) {
-    state.DecrementRequestThreadAndCheckDone(reader);
-    DCHECK(reader->Validate()) << endl << reader->DebugString();
-    if (!buffer->is_client_buffer()) FreeBufferMemory(buffer.get());
-    buffer->buffer_ = nullptr;
-    ScanRange* scan_range = buffer->scan_range_;
-    scan_range->Cancel(reader->status_);
-    // Enqueue the buffer to use the scan range's buffer cleanup path.
-    scan_range->EnqueueBuffer(reader_lock, move(buffer));
-    return;
-  }
-
-  DCHECK_EQ(reader->state_, DiskIoRequestContext::Active);
-  DCHECK(buffer->buffer_ != nullptr);
-
-  // Update the reader's scan ranges.  There are three cases here:
-  //  1. Read error
-  //  2. End of scan range
-  //  3. Middle of scan range
-  if (!buffer->status_.ok()) {
-    // Error case
-    if (!buffer->is_client_buffer()) FreeBufferMemory(buffer.get());
-    buffer->buffer_ = nullptr;
-    buffer->eosr_ = true;
-    --state.num_remaining_ranges();
-    buffer->scan_range_->Cancel(buffer->status_);
-  } else if (buffer->eosr_) {
-    --state.num_remaining_ranges();
-  }
-
-  // After calling EnqueueBuffer(), it is no longer valid to read from buffer.
-  // Store the state we need before calling EnqueueBuffer().
-  bool eosr = buffer->eosr_;
-  ScanRange* scan_range = buffer->scan_range_;
-  bool is_cached = buffer->is_cached();
-  bool queue_full = scan_range->EnqueueBuffer(reader_lock, move(buffer));
-  if (eosr) {
-    // For cached buffers, we can't close the range until the cached buffer is returned.
-    // Close() is called from DiskIoMgr::ReturnBuffer().
-    if (!is_cached) scan_range->Close();
-  } else {
-    if (queue_full) {
-      reader->blocked_ranges_.Enqueue(scan_range);
-    } else {
-      reader->ScheduleScanRange(scan_range);
-    }
-  }
-  state.DecrementRequestThread();
-}
-
-void DiskIoMgr::WorkLoop(DiskQueue* disk_queue) {
-  // The thread waits until there is work or the entire system is being shut down.
-  // If there is work, performs the read or write requested and re-enqueues the
-  // requesting context.
-  // Locks are not taken when reading from or writing to disk.
-  // The main loop has three parts:
-  //   1. GetNextRequestContext(): get the next request context (read or write) to
-  //      process and dequeue it.
-  //   2. For the dequeued request, gets the next scan- or write-range to process and
-  //      re-enqueues the request.
-  //   3. Perform the read or write as specified.
-  // Cancellation checking needs to happen in both steps 1 and 3.
-  while (true) {
-    DiskIoRequestContext* worker_context = nullptr;
-    RequestRange* range = nullptr;
-
-    if (!GetNextRequestRange(disk_queue, &range, &worker_context)) {
-      DCHECK(shut_down_);
-      break;
-    }
-
-    if (range->request_type() == RequestType::READ) {
-      ReadRange(disk_queue, worker_context, static_cast<ScanRange*>(range));
-    } else {
-      DCHECK(range->request_type() == RequestType::WRITE);
-      Write(worker_context, static_cast<WriteRange*>(range));
-    }
-  }
-
-  DCHECK(shut_down_);
-}
-
-// This function reads the specified scan range associated with the
-// specified reader context and disk queue.
-void DiskIoMgr::ReadRange(
-    DiskQueue* disk_queue, DiskIoRequestContext* reader, ScanRange* range) {
-  int64_t bytes_remaining = range->len_ - range->bytes_read_;
-  DCHECK_GT(bytes_remaining, 0);
-  unique_ptr<BufferDescriptor> buffer_desc;
-  if (range->external_buffer_tag_ == ScanRange::ExternalBufferTag::CLIENT_BUFFER) {
-    buffer_desc = unique_ptr<BufferDescriptor>(new BufferDescriptor(this, reader, range,
-        range->client_buffer_.data, range->client_buffer_.len, nullptr));
-  } else {
-    // Need to allocate a buffer to read into.
-    int64_t buffer_size = ::min(bytes_remaining, static_cast<int64_t>(max_buffer_size_));
-    buffer_desc = TryAllocateNextBufferForRange(disk_queue, reader, range, buffer_size);
-    if (buffer_desc == nullptr) return;
-  }
-  reader->num_used_buffers_.Add(1);
-
-  // No locks in this section.  Only working on local vars.  We don't want to hold a
-  // lock across the read call.
-  buffer_desc->status_ = range->Open(detail::is_file_handle_caching_enabled());
-  if (buffer_desc->status_.ok()) {
-    // Update counters.
-    if (reader->active_read_thread_counter_) {
-      reader->active_read_thread_counter_->Add(1L);
-    }
-    if (reader->disks_accessed_bitmap_) {
-      int64_t disk_bit = 1LL << disk_queue->disk_id;
-      reader->disks_accessed_bitmap_->BitOr(disk_bit);
-    }
-    SCOPED_TIMER(&read_timer_);
-    SCOPED_TIMER(reader->read_timer_);
-
-    buffer_desc->status_ = range->Read(buffer_desc->buffer_, buffer_desc->buffer_len_,
-        &buffer_desc->len_, &buffer_desc->eosr_);
-    buffer_desc->scan_range_offset_ = range->bytes_read_ - buffer_desc->len_;
-
-    if (reader->bytes_read_counter_ != nullptr) {
-      COUNTER_ADD(reader->bytes_read_counter_, buffer_desc->len_);
-    }
-
-    COUNTER_ADD(&total_bytes_read_counter_, buffer_desc->len_);
-    if (reader->active_read_thread_counter_) {
-      reader->active_read_thread_counter_->Add(-1L);
-    }
-  }
-
-  // Finished read, update reader/disk based on the results
-  HandleReadFinished(disk_queue, reader, move(buffer_desc));
-}
-
-unique_ptr<DiskIoMgr::BufferDescriptor> DiskIoMgr::TryAllocateNextBufferForRange(
-    DiskQueue* disk_queue, DiskIoRequestContext* reader, ScanRange* range,
-    int64_t buffer_size) {
-  DCHECK(reader->mem_tracker_ != nullptr);
-  bool enough_memory = reader->mem_tracker_->SpareCapacity() > LOW_MEMORY;
-  if (!enough_memory) {
-    // Low memory, GC all the buffers and try again.
-    GcIoBuffers();
-    enough_memory = reader->mem_tracker_->SpareCapacity() > LOW_MEMORY;
-  }
-
-  if (!enough_memory) {
-    DiskIoRequestContext::PerDiskState& state = reader->disk_states_[disk_queue->disk_id];
-    unique_lock<mutex> reader_lock(reader->lock_);
-
-    // Just grabbed the reader lock, check for cancellation.
-    if (reader->state_ == DiskIoRequestContext::Cancelled) {
-      DCHECK(reader->Validate()) << endl << reader->DebugString();
-      state.DecrementRequestThreadAndCheckDone(reader);
-      range->Cancel(reader->status_);
-      DCHECK(reader->Validate()) << endl << reader->DebugString();
-      return nullptr;
-    }
-
-    if (!range->ready_buffers_.empty()) {
-      // We have memory pressure and this range doesn't need another buffer
-      // (it already has one queued). Skip this range and pick it up later.
-      range->blocked_on_queue_ = true;
-      reader->blocked_ranges_.Enqueue(range);
-      state.DecrementRequestThread();
-      return nullptr;
-    } else {
-      // We need to get a buffer anyway since there are none queued. The query
-      // is likely to fail due to mem limits but there's nothing we can do about that
-      // now.
-    }
-  }
-  unique_ptr<BufferDescriptor> buffer_desc = GetFreeBuffer(reader, range, buffer_size);
-  DCHECK(buffer_desc != nullptr);
-  return buffer_desc;
-}
-
-void DiskIoMgr::Write(DiskIoRequestContext* writer_context, WriteRange* write_range) {
-  Status ret_status = Status::OK();
-  FILE* file_handle = nullptr;
-  // Raw open() syscall will create file if not present when passed these flags.
-  int fd = open(write_range->file(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
-  if (fd < 0) {
-    ret_status = Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
-        Substitute("Opening '$0' for write failed with errno=$1 description=$2",
-                                     write_range->file_, errno, GetStrErrMsg())));
-  } else {
-    file_handle = fdopen(fd, "wb");
-    if (file_handle == nullptr) {
-      ret_status = Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
-          Substitute("fdopen($0, \"wb\") failed with errno=$1 description=$2", fd, errno,
-                                       GetStrErrMsg())));
-    }
-  }
-
-  if (file_handle != nullptr) {
-    ret_status = WriteRangeHelper(file_handle, write_range);
-
-    int success = fclose(file_handle);
-    if (ret_status.ok() && success != 0) {
-      ret_status = Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
-          Substitute("fclose($0) failed", write_range->file_)));
-    }
-  }
-
-  HandleWriteFinished(writer_context, write_range, ret_status);
-}
-
-Status DiskIoMgr::WriteRangeHelper(FILE* file_handle, WriteRange* write_range) {
-  // Seek to the correct offset and perform the write.
-  int success = fseek(file_handle, write_range->offset(), SEEK_SET);
-  if (success != 0) {
-    return Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
-        Substitute("fseek($0, $1, SEEK_SET) failed with errno=$2 description=$3",
-        write_range->file_, write_range->offset(), errno, GetStrErrMsg())));
-  }
-
-#ifndef NDEBUG
-  if (FLAGS_stress_scratch_write_delay_ms > 0) {
-    SleepForMs(FLAGS_stress_scratch_write_delay_ms);
-  }
-#endif
-  int64_t bytes_written = fwrite(write_range->data_, 1, write_range->len_, file_handle);
-  if (bytes_written < write_range->len_) {
-    return Status(ErrorMsg(TErrorCode::DISK_IO_ERROR,
-        Substitute("fwrite(buffer, 1, $0, $1) failed with errno=$2 description=$3",
-        write_range->len_, write_range->file_, errno, GetStrErrMsg())));
-  }
-  if (ImpaladMetrics::IO_MGR_BYTES_WRITTEN != nullptr) {
-    ImpaladMetrics::IO_MGR_BYTES_WRITTEN->Increment(write_range->len_);
-  }
-
-  return Status::OK();
-}
-
-int DiskIoMgr::free_buffers_idx(int64_t buffer_size) {
-  int64_t buffer_size_scaled = BitUtil::Ceil(buffer_size, min_buffer_size_);
-  int idx = BitUtil::Log2Ceiling64(buffer_size_scaled);
-  DCHECK_GE(idx, 0);
-  DCHECK_LT(idx, free_buffers_.size());
-  return idx;
-}
-
-Status DiskIoMgr::AddWriteRange(DiskIoRequestContext* writer, WriteRange* write_range) {
-  unique_lock<mutex> writer_lock(writer->lock_);
-
-  if (writer->state_ == DiskIoRequestContext::Cancelled) {
-    DCHECK(!writer->status_.ok());
-    return writer->status_;
-  }
-
-  writer->AddRequestRange(write_range, false);
-  return Status::OK();
-}
-
-int DiskIoMgr::AssignQueue(const char* file, int disk_id, bool expected_local) {
-  // If it's a remote range, check for an appropriate remote disk queue.
-  if (!expected_local) {
-    if (IsHdfsPath(file) && FLAGS_num_remote_hdfs_io_threads > 0) {
-      return RemoteDfsDiskId();
-    }
-    if (IsS3APath(file)) return RemoteS3DiskId();
-    if (IsADLSPath(file)) return RemoteAdlsDiskId();
-  }
-  // Assign to a local disk queue.
-  DCHECK(!IsS3APath(file)); // S3 is always remote.
-  DCHECK(!IsADLSPath(file)); // ADLS is always remote.
-  if (disk_id == -1) {
-    // disk id is unknown, assign it an arbitrary one.
-    disk_id = next_disk_id_.Add(1);
-  }
-  // TODO: we need to parse the config for the number of dirs configured for this
-  // data node.
-  return disk_id % num_local_disks();
-}
-
-HdfsFileHandle* DiskIoMgr::GetCachedHdfsFileHandle(const hdfsFS& fs,
-    std::string* fname, int64_t mtime, DiskIoRequestContext *reader,
-    bool require_new) {
-  bool cache_hit;
-  HdfsFileHandle* fh = file_handle_cache_.GetFileHandle(fs, fname, mtime, require_new,
-      &cache_hit);
-  if (fh == nullptr) return nullptr;
-  if (cache_hit) {
-    DCHECK(!require_new);
-    ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_HIT_RATIO->Update(1L);
-    ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_HIT_COUNT->Increment(1L);
-    reader->cached_file_handles_hit_count_.Add(1L);
-  } else {
-    ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_HIT_RATIO->Update(0L);
-    ImpaladMetrics::IO_MGR_CACHED_FILE_HANDLES_MISS_COUNT->Increment(1L);
-    reader->cached_file_handles_miss_count_.Add(1L);
-  }
-  return fh;
-}
-
-void DiskIoMgr::ReleaseCachedHdfsFileHandle(std::string* fname, HdfsFileHandle* fid,
-    bool destroy_handle) {
-  file_handle_cache_.ReleaseFileHandle(fname, fid, destroy_handle);
-}
-
-Status DiskIoMgr::ReopenCachedHdfsFileHandle(const hdfsFS& fs, std::string* fname,
-    int64_t mtime, HdfsFileHandle** fid) {
-  bool cache_hit;
-  file_handle_cache_.ReleaseFileHandle(fname, *fid, true);
-  // The old handle has been destroyed, so *fid must be overwritten before returning.
-  *fid = file_handle_cache_.GetFileHandle(fs, fname, mtime, true,
-      &cache_hit);
-  if (*fid == nullptr) {
-    return Status(TErrorCode::DISK_IO_ERROR,
-        GetHdfsErrorMsg("Failed to open HDFS file ", fname->data()));
-  }
-  DCHECK(!cache_hit);
-  return Status::OK();
-}


[07/16] incubator-impala git commit: IMPALA-4252: Min-max runtime filters for Kudu

Posted by ta...@apache.org.
IMPALA-4252: Min-max runtime filters for Kudu

This patch implements min-max filters for runtime filters. Each
runtime filter is generated as either a bloom filter or a min-max
filter, depending on whether it has HDFS or Kudu targets, respectively.

In RuntimeFilterGenerator in the planner, each hash join node
generates a bloom and min-max filter for each equi-join predicate, but
only those filters that end up being assigned to a target make it into
the final plan.

Min-max filters are only assigned to Kudu scans if the target expr is
a column, as Kudu doesn't support bounds on general exprs, and only if
the join op is '=' and not 'is distinct from', as Kudu doesn't support
returning NULLs if a bound is set.

Min-max filters are populated by the PartitionedHashJoinBuilder.
Codegen is used to eliminate branching on the type of filter. String
min-max filters truncate their bounds at 1024 chars, so that the max
amount of memory used by min-max filters is negligible.
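
The following is an illustrative sketch of the truncation idea only,
not the actual StringMinMaxFilter implementation (the class and member
names here are invented):

#include <algorithm>
#include <string>

class SketchStringMinMax {
 public:
  static constexpr size_t kMaxBoundLen = 1024;  // the 1024-char cap noted above

  void Insert(const std::string& val) {
    // A truncated prefix is always <= the original value, so it is still a
    // valid lower bound as-is.
    std::string lo = val.substr(0, std::min(val.size(), kMaxBoundLen));
    std::string hi = lo;
    if (val.size() > kMaxBoundLen) {
      // A truncated max must be rounded up so it stays a valid upper bound;
      // bumping the last byte works unless it is 0xff, a case the real
      // filter must handle (this sketch just keeps the full value then).
      if (hi.back() != '\xff') {
        ++hi.back();
      } else {
        hi = val;
      }
    }
    if (!init_ || lo < min_) min_ = lo;
    if (!init_ || hi > max_) max_ = hi;
    init_ = true;
  }

 private:
  bool init_ = false;
  std::string min_, max_;
};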

For now, min-max filters are only applied at the KuduScanner, which
passes them into the Kudu client.

Future work will address applying min-max filters at HDFS scan nodes
and applying bloom filters at Kudu scan nodes.

Functional Testing:
- Added new planner tests and updated the old ones. (In the old tests,
  many runtime filters are renumbered because min-max filters are now
  always generated, even when they don't end up getting assigned, and
  they take up some of the RF ids.)
- Updated existing runtime filter tests to work with Kudu.
- Added e2e tests for min-max filter specific functionality.

Perf Testing:
- All tests run on Kudu stress cluster (10 nodes) and tpch_100_kudu,
  timings are averages of 3 runs.
- Ran a contrived query with a filter that does not eliminate any rows
  (full self join of lineitem). The difference in running time was
  negligible - 24.46s with filters on, 24.15s with filters off for
  a ~1% slowdown.
- Ran a contrived query with a filter that eliminates all rows (self
  join on lineitem with a join condition that never matches). The
  filters resulted in a significant speedup - 0.26s with filters on,
  1.46s with filters off for a ~5.6x speedup. This query is added to
  targeted-perf.

Change-Id: I02bad890f5b5f78388a3041bf38f89369b5e2f1c
Reviewed-on: http://gerrit.cloudera.org:8080/7793
Reviewed-by: Thomas Tauber-Marshall <tm...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2510fe0a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2510fe0a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2510fe0a

Branch: refs/heads/master
Commit: 2510fe0aa0c86f460af9040eb413aad76c13cc84
Parents: 3ddafcd
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Mon Oct 23 07:58:34 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Nov 17 21:33:51 2017 +0000

----------------------------------------------------------------------
 be/src/codegen/gen_ir_descriptions.py           |  11 +-
 be/src/codegen/impala-ir.cc                     |   1 +
 be/src/exec/filter-context.cc                   | 158 ++--
 be/src/exec/filter-context.h                    |  25 +-
 be/src/exec/hdfs-parquet-scanner-ir.cc          |   2 +-
 be/src/exec/hdfs-scan-node-base.cc              |   2 +-
 be/src/exec/kudu-scan-node-base.cc              |   2 +-
 be/src/exec/kudu-scan-node-mt.cc                |   5 +-
 be/src/exec/kudu-scan-node.cc                   |   7 +-
 be/src/exec/kudu-scanner.cc                     |  65 +-
 be/src/exec/kudu-scanner.h                      |   6 +-
 be/src/exec/kudu-util.cc                        |  60 +-
 be/src/exec/kudu-util.h                         |   6 +
 be/src/exec/partitioned-hash-join-builder-ir.cc |   1 +
 be/src/exec/partitioned-hash-join-builder.cc    |  37 +-
 be/src/exec/scan-node.cc                        |  16 +-
 be/src/runtime/coordinator-filter-state.h       |  25 +-
 be/src/runtime/coordinator.cc                   |  91 ++-
 be/src/runtime/fragment-instance-state.cc       |   7 +-
 be/src/runtime/fragment-instance-state.h        |   2 +-
 be/src/runtime/query-state.cc                   |   9 +-
 be/src/runtime/query-state.h                    |   3 +-
 be/src/runtime/runtime-filter-bank.cc           | 102 ++-
 be/src/runtime/runtime-filter-bank.h            |  36 +-
 be/src/runtime/runtime-filter-ir.cc             |   7 +-
 be/src/runtime/runtime-filter.cc                |   4 +-
 be/src/runtime/runtime-filter.h                 |  61 +-
 be/src/runtime/runtime-filter.inline.h          |  35 +-
 be/src/runtime/timestamp-value.h                |  15 +
 be/src/service/impala-internal-service.cc       |   6 +-
 be/src/util/CMakeLists.txt                      |   3 +
 be/src/util/min-max-filter-ir.cc                |  76 ++
 be/src/util/min-max-filter-test.cc              | 364 +++++++++
 be/src/util/min-max-filter.cc                   | 529 ++++++++++++
 be/src/util/min-max-filter.h                    | 231 ++++++
 common/thrift/Data.thrift                       |   1 +
 common/thrift/ImpalaInternalService.thrift      |  23 +-
 common/thrift/ImpalaService.thrift              |   8 +-
 common/thrift/PlanNodes.thrift                  |  13 +
 .../org/apache/impala/planner/HashJoinNode.java |   2 +-
 .../org/apache/impala/planner/HdfsScanNode.java |   2 +-
 .../org/apache/impala/planner/KuduScanNode.java |   4 +
 .../org/apache/impala/planner/PlanNode.java     |  27 +-
 .../impala/planner/RuntimeFilterGenerator.java  |  94 ++-
 .../org/apache/impala/planner/PlannerTest.java  |   7 +
 .../queries/PlannerTest/aggregation.test        |   4 +-
 .../PlannerTest/fk-pk-join-detection.test       |  48 +-
 .../queries/PlannerTest/implicit-joins.test     |   4 +-
 .../queries/PlannerTest/inline-view-limit.test  |  16 +-
 .../queries/PlannerTest/inline-view.test        |  44 +-
 .../queries/PlannerTest/join-order.test         | 188 ++---
 .../queries/PlannerTest/joins.test              |  88 +-
 .../queries/PlannerTest/kudu-delete.test        |   8 +-
 .../queries/PlannerTest/kudu-update.test        |  10 +
 .../queries/PlannerTest/kudu.test               |   2 +
 .../queries/PlannerTest/max-row-size.test       |   8 +-
 .../PlannerTest/min-max-runtime-filters.test    | 142 ++++
 .../queries/PlannerTest/nested-collections.test |  20 +-
 .../queries/PlannerTest/order.test              |   8 +-
 .../queries/PlannerTest/outer-joins.test        |  24 +-
 .../PlannerTest/predicate-propagation.test      |  28 +-
 .../PlannerTest/resource-requirements.test      | 126 +--
 .../PlannerTest/runtime-filter-propagation.test |  96 +--
 .../runtime-filter-query-options.test           |  76 +-
 .../PlannerTest/spillable-buffer-sizing.test    |  32 +-
 .../queries/PlannerTest/subquery-rewrite.test   |  82 +-
 .../queries/PlannerTest/tablesample.test        |   4 +-
 .../queries/PlannerTest/tpcds-all.test          | 800 +++++++++----------
 .../queries/PlannerTest/tpch-all.test           | 444 +++++-----
 .../queries/PlannerTest/tpch-kudu.test          | 107 +++
 .../queries/PlannerTest/tpch-nested.test        |  64 +-
 .../queries/PlannerTest/tpch-views.test         | 148 ++--
 .../queries/PlannerTest/union.test              |   8 +-
 .../queries/PlannerTest/views.test              |  40 +-
 .../queries/PlannerTest/with-clause.test        |  32 +-
 .../queries/QueryTest/bloom_filters.test        | 126 +++
 .../queries/QueryTest/bloom_filters_wait.test   |  22 +
 .../queries/QueryTest/explain-level2.test       |   4 +-
 .../queries/QueryTest/explain-level3.test       |   4 +-
 .../queries/QueryTest/min_max_filters.test      | 121 +++
 .../queries/QueryTest/runtime_filters.test      | 177 ++--
 .../queries/QueryTest/runtime_filters_wait.test |  23 -
 .../primitive_min_max_runtime_filter.test       |   9 +
 tests/common/impala_test_suite.py               |   8 +-
 tests/query_test/test_kudu.py                   |   6 +
 tests/query_test/test_runtime_filters.py        |  95 ++-
 tests/util/test_file_parser.py                  |  17 +
 87 files changed, 3855 insertions(+), 1649 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/codegen/gen_ir_descriptions.py
----------------------------------------------------------------------
diff --git a/be/src/codegen/gen_ir_descriptions.py b/be/src/codegen/gen_ir_descriptions.py
index b3ad25d..1d0f38e 100755
--- a/be/src/codegen/gen_ir_descriptions.py
+++ b/be/src/codegen/gen_ir_descriptions.py
@@ -210,7 +210,16 @@ ir_functions = [
   "_ZN6impala9UnionNode16MaterializeBatchEPNS_8RowBatchEPPh"],
   ["BLOOM_FILTER_INSERT_NO_AVX2", "_ZN6impala11BloomFilter12InsertNoAvx2Ej"],
   ["BLOOM_FILTER_INSERT_AVX2", "_ZN6impala11BloomFilter10InsertAvx2Ej"],
-  ["SELECT_NODE_COPY_ROWS", "_ZN6impala10SelectNode8CopyRowsEPNS_8RowBatchE"]
+  ["SELECT_NODE_COPY_ROWS", "_ZN6impala10SelectNode8CopyRowsEPNS_8RowBatchE"],
+  ["BOOL_MIN_MAX_FILTER_INSERT", "_ZN6impala16BoolMinMaxFilter6InsertEPv"],
+  ["TINYINT_MIN_MAX_FILTER_INSERT", "_ZN6impala19TinyIntMinMaxFilter6InsertEPv"],
+  ["SMALLINT_MIN_MAX_FILTER_INSERT", "_ZN6impala20SmallIntMinMaxFilter6InsertEPv"],
+  ["INT_MIN_MAX_FILTER_INSERT", "_ZN6impala15IntMinMaxFilter6InsertEPv"],
+  ["BIGINT_MIN_MAX_FILTER_INSERT", "_ZN6impala18BigIntMinMaxFilter6InsertEPv"],
+  ["FLOAT_MIN_MAX_FILTER_INSERT", "_ZN6impala17FloatMinMaxFilter6InsertEPv"],
+  ["DOUBLE_MIN_MAX_FILTER_INSERT", "_ZN6impala18DoubleMinMaxFilter6InsertEPv"],
+  ["STRING_MIN_MAX_FILTER_INSERT", "_ZN6impala18StringMinMaxFilter6InsertEPv"],
+  ["TIMESTAMP_MIN_MAX_FILTER_INSERT", "_ZN6impala21TimestampMinMaxFilter6InsertEPv"]
 ]
 
 enums_preamble = '\
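
For readers unfamiliar with Itanium C++ name mangling, the new entries
above register one insert function per filter type; demangled, they
read for example:

  _ZN6impala15IntMinMaxFilter6InsertEPv    ->  impala::IntMinMaxFilter::Insert(void*)
  _ZN6impala18StringMinMaxFilter6InsertEPv ->  impala::StringMinMaxFilter::Insert(void*)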

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/codegen/impala-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/impala-ir.cc b/be/src/codegen/impala-ir.cc
index 2ae10a4..4b79e8b 100644
--- a/be/src/codegen/impala-ir.cc
+++ b/be/src/codegen/impala-ir.cc
@@ -61,6 +61,7 @@
 #include "udf/udf-ir.cc"
 #include "util/bloom-filter-ir.cc"
 #include "util/hash-util-ir.cc"
+#include "util/min-max-filter-ir.cc"
 
 #pragma clang diagnostic pop
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/filter-context.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/filter-context.cc b/be/src/exec/filter-context.cc
index 7f7318e..70618df 100644
--- a/be/src/exec/filter-context.cc
+++ b/be/src/exec/filter-context.cc
@@ -20,6 +20,7 @@
 #include "codegen/codegen-anyval.h"
 #include "runtime/runtime-filter.inline.h"
 #include "runtime/tuple-row.h"
+#include "util/min-max-filter.h"
 #include "util/runtime-profile-counters.h"
 
 using namespace impala;
@@ -77,11 +78,24 @@ bool FilterContext::Eval(TupleRow* row) const noexcept {
 }
 
 void FilterContext::Insert(TupleRow* row) const noexcept {
-  if (local_bloom_filter == NULL) return;
-  void* val = expr_eval->GetValue(row);
-  uint32_t filter_hash = RawValue::GetHashValue(
-      val, expr_eval->root().type(), RuntimeFilterBank::DefaultHashSeed());
-  local_bloom_filter->Insert(filter_hash);
+  if (filter->is_bloom_filter()) {
+    if (local_bloom_filter == nullptr) return;
+    void* val = expr_eval->GetValue(row);
+    uint32_t filter_hash = RawValue::GetHashValue(
+        val, expr_eval->root().type(), RuntimeFilterBank::DefaultHashSeed());
+    local_bloom_filter->Insert(filter_hash);
+  } else {
+    DCHECK(filter->is_min_max_filter());
+    if (local_min_max_filter == nullptr) return;
+    void* val = expr_eval->GetValue(row);
+    local_min_max_filter->Insert(val);
+  }
+}
+
+void FilterContext::MaterializeValues() const {
+  if (filter->is_min_max_filter() && local_min_max_filter != nullptr) {
+    local_min_max_filter->MaterializeValues();
+  }
 }
 
 // An example of the generated code for TPCH-Q2: RF002 -> n_regionkey
@@ -219,17 +233,17 @@ Status FilterContext::CodegenEval(
 //     %"class.std::vector.101" zeroinitializer }
 //
 // define void @FilterContextInsert(%"struct.impala::FilterContext"* %this,
-//     %"class.impala::TupleRow"* %row) #43 {
+//     %"class.impala::TupleRow"* %row) #37 {
 // entry:
 //   %0 = alloca i16
 //   %local_bloom_filter_ptr = getelementptr inbounds %"struct.impala::FilterContext",
 //       %"struct.impala::FilterContext"* %this, i32 0, i32 3
 //   %local_bloom_filter_arg = load %"class.impala::BloomFilter"*,
 //       %"class.impala::BloomFilter"** %local_bloom_filter_ptr
-//   %bloom_is_null = icmp eq %"class.impala::BloomFilter"* %local_bloom_filter_arg, null
-//   br i1 %bloom_is_null, label %bloom_is_null1, label %bloom_not_null
+//   %filter_is_null = icmp eq %"class.impala::BloomFilter"* %local_bloom_filter_arg, null
+//   br i1 %filter_is_null, label %filters_null, label %filters_not_null
 //
-// bloom_not_null:                                   ; preds = %entry
+// filters_not_null:                                 ; preds = %entry
 //   %expr_eval_ptr = getelementptr inbounds %"struct.impala::FilterContext",
 //       %"struct.impala::FilterContext"* %this, i32 0, i32 0
 //   %expr_eval_arg = load %"class.impala::ScalarExprEvaluator"*,
@@ -240,29 +254,29 @@ Status FilterContext::CodegenEval(
 //   %is_null = trunc i32 %result to i1
 //   br i1 %is_null, label %val_is_null, label %val_not_null
 //
-// bloom_is_null1:                                   ; preds = %entry
+// filters_null:                                     ; preds = %entry
 //   ret void
 //
-// val_not_null:                                     ; preds = %bloom_not_null
+// val_not_null:                                     ; preds = %filters_not_null
 //   %1 = ashr i32 %result, 16
 //   %2 = trunc i32 %1 to i16
 //   store i16 %2, i16* %0
 //   %native_ptr = bitcast i16* %0 to i8*
 //   br label %insert_filter
 //
-// val_is_null:                                      ; preds = %bloom_not_null
+// val_is_null:                                      ; preds = %filters_not_null
 //   br label %insert_filter
 //
 // insert_filter:                                    ; preds = %val_not_null, %val_is_null
 //   %val_ptr_phi = phi i8* [ %native_ptr, %val_not_null ], [ null, %val_is_null ]
 //   %hash_value = call i32 @_ZN6impala8RawValue12GetHashValueEPKvRKNS_10ColumnTypeEj(
 //       i8* %val_ptr_phi, %"struct.impala::ColumnType"* @expr_type_arg, i32 1234)
-//   call void @_ZN6impala11BloomFilter9InsertAvxEj(
+//   call void @_ZN6impala11BloomFilter10InsertAvx2Ej(
 //       %"class.impala::BloomFilter"* %local_bloom_filter_arg, i32 %hash_value)
 //   ret void
 // }
-Status FilterContext::CodegenInsert(
-    LlvmCodeGen* codegen, ScalarExpr* filter_expr, llvm::Function** fn) {
+Status FilterContext::CodegenInsert(LlvmCodeGen* codegen, ScalarExpr* filter_expr,
+    FilterContext* ctx, llvm::Function** fn) {
   llvm::LLVMContext& context = codegen->context();
   LlvmBuilder builder(context);
 
@@ -279,23 +293,38 @@ Status FilterContext::CodegenInsert(
   llvm::Value* this_arg = args[0];
   llvm::Value* row_arg = args[1];
 
-  // Load 'local_bloom_filter' from 'this_arg' FilterContext object.
-  llvm::Value* local_bloom_filter_ptr =
-      builder.CreateStructGEP(nullptr, this_arg, 3, "local_bloom_filter_ptr");
-  llvm::Value* local_bloom_filter_arg =
-      builder.CreateLoad(local_bloom_filter_ptr, "local_bloom_filter_arg");
-
-  // Check if 'local_bloom_filter' is NULL and return if so.
-  llvm::Value* bloom_is_null =
-      builder.CreateIsNull(local_bloom_filter_arg, "bloom_is_null");
-  llvm::BasicBlock* bloom_not_null_block =
-      llvm::BasicBlock::Create(context, "bloom_not_null", insert_filter_fn);
-  llvm::BasicBlock* bloom_is_null_block =
-      llvm::BasicBlock::Create(context, "bloom_is_null", insert_filter_fn);
-  builder.CreateCondBr(bloom_is_null, bloom_is_null_block, bloom_not_null_block);
-  builder.SetInsertPoint(bloom_is_null_block);
+  llvm::Value* local_filter_arg;
+  if (ctx->filter->is_bloom_filter()) {
+    // Load 'local_bloom_filter' from 'this_arg' FilterContext object.
+    llvm::Value* local_bloom_filter_ptr =
+        builder.CreateStructGEP(nullptr, this_arg, 3, "local_bloom_filter_ptr");
+    local_filter_arg =
+        builder.CreateLoad(local_bloom_filter_ptr, "local_bloom_filter_arg");
+  } else {
+    DCHECK(ctx->filter->is_min_max_filter());
+    // Load 'local_min_max_filter' from 'this_arg' FilterContext object.
+    llvm::Value* local_min_max_filter_ptr =
+        builder.CreateStructGEP(nullptr, this_arg, 4, "local_min_max_filter_ptr");
+    llvm::PointerType* min_max_filter_type =
+        codegen->GetPtrType(MinMaxFilter::GetLlvmClassName(filter_expr->type().type))
+            ->getPointerTo();
+    local_min_max_filter_ptr = builder.CreatePointerCast(
+        local_min_max_filter_ptr, min_max_filter_type, "cast_min_max_filter_ptr");
+    local_filter_arg =
+        builder.CreateLoad(local_min_max_filter_ptr, "local_min_max_filter_arg");
+  }
+
+  // Check if 'local_bloom_filter' or 'local_min_max_filter' are NULL (depending on
+  // filter desc) and return if so.
+  llvm::Value* filter_null = builder.CreateIsNull(local_filter_arg, "filter_is_null");
+  llvm::BasicBlock* filter_not_null_block =
+      llvm::BasicBlock::Create(context, "filters_not_null", insert_filter_fn);
+  llvm::BasicBlock* filter_null_block =
+      llvm::BasicBlock::Create(context, "filters_null", insert_filter_fn);
+  builder.CreateCondBr(filter_null, filter_null_block, filter_not_null_block);
+  builder.SetInsertPoint(filter_null_block);
   builder.CreateRetVoid();
-  builder.SetInsertPoint(bloom_not_null_block);
+  builder.SetInsertPoint(filter_not_null_block);
 
   llvm::BasicBlock* val_not_null_block =
       llvm::BasicBlock::Create(context, "val_not_null", insert_filter_fn);
@@ -327,47 +356,60 @@ Status FilterContext::CodegenInsert(
   llvm::Value* null_ptr = codegen->null_ptr_value();
   builder.CreateBr(insert_filter_block);
 
-  // Saves 'result' on the stack and passes a pointer to it to 'insert_bloom_filter_fn'.
+  // Saves 'result' on the stack and passes a pointer to it to Insert().
   builder.SetInsertPoint(val_not_null_block);
   llvm::Value* native_ptr = result.ToNativePtr();
   native_ptr = builder.CreatePointerCast(native_ptr, codegen->ptr_type(), "native_ptr");
   builder.CreateBr(insert_filter_block);
 
-  // Get the arguments in place to call 'get_hash_value_fn'.
+  // Get the arguments in place to call Insert().
   builder.SetInsertPoint(insert_filter_block);
   llvm::PHINode* val_ptr_phi = builder.CreatePHI(codegen->ptr_type(), 2, "val_ptr_phi");
   val_ptr_phi->addIncoming(native_ptr, val_not_null_block);
   val_ptr_phi->addIncoming(null_ptr, val_is_null_block);
 
-  // Create a global constant of the filter expression's ColumnType. It needs to be a
-  // constant for constant propagation and dead code elimination in 'get_hash_value_fn'.
-  llvm::Type* col_type = codegen->GetType(ColumnType::LLVM_CLASS_NAME);
-  llvm::Constant* expr_type_arg = codegen->ConstantToGVPtr(
-      col_type, filter_expr->type().ToIR(codegen), "expr_type_arg");
+  // Insert into the bloom filter.
+  if (ctx->filter->is_bloom_filter()) {
+    // Create a global constant of the filter expression's ColumnType. It needs to be a
+    // constant for constant propagation and dead code elimination in 'get_hash_value_fn'.
+    llvm::Type* col_type = codegen->GetType(ColumnType::LLVM_CLASS_NAME);
+    llvm::Constant* expr_type_arg = codegen->ConstantToGVPtr(
+        col_type, filter_expr->type().ToIR(codegen), "expr_type_arg");
+
+    // Call RawValue::GetHashValue() on the result of the filter's expression.
+    llvm::Value* seed_arg =
+        codegen->GetIntConstant(TYPE_INT, RuntimeFilterBank::DefaultHashSeed());
+    llvm::Value* get_hash_value_args[] = {val_ptr_phi, expr_type_arg, seed_arg};
+    llvm::Function* get_hash_value_fn =
+        codegen->GetFunction(IRFunction::RAW_VALUE_GET_HASH_VALUE, false);
+    DCHECK(get_hash_value_fn != nullptr);
+    llvm::Value* hash_value =
+        builder.CreateCall(get_hash_value_fn, get_hash_value_args, "hash_value");
+
+    // Call Insert() on the bloom filter.
+    llvm::Function* insert_bloom_filter_fn;
+    if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+      insert_bloom_filter_fn =
+          codegen->GetFunction(IRFunction::BLOOM_FILTER_INSERT_AVX2, false);
+    } else {
+      insert_bloom_filter_fn =
+          codegen->GetFunction(IRFunction::BLOOM_FILTER_INSERT_NO_AVX2, false);
+    }
+    DCHECK(insert_bloom_filter_fn != nullptr);
 
-  // Call RawValue::GetHashValue() on the result of the filter's expression.
-  llvm::Value* seed_arg =
-      codegen->GetIntConstant(TYPE_INT, RuntimeFilterBank::DefaultHashSeed());
-  llvm::Value* get_hash_value_args[] = {val_ptr_phi, expr_type_arg, seed_arg};
-  llvm::Function* get_hash_value_fn =
-      codegen->GetFunction(IRFunction::RAW_VALUE_GET_HASH_VALUE, false);
-  DCHECK(get_hash_value_fn != nullptr);
-  llvm::Value* hash_value =
-      builder.CreateCall(get_hash_value_fn, get_hash_value_args, "hash_value");
-
-  // Call Insert() on the bloom filter.
-  llvm::Value* insert_args[] = {local_bloom_filter_arg, hash_value};
-  llvm::Function* insert_bloom_filter_fn;
-  if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
-    insert_bloom_filter_fn =
-        codegen->GetFunction(IRFunction::BLOOM_FILTER_INSERT_AVX2, false);
+    llvm::Value* insert_args[] = {local_filter_arg, hash_value};
+    builder.CreateCall(insert_bloom_filter_fn, insert_args);
   } else {
-    insert_bloom_filter_fn =
-        codegen->GetFunction(IRFunction::BLOOM_FILTER_INSERT_NO_AVX2, false);
+    DCHECK(ctx->filter->is_min_max_filter());
+    // The function for inserting into the min-max filter.
+    llvm::Function* min_max_insert_fn = codegen->GetFunction(
+        MinMaxFilter::GetInsertIRFunctionType(filter_expr->type().type), false);
+    DCHECK(min_max_insert_fn != nullptr);
+
+    llvm::Value* insert_filter_args[] = {local_filter_arg, val_ptr_phi};
+    builder.CreateCall(min_max_insert_fn, insert_filter_args);
   }
 
-  DCHECK(insert_bloom_filter_fn != nullptr);
-  builder.CreateCall(insert_bloom_filter_fn, insert_args);
   builder.CreateRetVoid();
 
   *fn = codegen->FinalizeFunction(insert_filter_fn);
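
Conceptually, CodegenInsert() now specializes the insert path per
filter at plan time, so the per-row code contains no branch on the
filter kind. A rough hand-written C++ analogy (these types are
stand-ins for illustration, not Impala's real classes):

#include <cstdint>

struct BloomSketch  { void Insert(uint32_t /*hash*/) { /* set bits */ } };
struct MinMaxSketch { void Insert(const void* /*val*/) { /* update bounds */ } };

// The interpreted Insert() branches on the filter kind for every row; the
// codegen'd version is equivalent to instantiating one specialization per
// filter up front, so the hot loop carries no such branch.
template <typename FilterT, typename ArgT>
inline void InsertRow(FilterT* filter, ArgT arg) { filter->Insert(arg); }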

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/filter-context.h
----------------------------------------------------------------------
diff --git a/be/src/exec/filter-context.h b/be/src/exec/filter-context.h
index 0806fd3..e740b80 100644
--- a/be/src/exec/filter-context.h
+++ b/be/src/exec/filter-context.h
@@ -21,13 +21,14 @@
 
 #include <boost/unordered_map.hpp>
 #include "exprs/scalar-expr-evaluator.h"
+#include "runtime/runtime-filter.h"
 #include "util/runtime-profile.h"
 
 namespace impala {
 
 class BloomFilter;
 class LlvmCodeGen;
-class RuntimeFilter;
+class MinMaxFilter;
 class ScalarExpr;
 class TupleRow;
 
@@ -94,6 +95,9 @@ struct FilterContext {
   /// Working copy of local bloom filter
   BloomFilter* local_bloom_filter = nullptr;
 
+  /// Working copy of local min-max filter
+  MinMaxFilter* local_min_max_filter = nullptr;
+
   /// Struct name in LLVM IR.
   static const char* LLVM_CLASS_NAME;
 
@@ -107,10 +111,15 @@ struct FilterContext {
   /// a match in 'filter'. Returns false otherwise.
   bool Eval(TupleRow* row) const noexcept;
 
-  /// Evaluates 'row' with 'expr_eval' and hashes the resulting value.
-  /// The hash value is then used for setting some bits in 'local_bloom_filter'.
+  /// Evaluates 'row' with 'expr_eval' and inserts the value into 'local_bloom_filter'
+  /// or 'local_min_max_filter' as appropriate.
   void Insert(TupleRow* row) const noexcept;
 
+  /// Materialize filter values by copying any values stored by filters into memory owned
+  /// by the filter. Filters may assume that the memory for Insert()-ed values stays valid
+  /// until this is called.
+  void MaterializeValues() const;
+
   /// Codegen Eval() by codegen'ing the expression 'filter_expr' and replacing the type
   /// argument to RuntimeFilter::Eval() with a constant. On success, 'fn' is set to
   /// the generated function. On failure, an error status is returned.
@@ -119,10 +128,14 @@ struct FilterContext {
 
   /// Codegen Insert() by codegen'ing the expression 'filter_expr', replacing the type
   /// argument to RawValue::GetHashValue() with a constant, and calling into the correct
-  /// version of BloomFilter::Insert(), depending on the presence of AVX.  On success,
-  /// 'fn' is set to the generated function. On failure, an error status is returned.
+  /// version of BloomFilter::Insert() or MinMaxFilter::Insert(), depending on the filter
+  /// desc and if 'local_bloom_filter' or 'local_min_max_filter' are null.
+  /// For bloom filters, it also selects the correct Insert() based on the presence of
+  /// AVX, and for min-max filters it selects the correct Insert() based on type.
+  /// On success, 'fn' is set to the generated function. On failure, an error status is
+  /// returned.
   static Status CodegenInsert(LlvmCodeGen* codegen, ScalarExpr* filter_expr,
-      llvm::Function** fn) WARN_UNUSED_RESULT;
+      FilterContext* ctx, llvm::Function** fn) WARN_UNUSED_RESULT;
 
   // Returns if there is any always_false filter in ctxs. If there is, the counter stats
   // is updated.
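
A minimal sketch of the MaterializeValues() contract above (invented
types, not the real StringMinMaxFilter): Insert() may borrow a pointer
into the row batch, and MaterializeValues() must copy the value into
filter-owned memory before the batch is recycled.

#include <string>

struct StringRef { const char* ptr; int len; };

class SketchFilter {
 public:
  void Insert(const StringRef* v) { pending_ = v; }  // borrow only
  void MaterializeValues() {
    if (pending_ != nullptr) owned_.assign(pending_->ptr, pending_->len);
    pending_ = nullptr;  // the borrowed memory may now be freed safely
  }
 private:
  const StringRef* pending_ = nullptr;
  std::string owned_;
};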

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/hdfs-parquet-scanner-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner-ir.cc b/be/src/exec/hdfs-parquet-scanner-ir.cc
index c1574d1..f2355d8 100644
--- a/be/src/exec/hdfs-parquet-scanner-ir.cc
+++ b/be/src/exec/hdfs-parquet-scanner-ir.cc
@@ -70,7 +70,7 @@ bool HdfsParquetScanner::EvalRuntimeFilter(int i, TupleRow* row) {
   LocalFilterStats* stats = &filter_stats_[i];
   const FilterContext* ctx = filter_ctxs_[i];
   ++stats->total_possible;
-  if (stats->enabled && ctx->filter->HasBloomFilter()) {
+  if (stats->enabled && ctx->filter->HasFilter()) {
     ++stats->considered;
     if (!ctx->Eval(row)) {
       ++stats->rejected;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/hdfs-scan-node-base.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node-base.cc b/be/src/exec/hdfs-scan-node-base.cc
index 8ec76e0..9149097 100644
--- a/be/src/exec/hdfs-scan-node-base.cc
+++ b/be/src/exec/hdfs-scan-node-base.cc
@@ -661,7 +661,7 @@ bool HdfsScanNodeBase::PartitionPassesFilters(int32_t partition_id,
       continue;
     }
 
-    bool has_filter = ctx.filter->HasBloomFilter();
+    bool has_filter = ctx.filter->HasFilter();
     bool passed_filter = !has_filter || ctx.Eval(tuple_row_mem);
     ctx.stats->IncrCounters(stats_name, 1, has_filter, !passed_filter);
     if (!passed_filter) return false;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/kudu-scan-node-base.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scan-node-base.cc b/be/src/exec/kudu-scan-node-base.cc
index feb0af7..0e7cdfa 100644
--- a/be/src/exec/kudu-scan-node-base.cc
+++ b/be/src/exec/kudu-scan-node-base.cc
@@ -84,7 +84,7 @@ Status KuduScanNodeBase::Prepare(RuntimeState* state) {
 }
 
 Status KuduScanNodeBase::Open(RuntimeState* state) {
-  RETURN_IF_ERROR(ExecNode::Open(state));
+  RETURN_IF_ERROR(ScanNode::Open(state));
   RETURN_IF_CANCELLED(state);
   RETURN_IF_ERROR(QueryMaintenance(state));
   SCOPED_TIMER(runtime_profile_->total_time_counter());

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/kudu-scan-node-mt.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scan-node-mt.cc b/be/src/exec/kudu-scan-node-mt.cc
index 2cb7619..22f00e7 100644
--- a/be/src/exec/kudu-scan-node-mt.cc
+++ b/be/src/exec/kudu-scan-node-mt.cc
@@ -59,7 +59,8 @@ Status KuduScanNodeMt::GetNext(RuntimeState* state, RowBatch* row_batch, bool* e
   RETURN_IF_ERROR(QueryMaintenance(state));
   *eos = false;
 
-  if (scan_token_ == nullptr) {
+  bool scan_token_eos = scan_token_ == nullptr;
+  while (scan_token_eos) {
     scan_token_ = GetNextScanToken();
     if (scan_token_ == nullptr) {
       runtime_profile_->StopPeriodicCounters();
@@ -68,7 +69,7 @@ Status KuduScanNodeMt::GetNext(RuntimeState* state, RowBatch* row_batch, bool* e
       *eos = true;
       return Status::OK();
     }
-    RETURN_IF_ERROR(scanner_->OpenNextScanToken(*scan_token_));
+    RETURN_IF_ERROR(scanner_->OpenNextScanToken(*scan_token_, &scan_token_eos));
   }
 
   bool scanner_eos = false;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/kudu-scan-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scan-node.cc b/be/src/exec/kudu-scan-node.cc
index 7f18710..77fac89 100644
--- a/be/src/exec/kudu-scan-node.cc
+++ b/be/src/exec/kudu-scan-node.cc
@@ -75,6 +75,8 @@ Status KuduScanNode::Open(RuntimeState* state) {
         state->query_options().num_scanner_threads);
   }
 
+  if (filter_ctxs_.size() > 0) WaitForRuntimeFilters();
+
   thread_avail_cb_id_ = state->resource_pool()->AddThreadAvailableCb(
       bind<void>(mem_fn(&KuduScanNode::ThreadAvailableCb), this, _1));
   ThreadAvailableCb(state->resource_pool());
@@ -179,8 +181,9 @@ void KuduScanNode::ThreadAvailableCb(ThreadResourceMgr::ResourcePool* pool) {
 }
 
 Status KuduScanNode::ProcessScanToken(KuduScanner* scanner, const string& scan_token) {
-  RETURN_IF_ERROR(scanner->OpenNextScanToken(scan_token));
-  bool eos = false;
+  bool eos;
+  RETURN_IF_ERROR(scanner->OpenNextScanToken(scan_token, &eos));
+  if (eos) return Status::OK();
   while (!eos && !done_) {
     unique_ptr<RowBatch> row_batch = std::make_unique<RowBatch>(row_desc(),
         runtime_state_->batch_size(), mem_tracker());

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/kudu-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scanner.cc b/be/src/exec/kudu-scanner.cc
index deb9c84..7db8878 100644
--- a/be/src/exec/kudu-scanner.cc
+++ b/be/src/exec/kudu-scanner.cc
@@ -18,6 +18,7 @@
 #include "exec/kudu-scanner.h"
 
 #include <kudu/client/row_result.h>
+#include <kudu/client/value.h>
 #include <thrift/protocol/TDebugProtocol.h>
 #include <vector>
 #include <string>
@@ -25,9 +26,11 @@
 #include "exec/kudu-util.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
+#include "exprs/slot-ref.h"
 #include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/raw-value.h"
+#include "runtime/runtime-filter.h"
 #include "runtime/runtime-state.h"
 #include "runtime/row-batch.h"
 #include "runtime/string-value.h"
@@ -36,15 +39,18 @@
 #include "gutil/gscoped_ptr.h"
 #include "gutil/strings/substitute.h"
 #include "util/jni-util.h"
+#include "util/min-max-filter.h"
 #include "util/periodic-counter-updater.h"
 #include "util/runtime-profile-counters.h"
 
 #include "common/names.h"
 
 using kudu::client::KuduClient;
+using kudu::client::KuduPredicate;
 using kudu::client::KuduScanBatch;
 using kudu::client::KuduSchema;
 using kudu::client::KuduTable;
+using kudu::client::KuduValue;
 
 DEFINE_string(kudu_read_mode, "READ_LATEST", "(Advanced) Sets the Kudu scan ReadMode. "
     "Supported Kudu read modes are READ_LATEST and READ_AT_SNAPSHOT.");
@@ -136,7 +142,7 @@ void KuduScanner::Close() {
   expr_results_pool_->FreeAll();
 }
 
-Status KuduScanner::OpenNextScanToken(const string& scan_token)  {
+Status KuduScanner::OpenNextScanToken(const string& scan_token, bool* eos) {
   DCHECK(scanner_ == NULL);
   kudu::client::KuduScanner* scanner;
   KUDU_RETURN_IF_ERROR(kudu::client::KuduScanToken::DeserializeIntoScanner(
@@ -164,10 +170,67 @@ Status KuduScanner::OpenNextScanToken(const string& scan_token)  {
     scanner_->SetRowFormatFlags(row_format_flags);
   }
 
+  if (scan_node_->filter_ctxs_.size() > 0) {
+    for (const FilterContext& ctx : scan_node_->filter_ctxs_) {
+      MinMaxFilter* filter = ctx.filter->get_min_max();
+      if (filter != nullptr && !filter->AlwaysTrue()) {
+        if (filter->AlwaysFalse()) {
+          // We can skip this entire scan.
+          CloseCurrentClientScanner();
+          *eos = true;
+          return Status::OK();
+        } else {
+          auto it = ctx.filter->filter_desc().planid_to_target_ndx.find(scan_node_->id());
+          const TRuntimeFilterTargetDesc& target_desc =
+              ctx.filter->filter_desc().targets[it->second];
+          const string& col_name = target_desc.kudu_col_name;
+          DCHECK(col_name != "");
+          ColumnType col_type = ColumnType::FromThrift(target_desc.kudu_col_type);
+
+          void* min = filter->GetMin();
+          void* max = filter->GetMax();
+          // If the type of the filter is not the same as the type of the target column,
+          // there must be an implicit integer cast and we need to ensure the min/max we
+          // pass to Kudu are within the range of the target column.
+          int64_t int_min;
+          int64_t int_max;
+          if (col_type.type != filter->type()) {
+            DCHECK(col_type.IsIntegerType());
+
+            if (!filter->GetCastIntMinMax(col_type, &int_min, &int_max)) {
+              // The min/max for this filter is outside the range for the target column,
+              // so all rows are filtered out and we can skip the scan.
+              CloseCurrentClientScanner();
+              *eos = true;
+              return Status::OK();
+            }
+            min = &int_min;
+            max = &int_max;
+          }
+
+          KuduValue* min_value;
+          RETURN_IF_ERROR(CreateKuduValue(filter->type(), min, &min_value));
+          KUDU_RETURN_IF_ERROR(
+              scanner_->AddConjunctPredicate(scan_node_->table_->NewComparisonPredicate(
+                  col_name, KuduPredicate::ComparisonOp::GREATER_EQUAL, min_value)),
+              "Failed to add min predicate");
+
+          KuduValue* max_value;
+          RETURN_IF_ERROR(CreateKuduValue(filter->type(), max, &max_value));
+          KUDU_RETURN_IF_ERROR(
+              scanner_->AddConjunctPredicate(scan_node_->table_->NewComparisonPredicate(
+                  col_name, KuduPredicate::ComparisonOp::LESS_EQUAL, max_value)),
+              "Failed to add max predicate");
+        }
+      }
+    }
+  }
+
   {
     SCOPED_TIMER(state_->total_storage_wait_timer());
     KUDU_RETURN_IF_ERROR(scanner_->Open(), "Unable to open scanner");
   }
+  *eos = false;
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/kudu-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-scanner.h b/be/src/exec/kudu-scanner.h
index 125881c..e6d4ca9 100644
--- a/be/src/exec/kudu-scanner.h
+++ b/be/src/exec/kudu-scanner.h
@@ -43,8 +43,10 @@ class KuduScanner {
   /// Does not actually open a kudu::client::KuduScanner.
   Status Open();
 
-  /// Opens a new kudu::client::KuduScanner using 'scan_token'.
-  Status OpenNextScanToken(const std::string& scan_token);
+  /// Opens a new kudu::client::KuduScanner using 'scan_token'. If there are no rows to
+  /// scan (e.g. because there is a runtime filter that rejects all rows), 'eos' will
+  /// be set to true; otherwise, if the return status is OK, it will be false.
+  Status OpenNextScanToken(const std::string& scan_token, bool* eos);
 
   /// Fetches the next batch from the current kudu::client::KuduScanner.
   Status GetNext(RowBatch* row_batch, bool* eos);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/kudu-util.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-util.cc b/be/src/exec/kudu-util.cc
index 320a77d..03cb51f 100644
--- a/be/src/exec/kudu-util.cc
+++ b/be/src/exec/kudu-util.cc
@@ -35,6 +35,7 @@ using kudu::client::KuduSchema;
 using kudu::client::KuduClient;
 using kudu::client::KuduClientBuilder;
 using kudu::client::KuduColumnSchema;
+using kudu::client::KuduValue;
 using DataType = kudu::client::KuduColumnSchema::DataType;
 
 DECLARE_bool(disable_kudu);
@@ -111,17 +112,14 @@ void InitKuduLogging() {
   kudu::client::SetVerboseLogLevel(std::max(0, FLAGS_v - 1));
 }
 
-Status WriteKuduTimestampValue(int col, const TimestampValue* tv,
-    kudu::KuduPartialRow* row) {
-  int64_t ts_micros;
-  bool success = tv->UtcToUnixTimeMicros(&ts_micros);
+// Converts a TimestampValue to Kudu's representation which is returned in 'ts_micros'.
+static Status ConvertTimestampValue(const TimestampValue* tv, int64_t* ts_micros) {
+  bool success = tv->UtcToUnixTimeMicros(ts_micros);
   DCHECK(success); // If the value was invalid the slot should've been null.
   if (UNLIKELY(!success)) {
     return Status(TErrorCode::RUNTIME_ERROR,
         "Invalid TimestampValue: " + tv->ToString());
   }
-  KUDU_RETURN_IF_ERROR(row->SetUnixTimeMicros(col, ts_micros),
-      "Could not add Kudu WriteOp.");
   return Status::OK();
 }
 
@@ -170,8 +168,11 @@ Status WriteKuduValue(int col, PrimitiveType type, const void* value,
           "Could not set Kudu row value.");
       break;
     case TYPE_TIMESTAMP:
-      RETURN_IF_ERROR(WriteKuduTimestampValue(col,
-          reinterpret_cast<const TimestampValue*>(value), row));
+      int64_t ts_micros;
+      RETURN_IF_ERROR(ConvertTimestampValue(
+          reinterpret_cast<const TimestampValue*>(value), &ts_micros));
+      KUDU_RETURN_IF_ERROR(
+          row->SetUnixTimeMicros(col, ts_micros), "Could not add Kudu WriteOp.");
       break;
     default:
       return Status(TErrorCode::IMPALA_KUDU_TYPE_MISSING, TypeToString(type));
@@ -196,4 +197,47 @@ ColumnType KuduDataTypeToColumnType(DataType type) {
   return ColumnType(PrimitiveType::INVALID_TYPE);
 }
 
+Status CreateKuduValue(PrimitiveType type, void* value, KuduValue** out) {
+  switch (type) {
+    case TYPE_VARCHAR:
+    case TYPE_STRING: {
+      const StringValue* sv = reinterpret_cast<const StringValue*>(value);
+      kudu::Slice slice(reinterpret_cast<uint8_t*>(sv->ptr), sv->len);
+      *out = KuduValue::CopyString(slice);
+      break;
+    }
+    case TYPE_FLOAT:
+      *out = KuduValue::FromFloat(*reinterpret_cast<const float*>(value));
+      break;
+    case TYPE_DOUBLE:
+      *out = KuduValue::FromDouble(*reinterpret_cast<const double*>(value));
+      break;
+    case TYPE_BOOLEAN:
+      *out = KuduValue::FromBool(*reinterpret_cast<const bool*>(value));
+      break;
+    case TYPE_TINYINT:
+      *out = KuduValue::FromInt(*reinterpret_cast<const int8_t*>(value));
+      break;
+    case TYPE_SMALLINT:
+      *out = KuduValue::FromInt(*reinterpret_cast<const int16_t*>(value));
+      break;
+    case TYPE_INT:
+      *out = KuduValue::FromInt(*reinterpret_cast<const int32_t*>(value));
+      break;
+    case TYPE_BIGINT:
+      *out = KuduValue::FromInt(*reinterpret_cast<const int64_t*>(value));
+      break;
+    case TYPE_TIMESTAMP: {
+      int64_t ts_micros;
+      RETURN_IF_ERROR(ConvertTimestampValue(
+          reinterpret_cast<const TimestampValue*>(value), &ts_micros));
+      *out = KuduValue::FromInt(ts_micros);
+      break;
+    }
+    default:
+      return Status(TErrorCode::IMPALA_KUDU_TYPE_MISSING, TypeToString(type));
+  }
+  return Status::OK();
+}
+
 }  // namespace impala

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/kudu-util.h
----------------------------------------------------------------------
diff --git a/be/src/exec/kudu-util.h b/be/src/exec/kudu-util.h
index 11cf16a..5fd1140 100644
--- a/be/src/exec/kudu-util.h
+++ b/be/src/exec/kudu-util.h
@@ -23,6 +23,7 @@ struct tm;
 
 #include <kudu/client/callbacks.h>
 #include <kudu/client/client.h>
+#include <kudu/client/value.h>
 
 #include "common/status.h"
 #include "runtime/string-value.h"
@@ -84,6 +85,11 @@ void LogKuduMessage(kudu::client::KuduLogSeverity severity, const char* filename
 Status WriteKuduValue(int col, PrimitiveType type, const void* value,
     bool copy_strings, kudu::KuduPartialRow* row) WARN_UNUSED_RESULT;
 
+/// Casts 'value' according to 'type' and creates a new KuduValue containing 'value',
+/// which is returned in 'out'.
+Status CreateKuduValue(
+    PrimitiveType type, void* value, kudu::client::KuduValue** out) WARN_UNUSED_RESULT;
+
 /// Takes a Kudu client DataType and returns the corresponding Impala ColumnType.
 ColumnType KuduDataTypeToColumnType(kudu::client::KuduColumnSchema::DataType type);
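
A usage sketch for CreateKuduValue(), following the pattern in the
kudu-scanner.cc change above. The scanner, table, and column name here
are assumptions for illustration:

Status AddMinPredicateSketch(kudu::client::KuduScanner* scanner,
    kudu::client::KuduTable* table) {
  int64_t min_val = 42;  // e.g. the lower bound from a min-max filter
  kudu::client::KuduValue* min_value;
  RETURN_IF_ERROR(CreateKuduValue(TYPE_BIGINT, &min_val, &min_value));
  KUDU_RETURN_IF_ERROR(
      scanner->AddConjunctPredicate(table->NewComparisonPredicate(
          "l_orderkey", kudu::client::KuduPredicate::ComparisonOp::GREATER_EQUAL,
          min_value)),
      "Failed to add min predicate");
  return Status::OK();
}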
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/partitioned-hash-join-builder-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder-ir.cc b/be/src/exec/partitioned-hash-join-builder-ir.cc
index 8481212..8d8e42d 100644
--- a/be/src/exec/partitioned-hash-join-builder-ir.cc
+++ b/be/src/exec/partitioned-hash-join-builder-ir.cc
@@ -66,6 +66,7 @@ Status PhjBuilder::ProcessBuildBatch(
       return status;
     }
   }
+  for (const FilterContext& ctx : filter_ctxs_) ctx.MaterializeValues();
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/partitioned-hash-join-builder.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder.cc b/be/src/exec/partitioned-hash-join-builder.cc
index 584b42d..a86b87c 100644
--- a/be/src/exec/partitioned-hash-join-builder.cc
+++ b/be/src/exec/partitioned-hash-join-builder.cc
@@ -34,6 +34,7 @@
 #include "runtime/runtime-filter.h"
 #include "runtime/runtime-state.h"
 #include "util/bloom-filter.h"
+#include "util/min-max-filter.h"
 #include "util/runtime-profile-counters.h"
 
 #include "gen-cpp/PlanNodes_types.h"
@@ -469,9 +470,16 @@ void PhjBuilder::AllocateRuntimeFilters() {
       << "Runtime filters not supported with NULL_AWARE_LEFT_ANTI_JOIN";
   DCHECK(ht_ctx_ != NULL);
   for (int i = 0; i < filter_ctxs_.size(); ++i) {
-    filter_ctxs_[i].local_bloom_filter =
-        runtime_state_->filter_bank()->AllocateScratchBloomFilter(
-            filter_ctxs_[i].filter->id());
+    if (filter_ctxs_[i].filter->is_bloom_filter()) {
+      filter_ctxs_[i].local_bloom_filter =
+          runtime_state_->filter_bank()->AllocateScratchBloomFilter(
+              filter_ctxs_[i].filter->id());
+    } else {
+      DCHECK(filter_ctxs_[i].filter->is_min_max_filter());
+      filter_ctxs_[i].local_min_max_filter =
+          runtime_state_->filter_bank()->AllocateScratchMinMaxFilter(
+              filter_ctxs_[i].filter->id(), filter_ctxs_[i].expr_eval->root().type());
+    }
   }
 }
 
@@ -491,12 +499,22 @@ void PhjBuilder::PublishRuntimeFilters(int64_t num_build_rows) {
   for (const FilterContext& ctx : filter_ctxs_) {
     // TODO: Consider checking actual number of bits set in filter to compute FP rate.
     // TODO: Consider checking this every few batches or so.
-    bool fp_rate_too_high = runtime_state_->filter_bank()->FpRateTooHigh(
-        ctx.filter->filter_size(), num_build_rows);
-    runtime_state_->filter_bank()->UpdateFilterFromLocal(ctx.filter->id(),
-        fp_rate_too_high ? BloomFilter::ALWAYS_TRUE_FILTER : ctx.local_bloom_filter);
+    BloomFilter* bloom_filter = nullptr;
+    if (ctx.local_bloom_filter != nullptr) {
+      if (runtime_state_->filter_bank()->FpRateTooHigh(
+              ctx.filter->filter_size(), num_build_rows)) {
+        bloom_filter = BloomFilter::ALWAYS_TRUE_FILTER;
+      } else {
+        bloom_filter = ctx.local_bloom_filter;
+        ++num_enabled_filters;
+      }
+    } else if (ctx.local_min_max_filter != nullptr
+        && !ctx.local_min_max_filter->AlwaysTrue()) {
+      ++num_enabled_filters;
+    }
 
-    num_enabled_filters += !fp_rate_too_high;
+    runtime_state_->filter_bank()->UpdateFilterFromLocal(
+        ctx.filter->id(), bloom_filter, ctx.local_min_max_filter);
   }
 
   if (filter_ctxs_.size() > 0) {
@@ -959,7 +977,8 @@ Status PhjBuilder::CodegenInsertRuntimeFilters(
   int num_filters = filter_exprs.size();
   for (int i = 0; i < num_filters; ++i) {
     llvm::Function* insert_fn;
-    RETURN_IF_ERROR(FilterContext::CodegenInsert(codegen, filter_exprs_[i], &insert_fn));
+    RETURN_IF_ERROR(FilterContext::CodegenInsert(
+        codegen, filter_exprs_[i], &filter_ctxs_[i], &insert_fn));
     llvm::PointerType* filter_context_type =
         codegen->GetPtrType(FilterContext::LLVM_CLASS_NAME);
     llvm::Value* filter_context_ptr =

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/exec/scan-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/scan-node.cc b/be/src/exec/scan-node.cc
index 18fc473..27726f8 100644
--- a/be/src/exec/scan-node.cc
+++ b/be/src/exec/scan-node.cc
@@ -75,12 +75,16 @@ Status ScanNode::Init(const TPlanNode& tnode, RuntimeState* state) {
     filter_ctxs_.emplace_back();
     FilterContext& filter_ctx = filter_ctxs_.back();
     filter_ctx.filter = state->filter_bank()->RegisterFilter(filter_desc, false);
-    string filter_profile_title = Substitute("Filter $0 ($1)", filter_desc.filter_id,
-        PrettyPrinter::Print(filter_ctx.filter->filter_size(), TUnit::BYTES));
-    RuntimeProfile* profile =
-        RuntimeProfile::Create(state->obj_pool(), filter_profile_title);
-    runtime_profile_->AddChild(profile);
-    filter_ctx.stats = state->obj_pool()->Add(new FilterStats(profile));
+    // TODO: Enable stats for min-max filters when Kudu exposes info about filters
+    // (KUDU-2162).
+    if (filter_ctx.filter->is_bloom_filter()) {
+      string filter_profile_title = Substitute("Filter $0 ($1)", filter_desc.filter_id,
+          PrettyPrinter::Print(filter_ctx.filter->filter_size(), TUnit::BYTES));
+      RuntimeProfile* profile =
+          RuntimeProfile::Create(state->obj_pool(), filter_profile_title);
+      runtime_profile_->AddChild(profile);
+      filter_ctx.stats = state->obj_pool()->Add(new FilterStats(profile));
+    }
   }
 
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/coordinator-filter-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator-filter-state.h b/be/src/runtime/coordinator-filter-state.h
index 08944b8..2cb0602 100644
--- a/be/src/runtime/coordinator-filter-state.h
+++ b/be/src/runtime/coordinator-filter-state.h
@@ -43,13 +43,14 @@ struct Coordinator::FilterTarget {
       fragment_idx(f_idx) {}
 };
 
-/// State of filters that are received for aggregation.
+/// State of runtime filters that are received for aggregation. A runtime filter will
+/// contain a bloom or min-max filter.
 ///
 /// A broadcast join filter is published as soon as the first update is received for it
 /// and subsequent updates are ignored (as they will be the same).
-/// Updates for a partitioned join filter are aggregated in 'bloom_filter' and this is
-/// published once 'pending_count' reaches 0 and if the filter was not disabled before
-/// that.
+/// Updates for a partitioned join filter are aggregated and then published once
+/// 'pending_count' reaches 0 and if the filter was not disabled before that.
+///
 ///
 /// A filter is disabled if an always_true filter update is received, an OOM is hit,
 /// filter aggregation is complete or if the query is complete.
@@ -61,9 +62,11 @@ class Coordinator::FilterState {
       completion_time_(0L) {
     // bloom_filter_ is a disjunction so the unit value is always_false.
     bloom_filter_.always_false = true;
+    min_max_filter_.always_false = true;
   }
 
   TBloomFilter& bloom_filter() { return bloom_filter_; }
+  TMinMaxFilter& min_max_filter() { return min_max_filter_; }
   boost::unordered_set<int>* src_fragment_instance_idxs() {
     return &src_fragment_instance_idxs_;
   }
@@ -76,9 +79,18 @@ class Coordinator::FilterState {
   int64_t completion_time() const { return completion_time_; }
   const TPlanNodeId& src() const { return src_; }
   const TRuntimeFilterDesc& desc() const { return desc_; }
+  bool is_bloom_filter() const { return desc_.type == TRuntimeFilterType::BLOOM; }
+  bool is_min_max_filter() const { return desc_.type == TRuntimeFilterType::MIN_MAX; }
   int pending_count() const { return pending_count_; }
   void set_pending_count(int pending_count) { pending_count_ = pending_count; }
-  bool disabled() const { return bloom_filter_.always_true; }
+  bool disabled() const {
+    if (is_bloom_filter()) {
+      return bloom_filter_.always_true;
+    } else {
+      DCHECK(is_min_max_filter());
+      return min_max_filter_.always_true;
+    }
+  }
 
   /// Aggregates partitioned join filters and updates memory consumption.
   /// Disables filter if always_true filter is received or OOM is hit.
@@ -100,13 +112,14 @@ class Coordinator::FilterState {
   /// Number of remaining backends to hear from before filter is complete.
   int pending_count_;
 
-  /// BloomFilter aggregated from all source plan nodes, to be broadcast to all
+  /// Filters aggregated from all source plan nodes, to be broadcast to all
   /// destination plan fragment instances. Only set for partitioned joins (broadcast joins
   /// need no aggregation).
   /// In order to avoid memory spikes, an incoming filter is moved (vs. copied) to the
   /// output structure in the case of a broadcast join. Similarly, for partitioned joins,
   /// the filter is moved from the following member to the output structure.
   TBloomFilter bloom_filter_;
+  TMinMaxFilter min_max_filter_;
 
   /// Time at which first local filter arrived.
   int64_t first_arrival_time_;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/coordinator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc
index 772da33..d0e7b90 100644
--- a/be/src/runtime/coordinator.cc
+++ b/be/src/runtime/coordinator.cc
@@ -43,6 +43,7 @@
 #include "util/hdfs-bulk-ops.h"
 #include "util/hdfs-util.h"
 #include "util/histogram-metric.h"
+#include "util/min-max-filter.h"
 #include "util/table-printer.h"
 
 #include "common/names.h"
@@ -321,7 +322,7 @@ void Coordinator::InitFilterRoutingTable() {
           f->src_fragment_instance_idxs()->insert(src_idxs.begin(), src_idxs.end());
 
         // target plan node of filter
-        } else if (plan_node.__isset.hdfs_scan_node) {
+        } else if (plan_node.__isset.hdfs_scan_node || plan_node.__isset.kudu_scan_node) {
           auto it = filter.planid_to_target_ndx.find(plan_node.node_id);
           DCHECK(it != filter.planid_to_target_ndx.end());
           const TRuntimeFilterTargetDesc& t_target = filter.targets[it->second];
@@ -1125,16 +1126,23 @@ void Coordinator::UpdateFilter(const TUpdateFilterParams& params) {
       target_fragment_idxs.insert(target.fragment_idx);
     }
 
-    // Assign outgoing bloom filter.
-    TBloomFilter& aggregated_filter = state->bloom_filter();
-    filter_mem_tracker_->Release(aggregated_filter.directory.size());
+    if (state->is_bloom_filter()) {
+      // Assign outgoing bloom filter.
+      TBloomFilter& aggregated_filter = state->bloom_filter();
+      filter_mem_tracker_->Release(aggregated_filter.directory.size());
+
+      // TODO: Track memory used by 'rpc_params'.
+      swap(rpc_params.bloom_filter, aggregated_filter);
+      DCHECK(rpc_params.bloom_filter.always_false || rpc_params.bloom_filter.always_true
+          || !rpc_params.bloom_filter.directory.empty());
+      DCHECK(aggregated_filter.directory.empty());
+      rpc_params.__isset.bloom_filter = true;
+    } else {
+      DCHECK(state->is_min_max_filter());
+      MinMaxFilter::Copy(state->min_max_filter(), &rpc_params.min_max_filter);
+      rpc_params.__isset.min_max_filter = true;
+    }
 
-    // TODO: Track memory used by 'rpc_params'.
-    swap(rpc_params.bloom_filter, aggregated_filter);
-    DCHECK(rpc_params.bloom_filter.always_false || rpc_params.bloom_filter.always_true ||
-        !rpc_params.bloom_filter.directory.empty());
-    DCHECK(aggregated_filter.directory.empty());
-    rpc_params.__isset.bloom_filter = true;
     // Filter is complete, and can be released.
     state->Disable(filter_mem_tracker_);
   }
@@ -1160,27 +1168,40 @@ void Coordinator::FilterState::ApplyUpdate(const TUpdateFilterParams& params,
   }
 
   --pending_count_;
-  if (params.bloom_filter.always_true) {
-    Disable(coord->filter_mem_tracker_);
-  } else if (bloom_filter_.always_false) {
-    int64_t heap_space = params.bloom_filter.directory.size();
-    if (!coord->filter_mem_tracker_->TryConsume(heap_space)) {
-      VLOG_QUERY << "Not enough memory to allocate filter: "
-                 << PrettyPrinter::Print(heap_space, TUnit::BYTES)
-                 << " (query: " << coord->query_id() << ")";
-      // Disable, as one missing update means a correct filter cannot be produced.
+  if (is_bloom_filter()) {
+    DCHECK(params.__isset.bloom_filter);
+    if (params.bloom_filter.always_true) {
       Disable(coord->filter_mem_tracker_);
+    } else if (bloom_filter_.always_false) {
+      int64_t heap_space = params.bloom_filter.directory.size();
+      if (!coord->filter_mem_tracker_->TryConsume(heap_space)) {
+        VLOG_QUERY << "Not enough memory to allocate filter: "
+                   << PrettyPrinter::Print(heap_space, TUnit::BYTES)
+                   << " (query: " << coord->query_id() << ")";
+        // Disable, as one missing update means a correct filter cannot be produced.
+        Disable(coord->filter_mem_tracker_);
+      } else {
+        // Workaround for the fact that parameters are const& for Thrift RPCs - yet
+        // we want to move the payload from the request rather than copy it and take
+        // double the memory cost. After this point, params.bloom_filter is an empty
+        // filter and should not be read.
+        TBloomFilter* non_const_filter = &const_cast<TBloomFilter&>(params.bloom_filter);
+        swap(bloom_filter_, *non_const_filter);
+        DCHECK_EQ(non_const_filter->directory.size(), 0);
+      }
     } else {
-      // Workaround for fact that parameters are const& for Thrift RPCs - yet we want to
-      // move the payload from the request rather than copy it and take double the memory
-      // cost. After this point, params.bloom_filter is an empty filter and should not be
-      // read.
-      TBloomFilter* non_const_filter = &const_cast<TBloomFilter&>(params.bloom_filter);
-      swap(bloom_filter_, *non_const_filter);
-      DCHECK_EQ(non_const_filter->directory.size(), 0);
+      BloomFilter::Or(params.bloom_filter, &bloom_filter_);
     }
   } else {
-    BloomFilter::Or(params.bloom_filter, &bloom_filter_);
+    DCHECK(is_min_max_filter());
+    DCHECK(params.__isset.min_max_filter);
+    if (params.min_max_filter.always_true) {
+      Disable(coord->filter_mem_tracker_);
+    } else if (min_max_filter_.always_false) {
+      MinMaxFilter::Copy(params.min_max_filter, &min_max_filter_);
+    } else {
+      MinMaxFilter::Or(params.min_max_filter, &min_max_filter_);
+    }
   }
 
   if (pending_count_ == 0 || disabled()) {
@@ -1189,11 +1210,17 @@ void Coordinator::FilterState::ApplyUpdate(const TUpdateFilterParams& params,
 }
 
 void Coordinator::FilterState::Disable(MemTracker* tracker) {
-  bloom_filter_.always_true = true;
-  bloom_filter_.always_false = false;
-  tracker->Release(bloom_filter_.directory.size());
-  bloom_filter_.directory.clear();
-  bloom_filter_.directory.shrink_to_fit();
+  if (is_bloom_filter()) {
+    bloom_filter_.always_true = true;
+    bloom_filter_.always_false = false;
+    tracker->Release(bloom_filter_.directory.size());
+    bloom_filter_.directory.clear();
+    bloom_filter_.directory.shrink_to_fit();
+  } else {
+    DCHECK(is_min_max_filter());
+    min_max_filter_.always_true = true;
+    min_max_filter_.always_false = false;
+  }
 }
 
 const TUniqueId& Coordinator::query_id() const {
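
The min-max branch added to FilterState::ApplyUpdate() above mirrors the bloom branch: an always_true update disables the filter outright, the first real update replaces the initial always_false state (Copy), and every later update widens the aggregate (Or). A minimal sketch of that merge rule, using a hypothetical integer-only struct rather than Impala's MinMaxFilter class:

#include <algorithm>
#include <cstdint>

// Hypothetical stand-in for a min-max filter payload; the real MinMaxFilter
// covers all column types, not just int64_t.
struct IntMinMax {
  bool always_false = true;  // no values inserted yet
  bool always_true = false;  // filter is disabled and passes everything
  int64_t min = 0;
  int64_t max = 0;
};

// Corresponds to the Disable()/Copy()/Or() calls in ApplyUpdate().
void Merge(const IntMinMax& update, IntMinMax* agg) {
  if (update.always_false) return;                              // nothing to add
  if (update.always_true) { agg->always_true = true; return; }  // Disable()
  if (agg->always_false) { *agg = update; return; }             // MinMaxFilter::Copy()
  agg->min = std::min(agg->min, update.min);                    // MinMaxFilter::Or()
  agg->max = std::max(agg->max, update.max);
}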

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/fragment-instance-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/fragment-instance-state.cc b/be/src/runtime/fragment-instance-state.cc
index f957dd1..bf437ba 100644
--- a/be/src/runtime/fragment-instance-state.cc
+++ b/be/src/runtime/fragment-instance-state.cc
@@ -414,13 +414,12 @@ Status FragmentInstanceState::WaitForOpen() {
   return opened_promise_.Get();
 }
 
-void FragmentInstanceState::PublishFilter(
-    int32_t filter_id, const TBloomFilter& thrift_bloom_filter) {
+void FragmentInstanceState::PublishFilter(const TPublishFilterParams& params) {
   VLOG_FILE << "PublishFilter(): instance_id=" << PrintId(instance_id())
-            << " filter_id=" << filter_id;
+            << " filter_id=" << params.filter_id;
   // Wait until Prepare() is done, so we know that the filter bank is set up.
   if (!WaitForPrepare().ok()) return;
-  runtime_state_->filter_bank()->PublishGlobalFilter(filter_id, thrift_bloom_filter);
+  runtime_state_->filter_bank()->PublishGlobalFilter(params);
 }
 
 const TQueryCtx& FragmentInstanceState::query_ctx() const {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/fragment-instance-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/fragment-instance-state.h b/be/src/runtime/fragment-instance-state.h
index f540b56..4e832f6 100644
--- a/be/src/runtime/fragment-instance-state.h
+++ b/be/src/runtime/fragment-instance-state.h
@@ -99,7 +99,7 @@ class FragmentInstanceState {
   Status WaitForOpen();
 
   /// Publishes filter with ID 'filter_id' to this fragment instance's filter bank.
-  void PublishFilter(int32_t filter_id, const TBloomFilter& thrift_bloom_filter);
+  void PublishFilter(const TPublishFilterParams& params);
 
   /// Returns fragment instance's sink if this is the root fragment instance. Valid after
   /// the Prepare phase. May be nullptr.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/query-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-state.cc b/be/src/runtime/query-state.cc
index 6796c82..6bc2591 100644
--- a/be/src/runtime/query-state.cc
+++ b/be/src/runtime/query-state.cc
@@ -397,12 +397,11 @@ void QueryState::Cancel() {
   for (auto entry: fis_map_) entry.second->Cancel();
 }
 
-void QueryState::PublishFilter(int32_t filter_id, int fragment_idx,
-    const TBloomFilter& thrift_bloom_filter) {
+void QueryState::PublishFilter(const TPublishFilterParams& params) {
   if (!instances_prepared_promise_.Get().ok()) return;
-  DCHECK_EQ(fragment_map_.count(fragment_idx), 1);
-  for (FragmentInstanceState* fis: fragment_map_[fragment_idx]) {
-    fis->PublishFilter(filter_id, thrift_bloom_filter);
+  DCHECK_EQ(fragment_map_.count(params.dst_fragment_idx), 1);
+  for (FragmentInstanceState* fis : fragment_map_[params.dst_fragment_idx]) {
+    fis->PublishFilter(params);
   }
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/query-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-state.h b/be/src/runtime/query-state.h
index 82f2c52..f7b83a7 100644
--- a/be/src/runtime/query-state.h
+++ b/be/src/runtime/query-state.h
@@ -142,8 +142,7 @@ class QueryState {
   FragmentInstanceState* GetFInstanceState(const TUniqueId& instance_id);
 
   /// Blocks until all fragment instances have finished their Prepare phase.
-  void PublishFilter(int32_t filter_id, int fragment_idx,
-      const TBloomFilter& thrift_bloom_filter);
+  void PublishFilter(const TPublishFilterParams& params);
 
   /// Cancels all actively executing fragment instances. Blocks until all fragment
   /// instances have finished their Prepare phase. Idempotent.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/runtime-filter-bank.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter-bank.cc b/be/src/runtime/runtime-filter-bank.cc
index 2ae65c8..178aef1 100644
--- a/be/src/runtime/runtime-filter-bank.cc
+++ b/be/src/runtime/runtime-filter-bank.cc
@@ -27,6 +27,7 @@
 #include "service/impala-server.h"
 #include "util/bit-util.h"
 #include "util/bloom-filter.h"
+#include "util/min-max-filter.h"
 
 #include "common/names.h"
 
@@ -41,8 +42,12 @@ const int64_t RuntimeFilterBank::MIN_BLOOM_FILTER_SIZE;
 const int64_t RuntimeFilterBank::MAX_BLOOM_FILTER_SIZE;
 
 RuntimeFilterBank::RuntimeFilterBank(const TQueryCtx& query_ctx, RuntimeState* state)
-    : state_(state), closed_(false) {
-  memory_allocated_ =
+  : state_(state),
+    filter_mem_tracker_(
+        new MemTracker(-1, "Runtime Filter Bank", state->instance_mem_tracker(), false)),
+    mem_pool_(filter_mem_tracker_.get()),
+    closed_(false) {
+  bloom_memory_allocated_ =
       state->runtime_profile()->AddCounter("BloomFilterBytes", TUnit::BYTES);
 
   // Clamp bloom filter size down to the limits {MIN,MAX}_BLOOM_FILTER_SIZE
@@ -66,9 +71,6 @@ RuntimeFilterBank::RuntimeFilterBank(const TQueryCtx& query_ctx, RuntimeState* s
   default_filter_size_ = max<int64_t>(default_filter_size_, min_filter_size_);
   default_filter_size_ =
       BitUtil::RoundUpToPowerOfTwo(min<int64_t>(default_filter_size_, max_filter_size_));
-
-  filter_mem_tracker_.reset(
-      new MemTracker(-1, "Runtime Filter Bank", state->instance_mem_tracker(), false));
 }
 
 RuntimeFilter* RuntimeFilterBank::RegisterFilter(const TRuntimeFilterDesc& filter_desc,
@@ -115,27 +117,28 @@ void SendFilterToCoordinator(TNetworkAddress address, TUpdateFilterParams params
 
 }
 
-void RuntimeFilterBank::UpdateFilterFromLocal(int32_t filter_id,
-    BloomFilter* bloom_filter) {
+void RuntimeFilterBank::UpdateFilterFromLocal(
+    int32_t filter_id, BloomFilter* bloom_filter, MinMaxFilter* min_max_filter) {
   DCHECK_NE(state_->query_options().runtime_filter_mode, TRuntimeFilterMode::OFF)
       << "Should not be calling UpdateFilterFromLocal() if filtering is disabled";
   TUpdateFilterParams params;
   // A runtime filter may have both local and remote targets.
   bool has_local_target = false;
   bool has_remote_target = false;
+  TRuntimeFilterType::type type;
   {
     lock_guard<mutex> l(runtime_filter_lock_);
     RuntimeFilterMap::iterator it = produced_filters_.find(filter_id);
     DCHECK(it != produced_filters_.end()) << "Tried to update unregistered filter: "
                                           << filter_id;
-    it->second->SetBloomFilter(bloom_filter);
+    it->second->SetFilter(bloom_filter, min_max_filter);
     has_local_target = it->second->filter_desc().has_local_targets;
     has_remote_target = it->second->filter_desc().has_remote_targets;
+    type = it->second->filter_desc().type;
   }
 
   if (has_local_target) {
-    // Do a short circuit publication by pushing the same BloomFilter to the consumer
-    // side.
+    // Do a short circuit publication by pushing the same filter to the consumer side.
     RuntimeFilter* filter;
     {
       lock_guard<mutex> l(runtime_filter_lock_);
@@ -143,7 +146,7 @@ void RuntimeFilterBank::UpdateFilterFromLocal(int32_t filter_id,
       if (it == consumed_filters_.end()) return;
       filter = it->second;
     }
-    filter->SetBloomFilter(bloom_filter);
+    filter->SetFilter(bloom_filter, min_max_filter);
     state_->runtime_profile()->AddInfoString(
         Substitute("Filter $0 arrival", filter_id),
         PrettyPrinter::Print(filter->arrival_delay(), TUnit::TIME_MS));
@@ -153,8 +156,14 @@ void RuntimeFilterBank::UpdateFilterFromLocal(int32_t filter_id,
       && state_->query_options().runtime_filter_mode == TRuntimeFilterMode::GLOBAL) {
     params.__set_filter_id(filter_id);
     params.__set_query_id(state_->query_id());
-    BloomFilter::ToThrift(bloom_filter, &params.bloom_filter);
-    params.__isset.bloom_filter = true;
+    if (type == TRuntimeFilterType::BLOOM) {
+      BloomFilter::ToThrift(bloom_filter, &params.bloom_filter);
+      params.__isset.bloom_filter = true;
+    } else {
+      DCHECK(type == TRuntimeFilterType::MIN_MAX);
+      min_max_filter->ToThrift(&params.min_max_filter);
+      params.__isset.min_max_filter = true;
+    }
 
     ExecEnv::GetInstance()->rpc_pool()->Offer(bind<void>(
         SendFilterToCoordinator, state_->query_ctx().coord_address, params,
@@ -162,32 +171,43 @@ void RuntimeFilterBank::UpdateFilterFromLocal(int32_t filter_id,
   }
 }
 
-void RuntimeFilterBank::PublishGlobalFilter(int32_t filter_id,
-    const TBloomFilter& thrift_filter) {
+void RuntimeFilterBank::PublishGlobalFilter(const TPublishFilterParams& params) {
   lock_guard<mutex> l(runtime_filter_lock_);
   if (closed_) return;
-  RuntimeFilterMap::iterator it = consumed_filters_.find(filter_id);
+  RuntimeFilterMap::iterator it = consumed_filters_.find(params.filter_id);
   DCHECK(it != consumed_filters_.end()) << "Tried to publish unregistered filter: "
-                                        << filter_id;
-  if (thrift_filter.always_true) {
-    it->second->SetBloomFilter(BloomFilter::ALWAYS_TRUE_FILTER);
-  } else {
-    int64_t required_space =
-        BloomFilter::GetExpectedHeapSpaceUsed(thrift_filter.log_heap_space);
-    // Silently fail to publish the filter (replacing it with a 0-byte complete one) if
-    // there's not enough memory for it.
-    if (!filter_mem_tracker_->TryConsume(required_space)) {
-      VLOG_QUERY << "No memory for global filter: " << filter_id
-                 << " (fragment instance: " << state_->fragment_instance_id() << ")";
-      it->second->SetBloomFilter(BloomFilter::ALWAYS_TRUE_FILTER);
+                                        << params.filter_id;
+
+  BloomFilter* bloom_filter = nullptr;
+  MinMaxFilter* min_max_filter = nullptr;
+  if (it->second->is_bloom_filter()) {
+    DCHECK(params.__isset.bloom_filter);
+    if (params.bloom_filter.always_true) {
+      bloom_filter = BloomFilter::ALWAYS_TRUE_FILTER;
     } else {
-      BloomFilter* bloom_filter = obj_pool_.Add(new BloomFilter(thrift_filter));
-      DCHECK_EQ(required_space, bloom_filter->GetHeapSpaceUsed());
-      memory_allocated_->Add(bloom_filter->GetHeapSpaceUsed());
-      it->second->SetBloomFilter(bloom_filter);
+      int64_t required_space =
+          BloomFilter::GetExpectedHeapSpaceUsed(params.bloom_filter.log_heap_space);
+      // Silently fail to publish the filter (replacing it with a 0-byte complete one) if
+      // there's not enough memory for it.
+      if (!filter_mem_tracker_->TryConsume(required_space)) {
+        VLOG_QUERY << "No memory for global filter: " << params.filter_id
+                   << " (fragment instance: " << state_->fragment_instance_id() << ")";
+        bloom_filter = BloomFilter::ALWAYS_TRUE_FILTER;
+      } else {
+        bloom_filter = obj_pool_.Add(new BloomFilter(params.bloom_filter));
+        DCHECK_EQ(required_space, bloom_filter->GetHeapSpaceUsed());
+        bloom_memory_allocated_->Add(bloom_filter->GetHeapSpaceUsed());
+      }
     }
+  } else {
+    DCHECK(it->second->is_min_max_filter());
+    DCHECK(params.__isset.min_max_filter);
+    min_max_filter = MinMaxFilter::Create(
+        params.min_max_filter, it->second->type(), &obj_pool_, &mem_pool_);
   }
-  state_->runtime_profile()->AddInfoString(Substitute("Filter $0 arrival", filter_id),
+  it->second->SetFilter(bloom_filter, min_max_filter);
+  state_->runtime_profile()->AddInfoString(
+      Substitute("Filter $0 arrival", params.filter_id),
       PrettyPrinter::Print(it->second->arrival_delay(), TUnit::TIME_MS));
 }
 
@@ -204,10 +224,21 @@ BloomFilter* RuntimeFilterBank::AllocateScratchBloomFilter(int32_t filter_id) {
   if (!filter_mem_tracker_->TryConsume(required_space)) return NULL;
   BloomFilter* bloom_filter = obj_pool_.Add(new BloomFilter(log_filter_size));
   DCHECK_EQ(required_space, bloom_filter->GetHeapSpaceUsed());
-  memory_allocated_->Add(bloom_filter->GetHeapSpaceUsed());
+  bloom_memory_allocated_->Add(bloom_filter->GetHeapSpaceUsed());
   return bloom_filter;
 }
 
+MinMaxFilter* RuntimeFilterBank::AllocateScratchMinMaxFilter(
+    int32_t filter_id, ColumnType type) {
+  lock_guard<mutex> l(runtime_filter_lock_);
+  if (closed_) return nullptr;
+
+  RuntimeFilterMap::iterator it = produced_filters_.find(filter_id);
+  DCHECK(it != produced_filters_.end()) << "Filter ID " << filter_id << " not registered";
+
+  return MinMaxFilter::Create(type, &obj_pool_, &mem_pool_);
+}
+
 int64_t RuntimeFilterBank::GetFilterSizeForNdv(int64_t ndv) {
   if (ndv == -1) return default_filter_size_;
   int64_t required_space =
@@ -227,6 +258,7 @@ void RuntimeFilterBank::Close() {
   lock_guard<mutex> l(runtime_filter_lock_);
   closed_ = true;
   obj_pool_.Clear();
-  filter_mem_tracker_->Release(memory_allocated_->value());
+  mem_pool_.FreeAll();
+  filter_mem_tracker_->Release(bloom_memory_allocated_->value());
   filter_mem_tracker_->Close();
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/runtime-filter-bank.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter-bank.h b/be/src/runtime/runtime-filter-bank.h
index d8be8ab..8f6bb42 100644
--- a/be/src/runtime/runtime-filter-bank.h
+++ b/be/src/runtime/runtime-filter-bank.h
@@ -20,6 +20,7 @@
 
 #include "codegen/impala-ir.h"
 #include "common/object-pool.h"
+#include "runtime/mem-pool.h"
 #include "runtime/types.h"
 #include "util/runtime-profile.h"
 
@@ -31,6 +32,7 @@ namespace impala {
 
 class BloomFilter;
 class MemTracker;
+class MinMaxFilter;
 class RuntimeFilter;
 class RuntimeState;
 class TBloomFilter;
@@ -47,9 +49,10 @@ class TQueryCtx;
 /// RuntimeFilterBank treats each filter independently.
 ///
 /// All filters must be registered with the filter bank via RegisterFilter(). Local plan
-/// fragments update the bloom filters by calling UpdateFilterFromLocal()
-/// (UpdateFilterFromLocal() may only be called once per filter ID per filter bank). The
-/// bloom_filter that is passed into UpdateFilterFromLocal() must have been allocated by
+/// fragments update the filters by calling UpdateFilterFromLocal() (which may only be
+/// called once per filter ID per filter bank), with either a bloom filter or a min-max
+/// filter, depending on the filter's type. The 'bloom_filter' or 'min_max_filter' that is
+/// passed into UpdateFilterFromLocal() must have been allocated by
 /// AllocateScratchBloomFilter(); this allows RuntimeFilterBank to manage all memory
 /// associated with filters.
 ///
@@ -58,9 +61,10 @@ class TQueryCtx;
 ///
 /// After PublishGlobalFilter() has been called (and again, it may only be called once per
 /// filter_id), the RuntimeFilter object associated with filter_id will have a valid
-/// bloom_filter, and may be used for filter evaluation. This operation occurs without
-/// synchronisation, and neither the thread that calls PublishGlobalFilter() nor the
-/// thread that may call RuntimeFilter::Eval() need to coordinate in any way.
+/// bloom_filter or min_max_filter, and may be used for filter evaluation. This
+/// operation occurs without synchronisation, and neither the thread that calls
+/// PublishGlobalFilter() nor the thread that may call RuntimeFilter::Eval() need to
+/// coordinate in any way.
 class RuntimeFilterBank {
  public:
   RuntimeFilterBank(const TQueryCtx& query_ctx, RuntimeState* state);
@@ -70,14 +74,16 @@ class RuntimeFilterBank {
   /// bloom_filter itself is unallocated until the first call to PublishGlobalFilter().
   RuntimeFilter* RegisterFilter(const TRuntimeFilterDesc& filter_desc, bool is_producer);
 
-  /// Updates a filter's bloom_filter with 'bloom_filter' which has been produced by some
-  /// operator in the local fragment instance. 'bloom_filter' may be NULL, representing a
-  /// full filter that contains all elements.
-  void UpdateFilterFromLocal(int32_t filter_id, BloomFilter* bloom_filter);
+  /// Updates a filter's 'bloom_filter' or 'min_max_filter' which has been produced by
+  /// some operator in the local fragment instance. At most one of 'bloom_filter' and
+  /// 'min_max_filter' may be non-NULL, depending on the filter's type. They may both be
+  /// NULL, representing a filter that allows all rows to pass.
+  void UpdateFilterFromLocal(
+      int32_t filter_id, BloomFilter* bloom_filter, MinMaxFilter* min_max_filter);
 
   /// Makes a bloom_filter (aggregated globally from all producer fragments) available for
   /// consumption by operators that wish to use it for filtering.
-  void PublishGlobalFilter(int32_t filter_id, const TBloomFilter& thrift_filter);
+  void PublishGlobalFilter(const TPublishFilterParams& params);
 
   /// Returns true if, according to the observed NDV in 'observed_ndv', a filter of size
   /// 'filter_size' would have an expected false-positive rate which would exceed
@@ -100,6 +106,9 @@ class RuntimeFilterBank {
   /// If there is not enough memory, or if Close() has been called first, returns NULL.
   BloomFilter* AllocateScratchBloomFilter(int32_t filter_id);
 
+  /// Returns a new MinMaxFilter. Handles memory the same as AllocateScratchBloomFilter().
+  MinMaxFilter* AllocateScratchMinMaxFilter(int32_t filter_id, ColumnType type);
+
   /// Default hash seed to use when computing hashed values to insert into filters.
   static int32_t IR_ALWAYS_INLINE DefaultHashSeed() { return 1234; }
 
@@ -136,12 +145,15 @@ class RuntimeFilterBank {
   /// MemTracker to track Bloom filter memory.
   boost::scoped_ptr<MemTracker> filter_mem_tracker_;
 
+  /// Mem pool for allocations made by filters; counted against 'filter_mem_tracker_'.
+  MemPool mem_pool_;
+
   /// True iff Close() has been called. Used to prevent races between
   /// AllocateScratchBloomFilter() and Close().
   bool closed_;
 
   /// Total amount of memory allocated to Bloom Filters
-  RuntimeProfile::Counter* memory_allocated_;
+  RuntimeProfile::Counter* bloom_memory_allocated_;
 
   /// Precomputed default BloomFilter size.
   int64_t default_filter_size_;
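
On the consumer side, the contract in the class comment above reduces to: look the filter up, optionally wait for it to arrive, then call Eval() with no further synchronisation. A hedged sketch (the helper and its 1s timeout are illustrative, not code from this patch):

#include "runtime/runtime-filter-bank.h"
#include "runtime/runtime-filter.inline.h"

using namespace impala;

// Sketch: evaluate one value against a runtime filter. A filter that never
// arrives must not drop rows, so the fallback is to let the row pass.
bool RowPassesFilter(RuntimeFilterBank* bank, int32_t filter_id,
    void* slot_value, const ColumnType& col_type) {
  const RuntimeFilter* filter = bank->GetRuntimeFilter(filter_id);
  if (filter == nullptr) return true;              // not registered on this instance
  if (!filter->WaitForArrival(1000)) return true;  // timeout value is an assumption
  return filter->Eval(slot_value, col_type);
}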

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/runtime-filter-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter-ir.cc b/be/src/runtime/runtime-filter-ir.cc
index 4e386cb..6436213 100644
--- a/be/src/runtime/runtime-filter-ir.cc
+++ b/be/src/runtime/runtime-filter-ir.cc
@@ -21,10 +21,9 @@ using namespace impala;
 
 bool IR_ALWAYS_INLINE RuntimeFilter::Eval(
     void* val, const ColumnType& col_type) const noexcept {
-  // Safe to read bloom_filter_ concurrently with any ongoing SetBloomFilter() thanks
-  // to a) the atomicity of / pointer assignments and b) the x86 TSO memory model.
-  if (bloom_filter_ == BloomFilter::ALWAYS_TRUE_FILTER) return true;
+  DCHECK(is_bloom_filter());
+  if (bloom_filter_.Load() == BloomFilter::ALWAYS_TRUE_FILTER) return true;
   uint32_t h = RawValue::GetHashValue(val, col_type,
       RuntimeFilterBank::DefaultHashSeed());
-  return bloom_filter_->Find(h);
+  return bloom_filter_.Load()->Find(h);
 }
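
Replacing the raw bloom_filter_ pointer with bloom_filter_.Load() drops the reliance on "the x86 TSO memory model" called out in the deleted comment. Assuming Impala's AtomicPtr behaves like std::atomic<T*> (the memory orders below are an assumption; a sequentially consistent implementation is also correct), the underlying pattern is a one-shot publication:

#include <atomic>

// Sketch of the publication pattern, independent of Impala's AtomicPtr.
template <typename T>
class OneShotPtr {
 public:
  // Called once by the publishing thread (cf. RuntimeFilter::SetFilter()).
  void Publish(T* ptr) { ptr_.store(ptr, std::memory_order_release); }
  // Called by any number of readers (cf. RuntimeFilter::Eval()).
  T* Read() const { return ptr_.load(std::memory_order_acquire); }

 private:
  std::atomic<T*> ptr_{nullptr};
};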

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/runtime-filter.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter.cc b/be/src/runtime/runtime-filter.cc
index 228094e..a2fd30e 100644
--- a/be/src/runtime/runtime-filter.cc
+++ b/be/src/runtime/runtime-filter.cc
@@ -29,9 +29,9 @@ const char* RuntimeFilter::LLVM_CLASS_NAME = "class.impala::RuntimeFilter";
 
 bool RuntimeFilter::WaitForArrival(int32_t timeout_ms) const {
   do {
-    if (HasBloomFilter()) return true;
+    if (HasFilter()) return true;
     SleepForMs(SLEEP_PERIOD_MS);
   } while ((MonotonicMillis() - registration_time_) < timeout_ms);
 
-  return HasBloomFilter();
+  return HasFilter();
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/runtime-filter.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter.h b/be/src/runtime/runtime-filter.h
index 5d9531b..40c5f23 100644
--- a/be/src/runtime/runtime-filter.h
+++ b/be/src/runtime/runtime-filter.h
@@ -29,33 +29,45 @@ namespace impala {
 
 class BloomFilter;
 
-/// RuntimeFilters represent set-membership predicates (implemented with bloom filters)
-/// that are computed during query execution (rather than during planning). They can then
-/// be sent to other operators to reduce their output. For example, a RuntimeFilter might
-/// compute a predicate corresponding to set membership, where the members of that set can
-/// only be computed at runtime (for example, the distinct values of the build side of a
-/// hash table). Other plan nodes can use that predicate by testing for membership of that
-/// set to filter rows early on in the plan tree (e.g. the scan that feeds the probe side
-/// of that join node could eliminate rows from consideration for join matching).
+/// RuntimeFilters represent set-membership predicates that are computed during query
+/// execution (rather than during planning). They can then be sent to other operators to
+/// reduce their output. For example, a RuntimeFilter might compute a predicate
+/// corresponding to set membership, where the members of that set can only be computed at
+/// runtime (for example, the distinct values of the build side of a hash table). Other
+/// plan nodes can use that predicate by testing for membership of that set to filter rows
+/// early on in the plan tree (e.g. the scan that feeds the probe side of that join node
+/// could eliminate rows from consideration for join matching).
+///
+/// A RuntimeFilter may compute its set-membership predicate as a bloom filter or a
+/// min-max filter, depending on its filter description.
 class RuntimeFilter {
  public:
   RuntimeFilter(const TRuntimeFilterDesc& filter, int64_t filter_size)
-      : bloom_filter_(NULL), filter_desc_(filter), arrival_time_(0L),
+      : bloom_filter_(nullptr), min_max_filter_(nullptr), filter_desc_(filter),
+        registration_time_(MonotonicMillis()), arrival_time_(0L),
         filter_size_(filter_size) {
     DCHECK_GT(filter_size_, 0);
-    registration_time_ = MonotonicMillis();
   }
 
-  /// Returns true if SetBloomFilter() has been called.
-  bool HasBloomFilter() const { return arrival_time_ != 0; }
+  /// Returns true if SetFilter() has been called.
+  bool HasFilter() const { return arrival_time_.Load() != 0; }
 
   const TRuntimeFilterDesc& filter_desc() const { return filter_desc_; }
   int32_t id() const { return filter_desc().filter_id; }
   int64_t filter_size() const { return filter_size_; }
+  ColumnType type() const {
+    return ColumnType::FromThrift(filter_desc().src_expr.nodes[0].type);
+  }
+  bool is_bloom_filter() const { return filter_desc().type == TRuntimeFilterType::BLOOM; }
+  bool is_min_max_filter() const {
+    return filter_desc().type == TRuntimeFilterType::MIN_MAX;
+  }
+
+  MinMaxFilter* get_min_max() const { return min_max_filter_.Load(); }
 
   /// Sets the internal filter bloom_filter to 'bloom_filter'. Can only legally be called
   /// once per filter. Does not acquire the memory associated with 'bloom_filter'.
-  inline void SetBloomFilter(BloomFilter* bloom_filter);
+  inline void SetFilter(BloomFilter* bloom_filter, MinMaxFilter* min_max_filter);
 
   /// Returns false iff 'bloom_filter_' has been set via SetBloomFilter() and hash[val] is
   /// not in that 'bloom_filter_'. Otherwise returns true. Is safe to call concurrently
@@ -67,8 +79,8 @@ class RuntimeFilter {
   /// Returns the amount of time waited since registration for the filter to
   /// arrive. Returns 0 if filter has not yet arrived.
   int32_t arrival_delay() const {
-    if (arrival_time_ == 0L) return 0L;
-    return arrival_time_ - registration_time_;
+    if (arrival_time_.Load() == 0L) return 0L;
+    return arrival_time_.Load() - registration_time_;
   }
 
   /// Periodically (every 20ms) checks to see if the global filter has arrived. Waits for
@@ -88,21 +100,26 @@ class RuntimeFilter {
   static const char* LLVM_CLASS_NAME;
 
  private:
-  /// Membership bloom_filter. May be NULL even after arrival_time_ is set. This is a
-  /// compact way of representing a full Bloom filter that contains every element.
-  BloomFilter* bloom_filter_;
+  /// Membership bloom_filter. May be NULL even after arrival_time_ is set, meaning that
+  /// it does not filter any rows, either because it was not created
+  /// (filter_desc_.bloom_filter is false), there was not enough memory, or the false
+  /// positive rate was determined to be too high.
+  AtomicPtr<BloomFilter> bloom_filter_;
+
+  /// May be NULL even after arrival_time_ is set if filter_desc_.min_max_filter is false.
+  AtomicPtr<MinMaxFilter> min_max_filter_;
 
   /// Reference to the filter's thrift descriptor in the thrift Plan tree.
   const TRuntimeFilterDesc& filter_desc_;
 
   /// Time, in ms, that the filter was registered.
-  int64_t registration_time_;
+  const int64_t registration_time_;
 
-  /// Time, in ms, that the global fiter arrived. Set in SetBloomFilter().
-  int64_t arrival_time_;
+  /// Time, in ms, that the global filter arrived. Set in SetFilter().
+  AtomicInt64 arrival_time_;
 
   /// The size of the Bloom filter, in bytes.
-  int64_t filter_size_;
+  const int64_t filter_size_;
 };
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/runtime-filter.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter.inline.h b/be/src/runtime/runtime-filter.inline.h
index 128cafd..b2de81d 100644
--- a/be/src/runtime/runtime-filter.inline.h
+++ b/be/src/runtime/runtime-filter.inline.h
@@ -25,6 +25,7 @@
 
 #include "runtime/raw-value.inline.h"
 #include "util/bloom-filter.h"
+#include "util/min-max-filter.h"
 #include "util/time.h"
 
 namespace impala {
@@ -36,21 +37,35 @@ inline const RuntimeFilter* RuntimeFilterBank::GetRuntimeFilter(int32_t filter_i
   return it->second;
 }
 
-inline void RuntimeFilter::SetBloomFilter(BloomFilter* bloom_filter) {
-  DCHECK(bloom_filter_ == NULL);
-  // TODO: Barrier required here to ensure compiler does not both inline and re-order
-  // this assignment. Not an issue for correctness (as assignment is atomic), but
-  // potentially confusing.
-  bloom_filter_ = bloom_filter;
-  arrival_time_ = MonotonicMillis();
+inline void RuntimeFilter::SetFilter(
+    BloomFilter* bloom_filter, MinMaxFilter* min_max_filter) {
+  DCHECK(bloom_filter_.Load() == nullptr && min_max_filter_.Load() == nullptr);
+  if (is_bloom_filter()) {
+    bloom_filter_.Store(bloom_filter);
+  } else {
+    DCHECK(is_min_max_filter());
+    min_max_filter_.Store(min_max_filter);
+  }
+  arrival_time_.Store(MonotonicMillis());
 }
 
-inline bool RuntimeFilter::AlwaysTrue() const  {
-  return HasBloomFilter() && bloom_filter_ == BloomFilter::ALWAYS_TRUE_FILTER;
+inline bool RuntimeFilter::AlwaysTrue() const {
+  if (is_bloom_filter()) {
+    return HasFilter() && bloom_filter_.Load() == BloomFilter::ALWAYS_TRUE_FILTER;
+  } else {
+    DCHECK(is_min_max_filter());
+    return HasFilter() && min_max_filter_.Load()->AlwaysTrue();
+  }
 }
 
 inline bool RuntimeFilter::AlwaysFalse() const {
-  return bloom_filter_ != BloomFilter::ALWAYS_TRUE_FILTER && bloom_filter_->AlwaysFalse();
+  if (is_bloom_filter()) {
+    return bloom_filter_.Load() != BloomFilter::ALWAYS_TRUE_FILTER
+        && bloom_filter_.Load()->AlwaysFalse();
+  } else {
+    DCHECK(is_min_max_filter());
+    return min_max_filter_.Load() != nullptr && min_max_filter_.Load()->AlwaysFalse();
+  }
 }
 
 }
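
SetFilter() above stores the filter pointer before arrival_time_, and HasFilter() keys off arrival_time_ alone. With release/acquire semantics (an assumption; Impala's Atomic* types may be stronger), a reader that observes a non-zero arrival time is guaranteed to also observe the stored pointer:

#include <atomic>
#include <cstdint>

// Sketch of the ordering argument behind SetFilter()/HasFilter().
struct PublishedFilter {
  std::atomic<const void*> payload{nullptr};
  std::atomic<int64_t> arrival_ms{0};

  void Set(const void* p, int64_t now_ms) {
    payload.store(p, std::memory_order_release);
    // Stored last: the release store makes the payload visible to any reader
    // whose acquire load sees arrival_ms != 0.
    arrival_ms.store(now_ms, std::memory_order_release);
  }
  bool HasFilter() const {
    return arrival_ms.load(std::memory_order_acquire) != 0;
  }
};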

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/runtime/timestamp-value.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/timestamp-value.h b/be/src/runtime/timestamp-value.h
index 556225b..445189a 100644
--- a/be/src/runtime/timestamp-value.h
+++ b/be/src/runtime/timestamp-value.h
@@ -26,6 +26,7 @@
 #include <gflags/gflags.h>
 #include <string>
 
+#include "gen-cpp/Data_types.h"
 #include "udf/udf.h"
 #include "util/hash-util.h"
 
@@ -150,6 +151,20 @@ class TimestampValue {
     *ptp = boost::posix_time::ptime(date_, time_);
   }
 
+  // Store the binary representation of this TimestampValue in 'tvalue'.
+  void ToTColumnValue(TColumnValue* tvalue) const {
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(this);
+    tvalue->timestamp_val.assign(data, data + Size());
+    tvalue->__isset.timestamp_val = true;
+  }
+
+  // Returns a new TimestampValue created from the value in 'tvalue'.
+  static TimestampValue FromTColumnValue(const TColumnValue& tvalue) {
+    TimestampValue value;
+    memcpy(&value, tvalue.timestamp_val.c_str(), Size());
+    return value;
+  }
+
   bool HasDate() const { return !date_.is_special(); }
   bool HasTime() const { return !time_.is_special(); }
   bool HasDateOrTime() const { return HasDate() || HasTime(); }
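
The two helpers above give TimestampValue a binary round-trip through TColumnValue's timestamp_val field; they assume the in-memory layout is identical on both ends, which holds between Impala daemons of the same build. A small usage sketch (the function itself is illustrative):

#include "common/logging.h"
#include "gen-cpp/Data_types.h"
#include "runtime/timestamp-value.h"

using namespace impala;

// Sketch: serialize a timestamp into the thrift struct and read it back,
// as the min-max filter transport does for TIMESTAMP columns.
void RoundTrip(const TimestampValue& ts) {
  TColumnValue tvalue;
  ts.ToTColumnValue(&tvalue);
  TimestampValue copy = TimestampValue::FromTColumnValue(tvalue);
  DCHECK(copy == ts);
}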

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/service/impala-internal-service.cc
----------------------------------------------------------------------
diff --git a/be/src/service/impala-internal-service.cc b/be/src/service/impala-internal-service.cc
index d8a2a4a..5be8765 100644
--- a/be/src/service/impala-internal-service.cc
+++ b/be/src/service/impala-internal-service.cc
@@ -93,7 +93,7 @@ void ImpalaInternalService::UpdateFilter(TUpdateFilterResult& return_val,
   FAULT_INJECTION_RPC_DELAY(RPC_UPDATEFILTER);
   DCHECK(params.__isset.filter_id);
   DCHECK(params.__isset.query_id);
-  DCHECK(params.__isset.bloom_filter);
+  DCHECK(params.__isset.bloom_filter || params.__isset.min_max_filter);
   impala_server_->UpdateFilter(return_val, params);
 }
 
@@ -103,8 +103,8 @@ void ImpalaInternalService::PublishFilter(TPublishFilterResult& return_val,
   DCHECK(params.__isset.filter_id);
   DCHECK(params.__isset.dst_query_id);
   DCHECK(params.__isset.dst_fragment_idx);
-  DCHECK(params.__isset.bloom_filter);
+  DCHECK(params.__isset.bloom_filter || params.__isset.min_max_filter);
   QueryState::ScopedRef qs(params.dst_query_id);
   if (qs.get() == nullptr) return;
-  qs->PublishFilter(params.filter_id, params.dst_fragment_idx, params.bloom_filter);
+  qs->PublishFilter(params);
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2510fe0a/be/src/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt
index 4ff03d6..08002ed 100644
--- a/be/src/util/CMakeLists.txt
+++ b/be/src/util/CMakeLists.txt
@@ -55,6 +55,8 @@ add_library(Util
   mem-info.cc
   memory-metrics.cc
   metrics.cc
+  min-max-filter.cc
+  min-max-filter-ir.cc
   minidump.cc
   network-util.cc
   openssl-util.cc
@@ -120,6 +122,7 @@ ADD_BE_TEST(internal-queue-test)
 ADD_BE_TEST(logging-support-test)
 ADD_BE_TEST(lru-cache-test)
 ADD_BE_TEST(metrics-test)
+ADD_BE_TEST(min-max-filter-test)
 ADD_BE_TEST(openssl-util-test)
 ADD_BE_TEST(parse-util-test)
 #ADD_BE_TEST(perf-counters-test)


[15/16] incubator-impala git commit: IMPALA-4835 (prep only): create io subfolder and namespace

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-reader-context.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-reader-context.h b/be/src/runtime/disk-io-mgr-reader-context.h
deleted file mode 100644
index 90426d9..0000000
--- a/be/src/runtime/disk-io-mgr-reader-context.h
+++ /dev/null
@@ -1,406 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_DISK_IO_MGR_READER_CONTEXT_H
-#define IMPALA_RUNTIME_DISK_IO_MGR_READER_CONTEXT_H
-
-#include "runtime/disk-io-mgr.h"
-#include "util/condition-variable.h"
-
-namespace impala {
-
-/// A request context is used to group together I/O requests belonging to a client of the
-/// I/O manager for management and scheduling. For most I/O manager clients it is an
-/// opaque pointer, but some clients may need to include this header, e.g. to make the
-/// unique_ptr<DiskIoRequestContext> destructor work correctly.
-///
-/// Implementation Details
-/// ======================
-/// This object maintains a lot of state that is carefully synchronized. The context
-/// maintains state across all disks as well as per disk state.
-/// The unit for an IO request is a RequestRange, which may be a ScanRange or a
-/// WriteRange.
-/// A scan range for the reader is in one of five states:
-/// 1) PerDiskState's unstarted_ranges: This range has only been queued
-///    and nothing has been read from it.
-/// 2) DiskIoRequestContext's ready_to_start_ranges_: This range is about to be started.
-///    As soon as the reader picks it up, it will move to the in_flight_ranges
-///    queue.
-/// 3) PerDiskState's in_flight_ranges: This range is being processed and will
-///    be read from the next time a disk thread picks it up in GetNextRequestRange()
-/// 4) ScanRange's outgoing ready buffer queue is full. We can't read for this range
-///    anymore. We need the caller to pull a buffer off, which will put this range in
-///    the in_flight_ranges queue. These ranges are in the DiskIoRequestContext's
-///    blocked_ranges_ queue.
-/// 5) ScanRange is cached and in the cached_ranges_ queue.
-//
-/// If the scan range is read and does not get blocked on the outgoing queue, the
-/// transitions are: 1 -> 2 -> 3.
-/// If the scan range does get blocked, the transitions are
-/// 1 -> 2 -> 3 -> (4 -> 3)*
-//
-/// In the case of a cached scan range, the range is immediately put in cached_ranges_.
-/// When the caller asks for the next range to process, we first pull ranges from
-/// the cached_ranges_ queue. If the range was cached, the range is removed and
-/// done (ranges are either entirely cached or not at all). If the cached read attempt
-/// fails, we put the range in state 1.
-//
-/// A write range for a context may be in one of two lists:
-/// 1) unstarted_write_ranges_ : Ranges that have been queued but not processed.
-/// 2) in_flight_ranges_: The write range is ready to be processed by the next disk thread
-///    that picks it up in GetNextRequestRange().
-//
-/// AddWriteRange() adds WriteRanges for a disk.
-/// It is the responsibility of the client to pin the data to be written via a WriteRange
-/// in memory. After a WriteRange has been written, a callback is invoked to inform the
-/// client that the write has completed.
-//
-/// An important assumption is that a write does not exceed the maximum read size and that
-/// the entire range is written when the write request is handled. (In other words, writes
-/// are not broken up.)
-//
-/// When a DiskIoRequestContext is processed by a disk thread in GetNextRequestRange(),
-/// a write range is always removed from the list of unstarted write ranges and appended
-/// to the in_flight_ranges_ queue. This is done to alternate reads and writes - a read
-/// that is scheduled (by calling GetNextRange()) is always followed by a write (if one
-/// exists).  And since at most one WriteRange can be present in in_flight_ranges_ at any
-/// time (once a write range is returned from GetNextRequestRange() it is completed and
-/// not re-enqueued), a scan range scheduled via a call to GetNextRange() can be queued up
-/// behind at most one write range.
-class DiskIoRequestContext {
-  using RequestRange = DiskIoMgr::RequestRange;
-  using ScanRange = DiskIoMgr::ScanRange;
-  using WriteRange = DiskIoMgr::WriteRange;
-  using RequestType = DiskIoMgr::RequestType;
-
- public:
-  ~DiskIoRequestContext() { DCHECK_EQ(state_, Inactive) << "Must be unregistered."; }
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(DiskIoRequestContext);
-  friend class DiskIoMgr;
-
-  class PerDiskState;
-
-  enum State {
-    /// Reader is initialized and maps to a client
-    Active,
-
-    /// Reader is in the process of being cancelled.  Cancellation is coordinated between
-    /// different threads and when they are all complete, the reader context is moved to
-    /// the inactive state.
-    Cancelled,
-
-    /// Reader context does not map to a client.  Accessing memory in this context
-    /// is invalid (i.e. it is equivalent to a dangling pointer).
-    Inactive,
-  };
-
-  DiskIoRequestContext(DiskIoMgr* parent, int num_disks, MemTracker* tracker);
-
-  /// Decrements the number of active disks for this reader.  If the disk count
-  /// goes to 0, the disk complete condition variable is signaled.
-  /// Reader lock must be taken before this call.
-  void DecrementDiskRefCount() {
-    // boost doesn't let us dcheck that the reader lock is taken
-    DCHECK_GT(num_disks_with_ranges_, 0);
-    if (--num_disks_with_ranges_ == 0) {
-      disks_complete_cond_var_.NotifyAll();
-    }
-    DCHECK(Validate()) << std::endl << DebugString();
-  }
-
-  /// Reader & Disk Scheduling: Readers that currently can't do work are not on
-  /// the disk's queue. These readers are ones that don't have any ranges in the
-  /// in_flight_queue AND have not prepared a range by setting next_range_to_start.
-  /// The rule to make sure readers are scheduled correctly is to ensure anytime a
-  /// range is put on the in_flight_queue or anytime next_range_to_start is set to
-  /// NULL, the reader is scheduled.
-
-  /// Adds range to in_flight_ranges, scheduling this reader on the disk threads
-  /// if necessary.
-  /// Reader lock must be taken before this.
-  void ScheduleScanRange(ScanRange* range) {
-    DCHECK_EQ(state_, Active);
-    DCHECK(range != NULL);
-    DiskIoRequestContext::PerDiskState& state = disk_states_[range->disk_id()];
-    state.in_flight_ranges()->Enqueue(range);
-    state.ScheduleContext(this, range->disk_id());
-  }
-
-  /// Cancels the context with status code 'status'
-  void Cancel(const Status& status);
-
-  /// Cancel the context if not already cancelled, wait for all scan ranges to finish
-  /// and mark the context as inactive, after which it cannot be used.
-  void CancelAndMarkInactive();
-
-  /// Adds request range to disk queue for this request context. Currently,
-  /// schedule_immediately must be false if RequestRange is a write range.
-  void AddRequestRange(RequestRange* range, bool schedule_immediately);
-
-  /// Validates invariants of reader.  Reader lock must be taken beforehand.
-  bool Validate() const;
-
-  /// Dumps out reader information.  Lock should be taken by caller
-  std::string DebugString() const;
-
-  /// Parent object
-  DiskIoMgr* const parent_;
-
-  /// Memory used for this reader.  This is unowned by this object.
-  MemTracker* const mem_tracker_;
-
-  /// Total bytes read for this reader
-  RuntimeProfile::Counter* bytes_read_counter_ = nullptr;
-
-  /// Total time spent in hdfs reading
-  RuntimeProfile::Counter* read_timer_ = nullptr;
-
-  /// Number of active read threads
-  RuntimeProfile::Counter* active_read_thread_counter_ = nullptr;
-
-  /// Disk access bitmap. The counter's bit[i] is set if disk id i has been accessed.
-  /// TODO: we can only support up to 64 disks with this bitmap but it lets us use a
-  /// builtin atomic instruction. Probably good enough for now.
-  RuntimeProfile::Counter* disks_accessed_bitmap_ = nullptr;
-
-  /// Total number of bytes read locally, updated at end of each range scan
-  AtomicInt64 bytes_read_local_{0};
-
-  /// Total number of bytes read via short circuit read, updated at end of each range scan
-  AtomicInt64 bytes_read_short_circuit_{0};
-
-  /// Total number of bytes read from data node cache, updated at end of each range scan
-  AtomicInt64 bytes_read_dn_cache_{0};
-
-  /// Total number of bytes from remote reads that were expected to be local.
-  AtomicInt64 unexpected_remote_bytes_{0};
-
-  /// The number of buffers that have been returned to the reader (via GetNext) that the
-  /// reader has not returned. Only included for debugging and diagnostics.
-  AtomicInt32 num_buffers_in_reader_{0};
-
-  /// The number of scan ranges that have been completed for this reader.
-  AtomicInt32 num_finished_ranges_{0};
-
-  /// The number of scan ranges that required a remote read, updated at the end of each
-  /// range scan. Only used for diagnostics.
-  AtomicInt32 num_remote_ranges_{0};
-
-  /// The total number of scan ranges that have not been started. Only used for
-  /// diagnostics. This is the sum of all unstarted_scan_ranges across all disks.
-  AtomicInt32 num_unstarted_scan_ranges_{0};
-
-  /// Total number of file handle opens where the file handle was present in the cache
-  AtomicInt32 cached_file_handles_hit_count_{0};
-
-  /// Total number of file handle opens where the file handle was not in the cache
-  AtomicInt32 cached_file_handles_miss_count_{0};
-
-  /// The number of buffers that are being used for this reader. This is the sum
-  /// of all buffers in ScanRange queues and buffers currently being read into (i.e. about
-  /// to be queued). This includes both IOMgr-allocated buffers and client-provided
-  /// buffers.
-  AtomicInt32 num_used_buffers_{0};
-
-  /// The total number of ready buffers across all ranges.  Ready buffers are buffers
-  /// that have been read from disk but not retrieved by the caller.
-  /// This is the sum of all queued buffers in all ranges for this reader context.
-  AtomicInt32 num_ready_buffers_{0};
-
-  /// All fields below are accessed by multiple threads and the lock needs to be
-  /// taken before accessing them. Must be acquired before ScanRange::lock_ if both
-  /// are held simultaneously.
-  boost::mutex lock_;
-
-  /// Current state of the reader
-  State state_ = Active;
-
-  /// Status of this reader.  Set to non-ok if cancelled.
-  Status status_;
-
-  /// The number of disks with scan ranges remaining (always equal to the sum of
-  /// disks with ranges).
-  int num_disks_with_ranges_ = 0;
-
-  /// This is the list of ranges that are expected to be cached on the DN.
-  /// When the reader asks for a new range (GetNextScanRange()), we first
-  /// return ranges from this list.
-  InternalQueue<ScanRange> cached_ranges_;
-
-  /// A list of ranges that should be returned in subsequent calls to
-  /// GetNextRange.
-  /// There is a trade-off with when to populate this list.  Populating it on
-  /// demand means consumers need to wait (happens in DiskIoMgr::GetNextRange()).
-  /// Populating it preemptively means we make worse scheduling decisions.
-  /// We currently populate one range per disk.
-  /// TODO: think about this some more.
-  InternalQueue<ScanRange> ready_to_start_ranges_;
-  ConditionVariable ready_to_start_ranges_cv_; // used with lock_
-
-  /// Ranges that are blocked due to back pressure on outgoing buffers.
-  InternalQueue<ScanRange> blocked_ranges_;
-
-  /// Condition variable for UnregisterContext() to wait for all disks to complete
-  ConditionVariable disks_complete_cond_var_;
-
-  /// Struct containing state per disk. See comments in the disk read loop on how
-  /// they are used.
-  class PerDiskState {
-   public:
-    bool done() const { return done_; }
-    void set_done(bool b) { done_ = b; }
-
-    int num_remaining_ranges() const { return num_remaining_ranges_; }
-    int& num_remaining_ranges() { return num_remaining_ranges_; }
-
-    ScanRange* next_scan_range_to_start() { return next_scan_range_to_start_; }
-    void set_next_scan_range_to_start(ScanRange* range) {
-      next_scan_range_to_start_ = range;
-    }
-
-    /// We need to have a memory barrier to prevent this load from being reordered
-    /// with num_threads_in_op(), since these variables are set without the reader
-    /// lock taken
-    bool is_on_queue() const {
-      bool b = is_on_queue_;
-      __sync_synchronize();
-      return b;
-    }
-
-    int num_threads_in_op() const {
-      int v = num_threads_in_op_.Load();
-      // TODO: determine whether this barrier is necessary for any callsites.
-      AtomicUtil::MemoryBarrier();
-      return v;
-    }
-
-    const InternalQueue<ScanRange>* unstarted_scan_ranges() const {
-      return &unstarted_scan_ranges_;
-    }
-    const InternalQueue<WriteRange>* unstarted_write_ranges() const {
-      return &unstarted_write_ranges_;
-    }
-    const InternalQueue<RequestRange>* in_flight_ranges() const {
-      return &in_flight_ranges_;
-    }
-
-    InternalQueue<ScanRange>* unstarted_scan_ranges() { return &unstarted_scan_ranges_; }
-    InternalQueue<WriteRange>* unstarted_write_ranges() {
-      return &unstarted_write_ranges_;
-    }
-    InternalQueue<RequestRange>* in_flight_ranges() { return &in_flight_ranges_; }
-
-    /// Schedules the request context on this disk if it's not already on the queue.
-    /// Context lock must be taken before this.
-    void ScheduleContext(DiskIoRequestContext* context, int disk_id);
-
-    /// Increment the ref count on reader.  We need to track the number of threads per
-    /// reader per disk that are in the unlocked hdfs read code section. This is updated
-    /// by multiple threads without a lock so we need to use an atomic int.
-    void IncrementRequestThreadAndDequeue() {
-      num_threads_in_op_.Add(1);
-      is_on_queue_ = false;
-    }
-
-    void DecrementRequestThread() { num_threads_in_op_.Add(-1); }
-
-    /// Decrement request thread count and do final cleanup if this is the last
-    /// thread. RequestContext lock must be taken before this.
-    void DecrementRequestThreadAndCheckDone(DiskIoRequestContext* context) {
-      num_threads_in_op_.Add(-1); // Also acts as a barrier.
-      if (!is_on_queue_ && num_threads_in_op_.Load() == 0 && !done_) {
-        // This thread is the last one for this reader on this disk, do final cleanup
-        context->DecrementDiskRefCount();
-        done_ = true;
-      }
-    }
-
-   private:
-    /// If true, this disk is all done for this request context, including any cleanup.
-    /// If done is true, it means that this request must not be on this disk's queue
-    /// *AND* there are no threads currently working on this context. To satisfy
-    /// this, only the last thread (per disk) can set this to true.
-    bool done_ = true;
-
-    /// For each disk, keeps track if the context is on this disk's queue, indicating
-    /// the disk must do some work for this context. The disk needs to do work in 4 cases:
-    ///  1) in_flight_ranges is not empty, the disk needs to read for this reader.
-    ///  2) next_range_to_start is NULL, the disk needs to prepare a scan range to be
-    ///     read next.
-    ///  3) the reader has been cancelled and this disk needs to participate in the
-    ///     cleanup.
-    ///  4) A write range is added to queue.
-    /// In general, we only want to put a context on the disk queue if there is something
-    /// useful that can be done. If there's nothing useful, the disk queue will wake up
-    /// and then remove the reader from the queue. Doing this causes thrashing of the
-    /// threads.
-    bool is_on_queue_ = false;
-
-    /// For each disk, the number of request ranges that have not been fully read.
-    /// In the non-cancellation path, this will hit 0, and done will be set to true
-    /// by the disk thread. This is undefined in the cancellation path (the various
-    /// threads notice by looking at the DiskIoRequestContext's state_).
-    int num_remaining_ranges_ = 0;
-
-    /// Queue of ranges that have not started being read.  This list is exclusive
-    /// with in_flight_ranges.
-    InternalQueue<ScanRange> unstarted_scan_ranges_;
-
-    /// Queue of pending IO requests for this disk in the order that they will be
-    /// processed. A ScanRange is added to this queue when it is returned in
-    /// GetNextRange(), or when it is added with schedule_immediately = true.
-    /// A WriteRange is added to this queue from unstarted_write_ranges_ for each
-    /// invocation of GetNextRequestRange() in WorkLoop().
-    /// The size of this queue is always less than or equal to num_remaining_ranges.
-    InternalQueue<RequestRange> in_flight_ranges_;
-
-    /// The next range to start for this reader on this disk. Each disk (for each reader)
-    /// picks the next range to start. The range is set here and also added to the
-    /// ready_to_start_ranges_ queue. The reader pulls from the queue in FIFO order,
-    /// so the ranges from different disks are round-robined. When the range is pulled
-    /// off the ready_to_start_ranges_ queue, it sets this variable to NULL, so the disk
-    /// knows to populate it again and add it to ready_to_start_ranges_ i.e. it is used
-    /// as a flag by DiskIoMgr::GetNextScanRange to determine if it needs to add another
-    /// range to ready_to_start_ranges_.
-    ScanRange* next_scan_range_to_start_ = nullptr;
-
-    /// For each disk, the number of threads issuing the underlying read/write on behalf
-    /// of this context. There are a few places where we release the context lock, do some
-    /// work, and then grab the lock again.  Because we don't hold the lock for the
-    /// entire operation, we need this ref count to keep track of which thread should do
-    /// final resource cleanup during cancellation.
-    /// Only the thread that sees the count at 0 should do the final cleanup.
-    AtomicInt32 num_threads_in_op_{0};
-
-    /// Queue of write ranges to process for this disk. A write range is always added
-    /// to in_flight_ranges_ in GetNextRequestRange(). There is a separate
-    /// unstarted_read_ranges_ and unstarted_write_ranges_ to alternate between reads
-    /// and writes. (Otherwise, since next_scan_range_to_start is set
-    /// in GetNextRequestRange() whenever it is null, repeated calls to
-    /// GetNextRequestRange() and GetNextRange() may result in only reads being processed)
-    InternalQueue<WriteRange> unstarted_write_ranges_;
-  };
-
-  /// Per disk states to synchronize multiple disk threads accessing the same request
-  /// context.
-  std::vector<PerDiskState> disk_states_;
-};
-}
-
-#endif
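
The five scan-range states described in the header comment above form a small state machine; a compact restatement (the enum is illustrative and does not exist in the codebase, which tracks these states implicitly via queue membership):

// Sketch of the ScanRange lifecycle from the comment above.
enum class ScanRangeState {
  Unstarted,     // 1) on PerDiskState's unstarted_ranges; nothing read yet
  ReadyToStart,  // 2) on ready_to_start_ranges_; about to be picked up
  InFlight,      // 3) on in_flight_ranges; read by the next disk thread
  Blocked,       // 4) ready-buffer queue full; parked on blocked_ranges_
  Cached,        // 5) on cached_ranges_; served from the HDFS cache if possible
};
// Unblocked flow: Unstarted -> ReadyToStart -> InFlight.
// With back pressure: ... -> InFlight -> (Blocked -> InFlight)*.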

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-scan-range.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-scan-range.cc b/be/src/runtime/disk-io-mgr-scan-range.cc
deleted file mode 100644
index 7f0692e..0000000
--- a/be/src/runtime/disk-io-mgr-scan-range.cc
+++ /dev/null
@@ -1,591 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/disk-io-mgr.h"
-#include "runtime/disk-io-mgr-internal.h"
-#include "util/error-util.h"
-#include "util/hdfs-util.h"
-
-#include "common/names.h"
-
-using namespace impala;
-
-DEFINE_bool(use_hdfs_pread, false, "Enables using hdfsPread() instead of hdfsRead() "
-    "when performing HDFS read operations. This is necessary to use HDFS hedged reads "
-    "(assuming the HDFS client is configured to do so).");
-
-// TODO: Run perf tests and empirically settle on the most optimal default value for the
-// read buffer size. Currently setting it as 128k for the same reason as for S3, i.e.
-// due to JNI array allocation and memcpy overhead, 128k was empirically found to have the
-// least overhead.
-DEFINE_int64(adls_read_chunk_size, 128 * 1024, "The maximum read chunk size to use when "
-    "reading from ADLS.");
-
-// Implementation of the ScanRange functionality. Each ScanRange contains a queue
-// of ready buffers. For each ScanRange, there is only a single producer and
-// consumer thread, i.e. only one disk thread will push to a scan range at
-// any time and only one thread will remove from the queue. This is to guarantee
-// that buffers are queued and read in file order.
-
-bool DiskIoMgr::ScanRange::EnqueueBuffer(
-    const unique_lock<mutex>& reader_lock, unique_ptr<BufferDescriptor> buffer) {
-  DCHECK(reader_lock.mutex() == &reader_->lock_ && reader_lock.owns_lock());
-  {
-    unique_lock<mutex> scan_range_lock(lock_);
-    DCHECK(Validate()) << DebugString();
-    DCHECK(!eosr_returned_);
-    DCHECK(!eosr_queued_);
-    if (is_cancelled_) {
-      // Return the buffer, this range has been cancelled
-      if (buffer->buffer_ != nullptr) {
-        io_mgr_->num_buffers_in_readers_.Add(1);
-        reader_->num_buffers_in_reader_.Add(1);
-      }
-      reader_->num_used_buffers_.Add(-1);
-      io_mgr_->ReturnBuffer(move(buffer));
-      return false;
-    }
-    reader_->num_ready_buffers_.Add(1);
-    eosr_queued_ = buffer->eosr();
-    ready_buffers_.emplace_back(move(buffer));
-
-    DCHECK_LE(ready_buffers_.size(), SCAN_RANGE_READY_BUFFER_LIMIT);
-    blocked_on_queue_ = ready_buffers_.size() == SCAN_RANGE_READY_BUFFER_LIMIT;
-  }
-
-  buffer_ready_cv_.NotifyOne();
-
-  return blocked_on_queue_;
-}
-
-Status DiskIoMgr::ScanRange::GetNext(unique_ptr<BufferDescriptor>* buffer) {
-  DCHECK(*buffer == nullptr);
-  bool eosr;
-  {
-    unique_lock<mutex> scan_range_lock(lock_);
-    if (eosr_returned_) return Status::OK();
-    DCHECK(Validate()) << DebugString();
-
-    while (ready_buffers_.empty() && !is_cancelled_) {
-      buffer_ready_cv_.Wait(scan_range_lock);
-    }
-
-    if (is_cancelled_) {
-      DCHECK(!status_.ok());
-      return status_;
-    }
-
-    // Remove the first ready buffer from the queue and return it
-    DCHECK(!ready_buffers_.empty());
-    DCHECK_LE(ready_buffers_.size(), SCAN_RANGE_READY_BUFFER_LIMIT);
-    *buffer = move(ready_buffers_.front());
-    ready_buffers_.pop_front();
-    eosr_returned_ = (*buffer)->eosr();
-    eosr = (*buffer)->eosr();
-  }
-
-  // Update tracking counters. The buffer has now moved from the IoMgr to the
-  // caller.
-  io_mgr_->num_buffers_in_readers_.Add(1);
-  reader_->num_buffers_in_reader_.Add(1);
-  reader_->num_ready_buffers_.Add(-1);
-  reader_->num_used_buffers_.Add(-1);
-  if (eosr) reader_->num_finished_ranges_.Add(1);
-
-  Status status = (*buffer)->status_;
-  if (!status.ok()) {
-    io_mgr_->ReturnBuffer(move(*buffer));
-    return status;
-  }
-
-  unique_lock<mutex> reader_lock(reader_->lock_);
-
-  DCHECK(reader_->Validate()) << endl << reader_->DebugString();
-  if (reader_->state_ == DiskIoRequestContext::Cancelled) {
-    reader_->blocked_ranges_.Remove(this);
-    Cancel(reader_->status_);
-    io_mgr_->ReturnBuffer(move(*buffer));
-    return status_;
-  }
-
-  {
-    // Check to see if we can re-schedule a blocked range. Note that EnqueueBuffer()
-    // may have been called after we released 'lock_' above so we need to re-check
-    // whether the queue is full.
-    unique_lock<mutex> scan_range_lock(lock_);
-    if (blocked_on_queue_ && ready_buffers_.size() < SCAN_RANGE_READY_BUFFER_LIMIT
-        && !eosr_queued_) {
-      blocked_on_queue_ = false;
-      // This scan range was blocked and no longer is; add it back to the
-      // reader queue.
-      reader_->blocked_ranges_.Remove(this);
-      reader_->ScheduleScanRange(this);
-    }
-  }
-  return Status::OK();
-}
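
The EnqueueBuffer()/GetNext() pair above is an instance of a bounded
single-producer/single-consumer queue guarded by one mutex and a condition
variable. Stripped of the IoMgr bookkeeping, the pattern looks roughly like this
(a generic sketch with hypothetical names, not a drop-in replacement):

    #include <condition_variable>
    #include <cstddef>
    #include <deque>
    #include <mutex>
    #include <utility>

    template <typename T>
    class BoundedSpscQueue {
     public:
      explicit BoundedSpscQueue(size_t capacity) : capacity_(capacity) {}

      // Producer: enqueue and wake the consumer. Returns true if the queue is now
      // at capacity, mirroring EnqueueBuffer()'s "blocked" return value; the
      // producer is expected to stop producing until the consumer catches up.
      bool Put(T item) {
        std::lock_guard<std::mutex> l(lock_);
        queue_.push_back(std::move(item));
        ready_.notify_one();
        return queue_.size() >= capacity_;
      }

      // Consumer: block until an item is ready or the queue is cancelled, like
      // ScanRange::GetNext(). Returns false on cancellation.
      bool Get(T* item) {
        std::unique_lock<std::mutex> l(lock_);
        ready_.wait(l, [this] { return !queue_.empty() || cancelled_; });
        if (cancelled_) return false;
        *item = std::move(queue_.front());
        queue_.pop_front();
        return true;
      }

      // Wake any blocked consumer and fail subsequent Get() calls.
      // (ScanRange::Cancel() additionally drains and returns queued buffers.)
      void Cancel() {
        std::lock_guard<std::mutex> l(lock_);
        cancelled_ = true;
        ready_.notify_all();
      }

     private:
      const size_t capacity_;
      std::mutex lock_;
      std::condition_variable ready_;
      std::deque<T> queue_;
      bool cancelled_ = false;
    };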
-
-void DiskIoMgr::ScanRange::Cancel(const Status& status) {
-  // Cancelling a range that was never started; ignore it.
-  if (io_mgr_ == nullptr) return;
-
-  DCHECK(!status.ok());
-  {
-    // Grab both locks to make sure that all working threads see is_cancelled_.
-    unique_lock<mutex> scan_range_lock(lock_);
-    unique_lock<mutex> hdfs_lock(hdfs_lock_);
-    DCHECK(Validate()) << DebugString();
-    if (is_cancelled_) return;
-    is_cancelled_ = true;
-    status_ = status;
-  }
-  buffer_ready_cv_.NotifyAll();
-  CleanupQueuedBuffers();
-
-  // For cached buffers, we can't close the range until the cached buffer is returned.
-  // Close() is called from DiskIoMgr::ReturnBuffer().
-  if (external_buffer_tag_ != ExternalBufferTag::CACHED_BUFFER) Close();
-}
-
-void DiskIoMgr::ScanRange::CleanupQueuedBuffers() {
-  DCHECK(is_cancelled_);
-  io_mgr_->num_buffers_in_readers_.Add(ready_buffers_.size());
-  reader_->num_buffers_in_reader_.Add(ready_buffers_.size());
-  reader_->num_used_buffers_.Add(-ready_buffers_.size());
-  reader_->num_ready_buffers_.Add(-ready_buffers_.size());
-
-  while (!ready_buffers_.empty()) {
-    io_mgr_->ReturnBuffer(move(ready_buffers_.front()));
-    ready_buffers_.pop_front();
-  }
-}
-
-string DiskIoMgr::ScanRange::DebugString() const {
-  stringstream ss;
-  ss << "file=" << file_ << " disk_id=" << disk_id_ << " offset=" << offset_
-     << " len=" << len_ << " bytes_read=" << bytes_read_
-     << " buffer_queue=" << ready_buffers_.size()
-     << " hdfs_file=" << exclusive_hdfs_fh_;
-  return ss.str();
-}
-
-bool DiskIoMgr::ScanRange::Validate() {
-  if (bytes_read_ > len_) {
-    LOG(WARNING) << "Bytes read tracking is wrong. Shouldn't read past the scan range."
-                 << " bytes_read_=" << bytes_read_ << " len_=" << len_;
-    return false;
-  }
-  if (eosr_returned_ && !eosr_queued_) {
-    LOG(WARNING) << "Returned eosr to reader before finishing reading the scan range"
-                 << " eosr_returned_=" << eosr_returned_
-                 << " eosr_queued_=" << eosr_queued_;
-    return false;
-  }
-  return true;
-}
-
-DiskIoMgr::ScanRange::ScanRange()
-  : RequestRange(RequestType::READ),
-    num_remote_bytes_(0),
-    external_buffer_tag_(ExternalBufferTag::NO_BUFFER),
-    mtime_(-1) {}
-
-DiskIoMgr::ScanRange::~ScanRange() {
-  DCHECK(exclusive_hdfs_fh_ == nullptr) << "File was not closed.";
-  DCHECK(external_buffer_tag_ != ExternalBufferTag::CACHED_BUFFER)
-      << "Cached buffer was not released.";
-}
-
-void DiskIoMgr::ScanRange::Reset(hdfsFS fs, const char* file, int64_t len, int64_t offset,
-    int disk_id, bool expected_local, const BufferOpts& buffer_opts, void* meta_data) {
-  DCHECK(ready_buffers_.empty());
-  DCHECK(file != nullptr);
-  DCHECK_GE(len, 0);
-  DCHECK_GE(offset, 0);
-  DCHECK(buffer_opts.client_buffer_ == nullptr ||
-         buffer_opts.client_buffer_len_ >= len_);
-  fs_ = fs;
-  file_ = file;
-  len_ = len;
-  offset_ = offset;
-  disk_id_ = disk_id;
-  try_cache_ = buffer_opts.try_cache_;
-  mtime_ = buffer_opts.mtime_;
-  expected_local_ = expected_local;
-  num_remote_bytes_ = 0;
-  meta_data_ = meta_data;
-  if (buffer_opts.client_buffer_ != nullptr) {
-    external_buffer_tag_ = ExternalBufferTag::CLIENT_BUFFER;
-    client_buffer_.data = buffer_opts.client_buffer_;
-    client_buffer_.len = buffer_opts.client_buffer_len_;
-  } else {
-    external_buffer_tag_ = ExternalBufferTag::NO_BUFFER;
-  }
-  io_mgr_ = nullptr;
-  reader_ = nullptr;
-  exclusive_hdfs_fh_ = nullptr;
-}
-
-void DiskIoMgr::ScanRange::InitInternal(DiskIoMgr* io_mgr, DiskIoRequestContext* reader) {
-  DCHECK(exclusive_hdfs_fh_ == nullptr);
-  DCHECK(local_file_ == nullptr);
-  // The reader must provide a MemTracker or a buffer.
-  DCHECK(external_buffer_tag_ == ExternalBufferTag::CLIENT_BUFFER
-      || reader->mem_tracker_ != nullptr);
-  io_mgr_ = io_mgr;
-  reader_ = reader;
-  local_file_ = nullptr;
-  exclusive_hdfs_fh_ = nullptr;
-  bytes_read_ = 0;
-  is_cancelled_ = false;
-  eosr_queued_ = false;
-  eosr_returned_ = false;
-  blocked_on_queue_ = false;
-  DCHECK(Validate()) << DebugString();
-}
-
-Status DiskIoMgr::ScanRange::Open(bool use_file_handle_cache) {
-  unique_lock<mutex> hdfs_lock(hdfs_lock_);
-  if (is_cancelled_) return Status::CANCELLED;
-
-  if (fs_ != nullptr) {
-    if (exclusive_hdfs_fh_ != nullptr) return Status::OK();
-    // With file handle caching, the scan range does not maintain its own
-    // hdfs file handle. File handle caching is only used for local files,
-    // so s3 and remote filesystems should obtain an exclusive file handle
-    // for each scan range.
-    if (use_file_handle_cache && expected_local_) return Status::OK();
-    // Get a new exclusive file handle.
-    exclusive_hdfs_fh_ = io_mgr_->GetCachedHdfsFileHandle(fs_, file_string(),
-        mtime(), reader_, true);
-    if (exclusive_hdfs_fh_ == nullptr) {
-      return Status(TErrorCode::DISK_IO_ERROR,
-          GetHdfsErrorMsg("Failed to open HDFS file ", file_));
-    }
-
-    if (hdfsSeek(fs_, exclusive_hdfs_fh_->file(), offset_) != 0) {
-      // Destroy the file handle and remove it from the cache.
-      io_mgr_->ReleaseCachedHdfsFileHandle(file_string(), exclusive_hdfs_fh_, true);
-      exclusive_hdfs_fh_ = nullptr;
-      return Status(TErrorCode::DISK_IO_ERROR,
-          Substitute("Error seeking to $0 in file: $1 $2", offset_, file_,
-          GetHdfsErrorMsg("")));
-    }
-  } else {
-    if (local_file_ != nullptr) return Status::OK();
-
-    local_file_ = fopen(file(), "r");
-    if (local_file_ == nullptr) {
-      return Status(TErrorCode::DISK_IO_ERROR, Substitute("Could not open file: $0: $1",
-            file_, GetStrErrMsg()));
-    }
-    if (fseek(local_file_, offset_, SEEK_SET) == -1) {
-      fclose(local_file_);
-      local_file_ = nullptr;
-      return Status(TErrorCode::DISK_IO_ERROR, Substitute("Could not seek to $0 "
-          "for file: $1: $2", offset_, file_, GetStrErrMsg()));
-    }
-  }
-  if (ImpaladMetrics::IO_MGR_NUM_OPEN_FILES != nullptr) {
-    ImpaladMetrics::IO_MGR_NUM_OPEN_FILES->Increment(1L);
-  }
-  return Status::OK();
-}
-
-void DiskIoMgr::ScanRange::Close() {
-  unique_lock<mutex> hdfs_lock(hdfs_lock_);
-  bool closed_file = false;
-  if (fs_ != nullptr) {
-    if (exclusive_hdfs_fh_ != nullptr) {
-      GetHdfsStatistics(exclusive_hdfs_fh_->file());
-
-      if (external_buffer_tag_ == ExternalBufferTag::CACHED_BUFFER) {
-        hadoopRzBufferFree(exclusive_hdfs_fh_->file(), cached_buffer_);
-        cached_buffer_ = nullptr;
-        external_buffer_tag_ = ExternalBufferTag::NO_BUFFER;
-      }
-
-      // Destroy the file handle and remove it from the cache.
-      io_mgr_->ReleaseCachedHdfsFileHandle(file_string(), exclusive_hdfs_fh_, true);
-      exclusive_hdfs_fh_ = nullptr;
-      closed_file = true;
-    }
-
-    if (FLAGS_use_hdfs_pread) {
-      // Update Hedged Read Metrics.
-      // We call it only if the --use_hdfs_pread flag is set, to avoid having the
-      // libhdfs client malloc and free a hdfsHedgedReadMetrics object unnecessarily
-      // otherwise. 'hedged_metrics' is only set upon success.
-      struct hdfsHedgedReadMetrics* hedged_metrics;
-      int success = hdfsGetHedgedReadMetrics(fs_, &hedged_metrics);
-      if (success == 0) {
-        ImpaladMetrics::HEDGED_READ_OPS->set_value(hedged_metrics->hedgedReadOps);
-        ImpaladMetrics::HEDGED_READ_OPS_WIN->set_value(hedged_metrics->hedgedReadOpsWin);
-        hdfsFreeHedgedReadMetrics(hedged_metrics);
-      }
-    }
-
-    if (num_remote_bytes_ > 0) {
-      reader_->num_remote_ranges_.Add(1L);
-      if (expected_local_) {
-        reader_->unexpected_remote_bytes_.Add(num_remote_bytes_);
-        VLOG_FILE << "Unexpected remote HDFS read of "
-                  << PrettyPrinter::Print(num_remote_bytes_, TUnit::BYTES)
-                  << " for file '" << file_ << "'";
-      }
-    }
-  } else {
-    if (local_file_ == nullptr) return;
-    fclose(local_file_);
-    local_file_ = nullptr;
-    closed_file = true;
-  }
-  if (closed_file && ImpaladMetrics::IO_MGR_NUM_OPEN_FILES != nullptr) {
-    ImpaladMetrics::IO_MGR_NUM_OPEN_FILES->Increment(-1L);
-  }
-}
-
-int64_t DiskIoMgr::ScanRange::MaxReadChunkSize() const {
-  // S3 InputStreams don't support DIRECT_READ (i.e. java.nio.ByteBuffer read()
-  // interface).  So, hdfsRead() needs to allocate a Java byte[] and copy the data out.
-  // Profiles show that both the JNI array allocation and the memcpy adds much more
-  // overhead for larger buffers, so limit the size of each read request.  128K was
-  // chosen empirically by trying values between 4K and 8M and optimizing for lower CPU
-  // utilization and higher S3 throughput.
-  if (disk_id_ == io_mgr_->RemoteS3DiskId()) {
-    DCHECK(IsS3APath(file()));
-    return 128 * 1024;
-  }
-  if (disk_id_ == io_mgr_->RemoteAdlsDiskId()) {
-    DCHECK(IsADLSPath(file()));
-    return FLAGS_adls_read_chunk_size;
-  }
-  // The length argument of hdfsRead() is an int. Ensure we don't overflow it.
-  return numeric_limits<int>::max();
-}
-
-// TODO: how do we best use the disk here? E.g. is it good to break up a
-// 1MB read into 8 128K reads?
-// TODO: look at linux disk scheduling
-Status DiskIoMgr::ScanRange::Read(
-    uint8_t* buffer, int64_t buffer_len, int64_t* bytes_read, bool* eosr) {
-  unique_lock<mutex> hdfs_lock(hdfs_lock_);
-  if (is_cancelled_) return Status::CANCELLED;
-
-  *eosr = false;
-  *bytes_read = 0;
-  // Read until the end of the scan range or the end of the buffer.
-  int bytes_to_read = min(len_ - bytes_read_, buffer_len);
-  DCHECK_GE(bytes_to_read, 0);
-
-  if (fs_ != nullptr) {
-    HdfsFileHandle* borrowed_hdfs_fh = nullptr;
-    hdfsFile hdfs_file;
-
-    // If the scan range has an exclusive file handle, use it. Otherwise, borrow
-    // a file handle from the cache.
-    if (exclusive_hdfs_fh_ != nullptr) {
-      hdfs_file = exclusive_hdfs_fh_->file();
-    } else {
-      borrowed_hdfs_fh = io_mgr_->GetCachedHdfsFileHandle(fs_, file_string(),
-          mtime(), reader_, false);
-      if (borrowed_hdfs_fh == nullptr) {
-        return Status(TErrorCode::DISK_IO_ERROR,
-            GetHdfsErrorMsg("Failed to open HDFS file ", file_));
-      }
-      hdfs_file = borrowed_hdfs_fh->file();
-    }
-
-    int64_t max_chunk_size = MaxReadChunkSize();
-    Status status = Status::OK();
-    while (*bytes_read < bytes_to_read) {
-      int chunk_size = min(bytes_to_read - *bytes_read, max_chunk_size);
-      DCHECK_GE(chunk_size, 0);
-      // The hdfsRead() length argument is an int.
-      DCHECK_LE(chunk_size, numeric_limits<int>::max());
-      int current_bytes_read = -1;
-      // bytes_read_ is only updated after the while loop
-      int64_t position_in_file = offset_ + bytes_read_ + *bytes_read;
-      int num_retries = 0;
-      while (true) {
-        status = Status::OK();
-        // For file handles from the cache, any of the below file operations may fail
-        // due to a bad file handle. In each case, record the error, but allow for a
-        // retry to fix it.
-        if (FLAGS_use_hdfs_pread) {
-          current_bytes_read = hdfsPread(fs_, hdfs_file, position_in_file,
-              buffer + *bytes_read, chunk_size);
-          if (current_bytes_read == -1) {
-            status = Status(TErrorCode::DISK_IO_ERROR,
-                GetHdfsErrorMsg("Error reading from HDFS file: ", file_));
-          }
-        } else {
-          // If the file handle is borrowed, it may not be at the appropriate
-          // location. Seek to the appropriate location.
-          bool seek_failed = false;
-          if (borrowed_hdfs_fh != nullptr) {
-            if (hdfsSeek(fs_, hdfs_file, position_in_file) != 0) {
-              status = Status(TErrorCode::DISK_IO_ERROR, Substitute("Error seeking to $0 "
-                  " in file: $1: $2", position_in_file, file_, GetHdfsErrorMsg("")));
-              seek_failed = true;
-            }
-          }
-          if (!seek_failed) {
-            current_bytes_read = hdfsRead(fs_, hdfs_file, buffer + *bytes_read,
-                chunk_size);
-            if (current_bytes_read == -1) {
-              status = Status(TErrorCode::DISK_IO_ERROR,
-                  GetHdfsErrorMsg("Error reading from HDFS file: ", file_));
-            }
-          }
-        }
-
-        // Do not retry:
-        // - if read was successful (current_bytes_read != -1)
-        // - or if already retried once
-        // - or if this is not using a borrowed file handle
-        DCHECK_LE(num_retries, 1);
-        if (current_bytes_read != -1 || borrowed_hdfs_fh == nullptr ||
-            num_retries == 1) {
-          break;
-        }
-        // The error may be due to a bad file handle. Reopen the file handle and retry.
-        ++num_retries;
-        RETURN_IF_ERROR(io_mgr_->ReopenCachedHdfsFileHandle(fs_, file_string(),
-            mtime(), &borrowed_hdfs_fh));
-        hdfs_file = borrowed_hdfs_fh->file();
-      }
-      if (!status.ok()) break;
-      if (current_bytes_read == 0) {
-        // No more bytes in the file. The scan range went past the end.
-        *eosr = true;
-        break;
-      }
-      *bytes_read += current_bytes_read;
-
-      // Collect and accumulate statistics
-      GetHdfsStatistics(hdfs_file);
-    }
-
-    if (borrowed_hdfs_fh != nullptr) {
-      io_mgr_->ReleaseCachedHdfsFileHandle(file_string(), borrowed_hdfs_fh, false);
-    }
-    if (!status.ok()) return status;
-  } else {
-    DCHECK(local_file_ != nullptr);
-    *bytes_read = fread(buffer, 1, bytes_to_read, local_file_);
-    DCHECK_GE(*bytes_read, 0);
-    DCHECK_LE(*bytes_read, bytes_to_read);
-    if (*bytes_read < bytes_to_read) {
-      if (ferror(local_file_) != 0) {
-        return Status(TErrorCode::DISK_IO_ERROR, Substitute("Error reading from $0 "
-            "at byte offset: $1: $2", file_, offset_ + bytes_read_, GetStrErrMsg()));
-      } else {
-        // On Linux, we should only get partial reads from block devices on error or eof.
-        DCHECK(feof(local_file_) != 0);
-        *eosr = true;
-      }
-    }
-  }
-  bytes_read_ += *bytes_read;
-  DCHECK_LE(bytes_read_, len_);
-  if (bytes_read_ == len_) *eosr = true;
-  return Status::OK();
-}
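
The inner loop above implements a "retry once on a possibly-stale cached handle"
idiom. Distilled to its essentials it looks like this (a hypothetical helper, not
part of this file; 'read' and 'reopen' stand in for the hdfsRead()/hdfsPread()
call and ReopenCachedHdfsFileHandle() respectively):

    template <typename Handle, typename ReadOp, typename ReopenOp>
    Status ReadWithRetry(Handle* handle, bool is_borrowed, ReadOp read,
        ReopenOp reopen) {
      Status status = read(*handle);
      if (status.ok() || !is_borrowed) return status;
      // The failure may be due to a stale cached handle (e.g. the file was
      // replaced since the handle was opened): reopen it and try exactly once more.
      RETURN_IF_ERROR(reopen(handle));
      return read(*handle);
    }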
-
-Status DiskIoMgr::ScanRange::ReadFromCache(
-    const unique_lock<mutex>& reader_lock, bool* read_succeeded) {
-  DCHECK(reader_lock.mutex() == &reader_->lock_ && reader_lock.owns_lock());
-  DCHECK(try_cache_);
-  DCHECK_EQ(bytes_read_, 0);
-  *read_succeeded = false;
-  Status status = Open(false);
-  if (!status.ok()) return status;
-
-  // Cached reads not supported on local filesystem.
-  if (fs_ == nullptr) return Status::OK();
-
-  {
-    unique_lock<mutex> hdfs_lock(hdfs_lock_);
-    if (is_cancelled_) return Status::CANCELLED;
-
-    DCHECK(exclusive_hdfs_fh_ != nullptr);
-    DCHECK(external_buffer_tag_ == ExternalBufferTag::NO_BUFFER);
-    cached_buffer_ =
-      hadoopReadZero(exclusive_hdfs_fh_->file(), io_mgr_->cached_read_options_, len());
-    if (cached_buffer_ != nullptr) {
-      external_buffer_tag_ = ExternalBufferTag::CACHED_BUFFER;
-    }
-  }
-  // Data was not cached, caller will fall back to normal read path.
-  if (external_buffer_tag_ != ExternalBufferTag::CACHED_BUFFER) {
-    VLOG_QUERY << "Cache read failed for scan range: " << DebugString()
-               << ". Switching to disk read path.";
-    // Clean up the scan range state before re-issuing it.
-    Close();
-    return Status::OK();
-  }
-
-  // Cached read returned a buffer, verify we read the correct amount of data.
-  void* buffer = const_cast<void*>(hadoopRzBufferGet(cached_buffer_));
-  int32_t bytes_read = hadoopRzBufferLength(cached_buffer_);
-  // A partial read can happen when files are truncated.
-  // TODO: If HDFS ever supports partially cached blocks, we'll have to distinguish
-  // between errors and partially cached blocks here.
-  if (bytes_read < len()) {
-    VLOG_QUERY << "Error reading file from HDFS cache: " << file_ << ". Expected "
-      << len() << " bytes, but read " << bytes_read << ". Switching to disk read path.";
-    // Close the scan range. 'read_succeeded' is still false, so the caller will fall back
-    // to non-cached read of this scan range.
-    Close();
-    return Status::OK();
-  }
-
-  // Create a single buffer desc for the entire scan range and enqueue that.
-  // 'mem_tracker' is nullptr because the memory is owned by the HDFS java client,
-  // not the Impala backend.
-  unique_ptr<BufferDescriptor> desc = unique_ptr<BufferDescriptor>(new BufferDescriptor(
-      io_mgr_, reader_, this, reinterpret_cast<uint8_t*>(buffer), 0, nullptr));
-  desc->len_ = bytes_read;
-  desc->scan_range_offset_ = 0;
-  desc->eosr_ = true;
-  bytes_read_ = bytes_read;
-  EnqueueBuffer(reader_lock, move(desc));
-  if (reader_->bytes_read_counter_ != nullptr) {
-    COUNTER_ADD(reader_->bytes_read_counter_, bytes_read);
-  }
-  *read_succeeded = true;
-  reader_->num_used_buffers_.Add(1);
-  return Status::OK();
-}
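
ReadFromCache() is built on the libhdfs zero-copy read API. Reduced to its
essentials, the call sequence is roughly the following (a sketch against the
public libhdfs functions; options setup and error handling elided, identifiers
illustrative):

    // Attempt a zero-copy read of 'len' bytes from the HDFS cache. Returns NULL
    // if the data is not cached, in which case the caller falls back to the
    // normal disk read path, exactly as ReadFromCache() does above.
    struct hadoopRzBuffer* TryCachedRead(hdfsFile file,
        struct hadoopRzOptions* opts, int32_t len, const void** data,
        int32_t* bytes_read) {
      struct hadoopRzBuffer* buf = hadoopReadZero(file, opts, len);
      if (buf == NULL) return NULL;             // not cached (or read error)
      *data = hadoopRzBufferGet(buf);           // memory owned by the HDFS client
      *bytes_read = hadoopRzBufferLength(buf);  // may be < len, e.g. truncated file
      return buf;  // the caller must later call hadoopRzBufferFree(file, buf)
    }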
-
-void DiskIoMgr::ScanRange::GetHdfsStatistics(hdfsFile hdfs_file) {
-  struct hdfsReadStatistics* stats;
-  if (IsHdfsPath(file())) {
-    int success = hdfsFileGetReadStatistics(hdfs_file, &stats);
-    if (success == 0) {
-      reader_->bytes_read_local_.Add(stats->totalLocalBytesRead);
-      reader_->bytes_read_short_circuit_.Add(stats->totalShortCircuitBytesRead);
-      reader_->bytes_read_dn_cache_.Add(stats->totalZeroCopyBytesRead);
-      if (stats->totalLocalBytesRead != stats->totalBytesRead) {
-        num_remote_bytes_ += stats->totalBytesRead - stats->totalLocalBytesRead;
-      }
-      hdfsFileFreeReadStatistics(stats);
-    }
-    hdfsFileClearReadStatistics(hdfs_file);
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-stress-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-stress-test.cc b/be/src/runtime/disk-io-mgr-stress-test.cc
deleted file mode 100644
index 7ae9515..0000000
--- a/be/src/runtime/disk-io-mgr-stress-test.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/disk-io-mgr-stress.h"
-#include "util/cpu-info.h"
-#include "util/string-parser.h"
-
-#include "common/names.h"
-
-using namespace impala;
-
-// Simple utility to run the disk io stress test.  An optional second parameter
-// can be passed to control how long to run this test (0 for forever).
-
-// TODO: make these configurable once we decide how to run BE tests with args
-const int DEFAULT_DURATION_SEC = 1;
-const int NUM_DISKS = 5;
-const int NUM_THREADS_PER_DISK = 5;
-const int NUM_CLIENTS = 10;
-const bool TEST_CANCELLATION = true;
-
-int main(int argc, char** argv) {
-  google::InitGoogleLogging(argv[0]);
-  CpuInfo::Init();
-  OsInfo::Init();
-  impala::InitThreading();
-  int duration_sec = DEFAULT_DURATION_SEC;
-
-  if (argc == 2) {
-    StringParser::ParseResult status;
-    duration_sec = StringParser::StringToInt<int>(argv[1], strlen(argv[1]), &status);
-    if (status != StringParser::PARSE_SUCCESS) {
-      printf("Invalid arg: %s\n", argv[1]);
-      return 1;
-    }
-  }
-  if (duration_sec != 0) {
-    printf("Running stress test for %d seconds.\n", duration_sec);
-  } else {
-    printf("Running stress test indefinitely.\n");
-  }
-  DiskIoMgrStress test(NUM_DISKS, NUM_THREADS_PER_DISK, NUM_CLIENTS, TEST_CANCELLATION);
-  test.Run(duration_sec);
-
-  return 0;
-}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-stress.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-stress.cc b/be/src/runtime/disk-io-mgr-stress.cc
deleted file mode 100644
index a98c3a4..0000000
--- a/be/src/runtime/disk-io-mgr-stress.cc
+++ /dev/null
@@ -1,246 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <boost/thread/mutex.hpp>
-
-#include "runtime/disk-io-mgr-stress.h"
-
-#include "runtime/disk-io-mgr-reader-context.h"
-#include "util/time.h"
-
-#include "common/names.h"
-
-using namespace impala;
-
-static const float ABORT_CHANCE = .10f;
-static const int MIN_READ_LEN = 1;
-static const int MAX_READ_LEN = 20;
-
-static const int MIN_FILE_LEN = 10;
-static const int MAX_FILE_LEN = 1024;
-
-// Make sure this is between MIN_FILE_LEN and MAX_FILE_LEN to test more cases
-static const int MIN_READ_BUFFER_SIZE = 64;
-static const int MAX_READ_BUFFER_SIZE = 128;
-
-static const int CANCEL_READER_PERIOD_MS = 20;  // in ms
-
-static void CreateTempFile(const char* filename, const char* data) {
-  FILE* file = fopen(filename, "w");
-  CHECK(file != NULL);
-  fwrite(data, 1, strlen(data), file);
-  fclose(file);
-}
-
-string GenerateRandomData() {
-  int rand_len = rand() % (MAX_FILE_LEN - MIN_FILE_LEN) + MIN_FILE_LEN;
-  stringstream ss;
-  for (int i = 0; i < rand_len; ++i) {
-    char c = rand() % 26 + 'a';
-    ss << c;
-  }
-  return ss.str();
-}
-
-struct DiskIoMgrStress::Client {
-  boost::mutex lock;
-  unique_ptr<DiskIoRequestContext> reader;
-  int file_idx;
-  vector<DiskIoMgr::ScanRange*> scan_ranges;
-  int abort_at_byte;
-  int files_processed;
-};
-
-DiskIoMgrStress::DiskIoMgrStress(int num_disks, int num_threads_per_disk,
-     int num_clients, bool includes_cancellation) :
-    num_clients_(num_clients),
-    includes_cancellation_(includes_cancellation) {
-
-  time_t rand_seed = time(NULL);
-  LOG(INFO) << "Running with rand seed: " << rand_seed;
-  srand(rand_seed);
-
-  io_mgr_.reset(new DiskIoMgr(num_disks, num_threads_per_disk, num_threads_per_disk,
-      MIN_READ_BUFFER_SIZE, MAX_READ_BUFFER_SIZE));
-  Status status = io_mgr_->Init(&mem_tracker_);
-  CHECK(status.ok());
-
-  // Initialize some data files.  It doesn't really matter how many there are.
-  files_.resize(num_clients * 2);
-  for (int i = 0; i < files_.size(); ++i) {
-    stringstream ss;
-    ss << "/tmp/disk_io_mgr_stress_file" << i;
-    files_[i].filename = ss.str();
-    files_[i].data = GenerateRandomData();
-    CreateTempFile(files_[i].filename.c_str(), files_[i].data.c_str());
-  }
-
-  clients_ = new Client[num_clients_];
-  client_mem_trackers_.resize(num_clients_);
-  for (int i = 0; i < num_clients_; ++i) {
-    NewClient(i);
-  }
-}
-
-void DiskIoMgrStress::ClientThread(int client_id) {
-  Client* client = &clients_[client_id];
-  Status status;
-  char read_buffer[MAX_FILE_LEN];
-
-  while (!shutdown_) {
-    bool eos = false;
-    int bytes_read = 0;
-
-    const string& expected = files_[client->file_idx].data;
-
-    while (!eos) {
-      DiskIoMgr::ScanRange* range;
-      Status status = io_mgr_->GetNextRange(client->reader.get(), &range);
-      CHECK(status.ok() || status.IsCancelled());
-      if (range == NULL) break;
-
-      while (true) {
-        unique_ptr<DiskIoMgr::BufferDescriptor> buffer;
-        status = range->GetNext(&buffer);
-        CHECK(status.ok() || status.IsCancelled());
-        if (buffer == NULL) break;
-
-        int64_t scan_range_offset = buffer->scan_range_offset();
-        int len = buffer->len();
-        CHECK_GE(scan_range_offset, 0);
-        CHECK_LT(scan_range_offset, expected.size());
-        CHECK_GT(len, 0);
-
-        // We get scan ranges back in arbitrary order, so convert the offset within
-        // the scan range to the offset within the file.
-        int64_t file_offset = scan_range_offset + range->offset();
-
-        // Validate the bytes read
-        CHECK_LE(file_offset + len, expected.size());
-        CHECK_EQ(strncmp(reinterpret_cast<char*>(buffer->buffer()),
-                     &expected.c_str()[file_offset], len), 0);
-
-        // Copy the bytes from this read into the result buffer.
-        memcpy(read_buffer + file_offset, buffer->buffer(), buffer->len());
-        io_mgr_->ReturnBuffer(move(buffer));
-        bytes_read += len;
-
-        CHECK_GE(bytes_read, 0);
-        CHECK_LE(bytes_read, expected.size());
-
-        if (bytes_read > client->abort_at_byte) {
-          eos = true;
-          break;
-        }
-      } // End of buffer
-    } // End of scan range
-
-    if (bytes_read == expected.size()) {
-      // This entire file was read without being cancelled, validate the entire result
-      CHECK(status.ok());
-      CHECK_EQ(strncmp(read_buffer, expected.c_str(), bytes_read), 0);
-    }
-
-    // Unregister the old client and get a new one
-    unique_lock<mutex> lock(client->lock);
-    io_mgr_->UnregisterContext(client->reader.get());
-    NewClient(client_id);
-  }
-
-  unique_lock<mutex> lock(client->lock);
-  io_mgr_->UnregisterContext(client->reader.get());
-  client->reader = NULL;
-}
-
-// Cancel a random reader
-void DiskIoMgrStress::CancelRandomReader() {
-  if (!includes_cancellation_) return;
-
-  int rand_client = rand() % num_clients_;
-
-  unique_lock<mutex> lock(clients_[rand_client].lock);
-  io_mgr_->CancelContext(clients_[rand_client].reader.get());
-}
-
-void DiskIoMgrStress::Run(int sec) {
-  shutdown_ = false;
-  for (int i = 0; i < num_clients_; ++i) {
-    readers_.add_thread(
-        new thread(&DiskIoMgrStress::ClientThread, this, i));
-  }
-
-  // Sleep and let the clients do their thing for 'sec'
-  for (int loop_count = 1; sec == 0 || loop_count <= sec; ++loop_count) {
-    int iter = 1000 / CANCEL_READER_PERIOD_MS;
-    for (int i = 0; i < iter; ++i) {
-      SleepForMs(CANCEL_READER_PERIOD_MS);
-      CancelRandomReader();
-    }
-    LOG(ERROR) << "Finished iteration: " << loop_count;
-  }
-
-  // Signal shutdown for the client threads
-  shutdown_ = true;
-
-  for (int i = 0; i < num_clients_; ++i) {
-    unique_lock<mutex> lock(clients_[i].lock);
-    if (clients_[i].reader != NULL) io_mgr_->CancelContext(clients_[i].reader.get());
-  }
-
-  readers_.join_all();
-}
-
-// Initialize a client to read one of the files at random.  The scan ranges are
-// assigned randomly.
-void DiskIoMgrStress::NewClient(int i) {
-  Client& client = clients_[i];
-  ++client.files_processed;
-  client.file_idx = rand() % files_.size();
-  int file_len = files_[client.file_idx].data.size();
-
-  client.abort_at_byte = file_len;
-
-  if (includes_cancellation_) {
-    float rand_value = rand() / (float)RAND_MAX;
-    if (rand_value < ABORT_CHANCE) {
-      // Abort at a random byte inside the file
-      client.abort_at_byte = rand() % file_len;
-    }
-  }
-
-  for (int i = 0; i < client.scan_ranges.size(); ++i) {
-    delete client.scan_ranges[i];
-  }
-  client.scan_ranges.clear();
-
-  int assigned_len = 0;
-  while (assigned_len < file_len) {
-    int range_len = rand() % (MAX_READ_LEN - MIN_READ_LEN) + MIN_READ_LEN;
-    range_len = min(range_len, file_len - assigned_len);
-
-    DiskIoMgr::ScanRange* range = new DiskIoMgr::ScanRange();
-    range->Reset(NULL, files_[client.file_idx].filename.c_str(), range_len, assigned_len,
-        0, false, DiskIoMgr::BufferOpts::Uncached());
-    client.scan_ranges.push_back(range);
-    assigned_len += range_len;
-  }
-
-  client_mem_trackers_[i].reset(new MemTracker(-1, "", &mem_tracker_));
-  client.reader = io_mgr_->RegisterContext(client_mem_trackers_[i].get());
-  Status status = io_mgr_->AddScanRanges(client.reader.get(), client.scan_ranges);
-  CHECK(status.ok());
-}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr-stress.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr-stress.h b/be/src/runtime/disk-io-mgr-stress.h
deleted file mode 100644
index 0a66f2c..0000000
--- a/be/src/runtime/disk-io-mgr-stress.h
+++ /dev/null
@@ -1,94 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-
-#ifndef IMPALA_RUNTIME_DISK_IO_MGR_STRESS_H
-#define IMPALA_RUNTIME_DISK_IO_MGR_STRESS_H
-
-#include <memory>
-#include <vector>
-#include <boost/scoped_ptr.hpp>
-#include <boost/thread/thread.hpp>
-
-#include "runtime/disk-io-mgr.h"
-#include "runtime/mem-tracker.h"
-#include "runtime/thread-resource-mgr.h"
-
-namespace impala {
-
-/// Test utility to stress the disk io mgr.  It allows for a configurable
-/// number of clients.  The clients continuously issue work to the io mgr and
-/// asynchronously get cancelled.  The stress test can be run forever or for
-/// a fixed duration.  The unit test runs this for a fixed duration.
-class DiskIoMgrStress {
- public:
-  DiskIoMgrStress(int num_disks, int num_threads_per_disk, int num_clients,
-      bool includes_cancellation);
-
-  /// Run the test for 'sec'.  If 0, run forever
-  void Run(int sec);
-
- private:
-  struct Client;
-
-  struct File {
-    std::string filename;
-    std::string data;  // the data in the file, used to validate
-  };
-
-
-  /// Files used for testing.  These are created at startup and recycled
-  /// during the test
-  std::vector<File> files_;
-
-  /// Root mem tracker.
-  MemTracker mem_tracker_;
-
-  /// io manager
-  boost::scoped_ptr<DiskIoMgr> io_mgr_;
-
-  /// Thread group for reader threads
-  boost::thread_group readers_;
-
-  /// Array of clients
-  int num_clients_;
-  Client* clients_;
-
-  /// Client MemTrackers, one per client.
-  std::vector<std::unique_ptr<MemTracker>> client_mem_trackers_;
-
-  /// If true, tests cancelling readers
-  bool includes_cancellation_;
-
-  /// Flag to signal that client reader threads should exit
-  volatile bool shutdown_;
-
-  /// Helper to initialize a new reader client, registering a new reader with the
-  /// io mgr and initializing the scan ranges
-  void NewClient(int i);
-
-  /// Thread running the reader.  When the current reader is done (either normally
-  /// or cancelled), it picks up a new reader
-  void ClientThread(int client_id);
-
-  /// Possibly cancels a random reader.
-  void CancelRandomReader();
-};
-
-}
-
-#endif


[12/16] incubator-impala git commit: IMPALA-4835 (prep only): create io subfolder and namespace

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/disk-io-mgr.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr.h b/be/src/runtime/disk-io-mgr.h
deleted file mode 100644
index 49de0ff..0000000
--- a/be/src/runtime/disk-io-mgr.h
+++ /dev/null
@@ -1,972 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_DISK_IO_MGR_H
-#define IMPALA_RUNTIME_DISK_IO_MGR_H
-
-#include <deque>
-#include <functional>
-#include <vector>
-
-#include <boost/scoped_ptr.hpp>
-#include <boost/unordered_set.hpp>
-#include <boost/thread/mutex.hpp>
-
-#include "common/atomic.h"
-#include "common/hdfs.h"
-#include "common/object-pool.h"
-#include "common/status.h"
-#include "runtime/disk-io-mgr-handle-cache.h"
-#include "runtime/thread-resource-mgr.h"
-#include "util/aligned-new.h"
-#include "util/bit-util.h"
-#include "util/condition-variable.h"
-#include "util/error-util.h"
-#include "util/internal-queue.h"
-#include "util/runtime-profile.h"
-#include "util/thread.h"
-
-namespace impala {
-
-class MemTracker;
-
-/// Manager object that schedules IO for all queries on all disks and remote filesystems
-/// (such as S3). Each query maps to one or more DiskIoRequestContext objects, each of which
-/// has its own queue of scan ranges and/or write ranges.
-//
-/// The API splits up requesting scan/write ranges (non-blocking) and reading the data
-/// (blocking). The DiskIoMgr has worker threads that will read from and write to
-/// disk/hdfs/remote-filesystems, allowing interleaving of IO and CPU. This allows us to
-/// keep all disks and all cores as busy as possible.
-//
-/// All public APIs are thread-safe. It is not valid to call any of the APIs after
-/// UnregisterContext() returns.
-//
-/// For Readers:
-/// We can model this problem as a multiple producer (threads for each disk), multiple
-/// consumer (scan ranges) problem. There are multiple queues that need to be
-/// synchronized. Conceptually, there are two queues:
-///   1. The per disk queue: this contains a queue of readers that need reads.
-///   2. The per scan range ready-buffer queue: this contains buffers that have been
-///      read and are ready for the caller.
-/// The disk queue contains a queue of readers and is scheduled in a round robin fashion.
-/// Readers map to scan nodes. The reader then contains a queue of scan ranges. The caller
-/// asks the IoMgr for the next range to process. The IoMgr then selects the best range
-/// to read based on disk activity and begins reading and queuing buffers for that range.
-/// TODO: We should map readers to queries. A reader is the unit of scheduling and queries
-/// that have multiple scan nodes shouldn't have more 'turns'.
-//
-/// For Writers:
-/// Data is written via AddWriteRange(). This is non-blocking and adds a WriteRange to a
-/// per-disk queue. After the write is complete, a callback in WriteRange is invoked.
-/// No memory is allocated within IoMgr for writes and no copies are made. It is the
-/// responsibility of the client to ensure that the data to be written is valid and that
-/// the file to be written to exists until the callback is invoked.
-//
-/// The IoMgr provides three key APIs.
-///  1. AddScanRanges: this is non-blocking and tells the IoMgr all the ranges that
-///     will eventually need to be read.
-///  2. GetNextRange: returns to the caller the next scan range it should process.
-///     This is based on disk load. This also begins reading the data in this scan
-///     range. This is blocking.
-///  3. ScanRange::GetNext: returns the next buffer for this range.  This is blocking.
-//
-/// The disk threads do not synchronize with each other. The readers and writers don't
-/// synchronize with each other. There is a lock and condition variable for each request
-/// context queue and each disk queue.
-/// IMPORTANT: whenever both locks are needed, the lock order is to grab the context lock
-/// before the disk lock.
-//
-/// Scheduling: If there are multiple request contexts with work for a single disk, the
-/// request contexts are scheduled in round-robin order. Multiple disk threads can
-/// operate on the same request context. Exactly one request range is processed by a
-/// disk thread at a time. If there are multiple scan ranges scheduled via
-/// GetNextRange() for a single context, these are processed in round-robin order.
-/// If there are multiple scan and write ranges for a disk, a read is always followed
-/// by a write, and a write is followed by a read, i.e. reads and writes alternate.
-/// If multiple write ranges are enqueued for a single disk, they will be processed
-/// by the disk threads in order, but may complete in any order. No guarantees are made
-/// on ordering of writes across disks.
-//
-/// Resource Management: effective resource management in the IoMgr is key to good
-/// performance. The IoMgr helps coordinate two resources: CPU and disk. For CPU,
-/// spinning up too many threads causes thrashing.
-/// Memory usage in the IoMgr comes from queued read buffers.  If we queue the minimum
-/// (i.e. 1), then the disks are idle while we are processing the buffer. If we don't
-/// limit the queue, then it is possible we end up queueing the entire data set (i.e. CPU
-/// is slower than disks) and run out of memory.
-/// For both CPU and memory, we want to model the machine as having a fixed amount of
-/// resources.  If a single query is running, it should saturate either CPU or Disk
-/// as well as using as little memory as possible. With multiple queries, each query
-/// should get less CPU. In that case each query will need fewer queued buffers and
-/// therefore have less memory usage.
-//
-/// The IoMgr defers CPU management to the caller. The IoMgr provides a GetNextRange
-/// API which will return the next scan range the caller should process. The caller
-/// can call this from the desired number of reading threads. Once a scan range
-/// has been returned via GetNextRange, the IoMgr will start to buffer reads for
-/// that range and it is expected the caller will pull those buffers promptly. For
-/// example, if the caller would like to have 1 scanner thread, the read loop
-/// would look like:
-///   while (more_ranges)
-///     range = GetNextRange()
-///     while (!range.eosr)
-///       buffer = range.GetNext()
-/// To have multiple reading threads, the caller would simply spin up the threads
-/// and each would process the loops above.
-//
-/// To control the number of IO buffers, each scan range has a limit of two queued
-/// buffers (SCAN_RANGE_READY_BUFFER_LIMIT). If the number of buffers is at capacity,
-/// the IoMgr will no longer read for that scan range until the caller has processed
-/// a buffer. Assuming the client returns each buffer before requesting the next one
-/// from the scan range, this will consume up to 3 * 8MB = 24MB of I/O buffers per
-/// scan range.
-//
-/// Buffer Management:
-/// Buffers for reads are either a) allocated by the IoMgr and transferred to the caller,
-/// b) cached HDFS buffers if the scan range uses HDFS caching, or c) provided by the
-/// caller when constructing the scan range.
-///
-/// As a caller reads from a scan range, these buffers are wrapped in BufferDescriptors
-/// and returned to the caller. The caller must always call ReturnBuffer() on the buffer
-/// descriptor to allow recycling of the associated buffer (if there is an
-/// IoMgr-allocated or HDFS cached buffer).
-///
-/// Caching support:
-/// Scan ranges contain metadata on whether or not it is cached on the DN. In that
-/// case, we use the HDFS APIs to read the cached data without doing any copies. For these
-/// ranges, the reads happen on the caller thread (as opposed to the disk threads).
-/// It is possible for the cached read APIs to fail, in which case the ranges are then
-/// queued on the disk threads and behave identically to the case where the range
-/// is not cached.
-/// Resources for these ranges are also not accounted against the reader because none
-/// are consumed.
-/// While a cached block is being processed, the block is mlocked. We want to minimize
-/// the time the mlock is held.
-///   - HDFS will time us out if we hold onto the mlock for too long
-///   - Holding the lock prevents uncaching this file due to a caching policy change.
-/// Therefore, we only issue the cached read when the caller is ready to process the
-/// range (GetNextRange()) instead of when the ranges are issued. This guarantees that
-/// there will be a CPU available to process the buffer and any throttling we do with
-/// the number of scanner threads properly controls the amount of files we mlock.
-/// With cached scan ranges, we cannot close the scan range until the cached buffer
-/// is returned (HDFS does not allow this). We therefore need to defer the close until
-/// the cached buffer is returned (ReturnBuffer()).
-//
-/// Remote filesystem support (e.g. S3):
-/// Remote filesystems are modeled as "remote disks". That is, there is a separate disk
-/// queue for each supported remote filesystem type. In order to maximize throughput,
-/// multiple connections are opened in parallel by having multiple threads running per
-/// queue. Also note that reading from a remote filesystem service can be more CPU
-/// intensive than local disk/hdfs because of non-direct I/O and SSL processing, and can
-/// be CPU bottlenecked especially if not enough I/O threads for these queues are
-/// started.
-//
-/// TODO: IoMgr should be able to request additional scan ranges from the coordinator
-/// to help deal with stragglers.
-/// TODO: look into using a lock free queue
-/// TODO: simplify the common path (less locking, memory allocations).
-/// TODO: Break this up the .h/.cc into multiple files under an /io subdirectory.
-//
-/// Structure of the Implementation:
-///  - All client APIs are defined in this file
-///  - Internal classes are defined in disk-io-mgr-internal.h
-///  - ScanRange APIs are implemented in disk-io-mgr-scan-range.cc
-///    This contains the ready buffer queue logic
-///  - DiskIoRequestContext APIs are implemented in disk-io-mgr-reader-context.cc
-///    This contains the logic for picking scan ranges for a reader.
-///  - Disk Thread and general APIs are implemented in disk-io-mgr.cc.
-
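
Concretely, the single-threaded read loop sketched in the comment above maps onto
the APIs declared in this file roughly as follows ('io_mgr' and 'reader' are
assumed to have been set up via RegisterContext()/AddScanRanges(); error handling
abbreviated):

    while (true) {
      DiskIoMgr::ScanRange* range;
      RETURN_IF_ERROR(io_mgr->GetNextRange(reader, &range));
      if (range == nullptr) break;  // no more ranges for this reader
      bool eosr = false;
      while (!eosr) {
        std::unique_ptr<DiskIoMgr::BufferDescriptor> buffer;
        RETURN_IF_ERROR(range->GetNext(&buffer));  // blocks until a buffer is ready
        eosr = buffer->eosr();
        // ... process buffer->buffer() / buffer->len() ...
        io_mgr->ReturnBuffer(std::move(buffer));  // allows the buffer to be recycled
      }
    }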
-class DiskIoRequestContext;
-
-// This is cache line aligned because the FileHandleCache needs cache line alignment
-// for its partitions.
-class DiskIoMgr : public CacheLineAligned {
- public:
-  class ScanRange;
-
-  /// Buffer struct that is used by the caller and IoMgr to pass read buffers.
-  /// It is expected that only one thread has ownership of this object at a
-  /// time.
-  class BufferDescriptor {
-   public:
-    ~BufferDescriptor() {
-      DCHECK(buffer_ == nullptr); // Check we didn't leak a buffer.
-    }
-
-    ScanRange* scan_range() { return scan_range_; }
-    uint8_t* buffer() { return buffer_; }
-    int64_t buffer_len() { return buffer_len_; }
-    int64_t len() { return len_; }
-    bool eosr() { return eosr_; }
-
-    /// Returns the offset within the scan range that this buffer starts at
-    int64_t scan_range_offset() const { return scan_range_offset_; }
-
-    /// Transfer ownership of buffer memory from 'mem_tracker_' to 'dst' and set
-    /// 'mem_tracker_' to 'dst'. 'mem_tracker_' and 'dst' must be non-NULL. Does not
-    /// check memory limits on 'dst': the caller should check the memory limit if a
-    /// different memory limit may apply to 'dst'. If the buffer was a client-provided
-    /// buffer, transferring is not allowed.
-    /// TODO: IMPALA-3209: revisit this as part of scanner memory usage revamp.
-    void TransferOwnership(MemTracker* dst);
-
-   private:
-    friend class DiskIoMgr;
-    friend class DiskIoMgr::ScanRange;
-    friend class DiskIoRequestContext;
-
-    /// Create a buffer descriptor for a new reader, range and data buffer. The buffer
-    /// memory should already be accounted against 'mem_tracker'.
-    BufferDescriptor(DiskIoMgr* io_mgr, DiskIoRequestContext* reader,
-        ScanRange* scan_range, uint8_t* buffer, int64_t buffer_len,
-        MemTracker* mem_tracker);
-
-    /// Return true if this is a cached buffer owned by HDFS.
-    bool is_cached() const {
-      return scan_range_->external_buffer_tag_
-          == ScanRange::ExternalBufferTag::CACHED_BUFFER;
-    }
-
-    /// Return true if this is a buffer owned by the client that was provided when
-    /// constructing the scan range.
-    bool is_client_buffer() const {
-      return scan_range_->external_buffer_tag_
-          == ScanRange::ExternalBufferTag::CLIENT_BUFFER;
-    }
-
-    DiskIoMgr* const io_mgr_;
-
-    /// Reader that this buffer is for.
-    DiskIoRequestContext* const reader_;
-
-    /// The current tracker this buffer is associated with. After initialisation,
-    /// NULL for cached buffers and non-NULL for all other buffers.
-    MemTracker* mem_tracker_;
-
-    /// Scan range that this buffer is for. Non-NULL when initialised.
-    ScanRange* const scan_range_;
-
-    /// buffer with the read contents
-    uint8_t* buffer_;
-
-    /// length of buffer_. For buffers from cached reads, the length is 0.
-    const int64_t buffer_len_;
-
-    /// length of read contents
-    int64_t len_ = 0;
-
-    /// true if the current scan range is complete
-    bool eosr_ = false;
-
-    /// Status of the read to this buffer. if status is not ok, 'buffer' is nullptr
-    Status status_;
-
-    int64_t scan_range_offset_ = 0;
-  };
-
-  /// The request type, read or write associated with a request range.
-  struct RequestType {
-    enum type {
-      READ,
-      WRITE,
-    };
-  };
-
-  /// Represents a contiguous sequence of bytes in a single file.
-  /// This is the common base class for read and write IO requests - ScanRange and
-  /// WriteRange. Each disk thread processes exactly one RequestRange at a time.
-  class RequestRange : public InternalQueue<RequestRange>::Node {
-   public:
-    hdfsFS fs() const { return fs_; }
-    const char* file() const { return file_.c_str(); }
-    std::string* file_string() { return &file_; }
-    int64_t offset() const { return offset_; }
-    int64_t len() const { return len_; }
-    int disk_id() const { return disk_id_; }
-    RequestType::type request_type() const { return request_type_; }
-
-   protected:
-    RequestRange(RequestType::type request_type)
-      : fs_(nullptr), offset_(-1), len_(-1), disk_id_(-1), request_type_(request_type) {}
-
-    /// Hadoop filesystem that contains file_, or set to nullptr for local filesystem.
-    hdfsFS fs_;
-
-    /// Path to file being read or written.
-    std::string file_;
-
-    /// Offset within file_ being read or written.
-    int64_t offset_;
-
-    /// Length of data read or written.
-    int64_t len_;
-
-    /// Id of disk containing byte range.
-    int disk_id_;
-
-    /// The type of IO request, READ or WRITE.
-    RequestType::type request_type_;
-  };
-
-  /// Param struct for different combinations of buffering.
-  struct BufferOpts {
-   public:
-    // Set options for a read into an IoMgr-allocated or HDFS-cached buffer. Caching is
-    // enabled if 'try_cache' is true, the file is in the HDFS cache and 'mtime' matches
-    // the modified time of the cached file in the HDFS cache.
-    BufferOpts(bool try_cache, int64_t mtime)
-      : try_cache_(try_cache),
-        mtime_(mtime),
-        client_buffer_(nullptr),
-        client_buffer_len_(-1) {}
-
-    /// Set options for an uncached read into an IoMgr-allocated buffer.
-    static BufferOpts Uncached() {
-      return BufferOpts(false, NEVER_CACHE, nullptr, -1);
-    }
-
-    /// Set options to read the entire scan range into 'client_buffer'. The length of the
-    /// buffer, 'client_buffer_len', must fit the entire scan range. HDFS caching is not
-    /// enabled in this case.
-    static BufferOpts ReadInto(uint8_t* client_buffer, int64_t client_buffer_len) {
-      return BufferOpts(false, NEVER_CACHE, client_buffer, client_buffer_len);
-    }
-
-   private:
-    friend class ScanRange;
-
-    BufferOpts(
-        bool try_cache, int64_t mtime, uint8_t* client_buffer, int64_t client_buffer_len)
-      : try_cache_(try_cache),
-        mtime_(mtime),
-        client_buffer_(client_buffer),
-        client_buffer_len_(client_buffer_len) {}
-
-    /// If 'mtime_' is set to NEVER_CACHE, the file handle will never be cached, because
-    /// the modification time won't match.
-    const static int64_t NEVER_CACHE = -1;
-
-    /// If true, read from HDFS cache if possible.
-    const bool try_cache_;
-
-    /// Last modified time of the file associated with the scan range. If set to
-    /// NEVER_CACHE, caching is disabled.
-    const int64_t mtime_;
-
-    /// A destination buffer provided by the client, nullptr and -1 if no buffer.
-    uint8_t* const client_buffer_;
-    const int64_t client_buffer_len_;
-  };
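
Together with ScanRange::Reset() below, these constructors cover the three
supported buffering modes. For illustration (hypothetical variable names):

    // 1. IoMgr-allocated buffer; an HDFS-cached read is attempted if 'mtime'
    //    matches the cached file's modification time:
    range->Reset(fs, path, len, offset, disk_id, expected_local,
        DiskIoMgr::BufferOpts(/* try_cache */ true, mtime));

    // 2. IoMgr-allocated buffer, no HDFS caching:
    range->Reset(fs, path, len, offset, disk_id, expected_local,
        DiskIoMgr::BufferOpts::Uncached());

    // 3. Read the whole range into a caller-owned buffer (buf_len must be >= len):
    range->Reset(fs, path, len, offset, disk_id, expected_local,
        DiskIoMgr::BufferOpts::ReadInto(client_buf, buf_len));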
-
-  /// ScanRange description. The caller must call Reset() to initialize the fields
-  /// before calling AddScanRanges(). The private fields are used internally by
-  /// the IoMgr.
-  class ScanRange : public RequestRange {
-   public:
-    ScanRange();
-
-    virtual ~ScanRange();
-
-    /// Resets this scan range object with the scan range description. The scan range
-    /// is for bytes [offset, offset + len) in 'file' on 'fs' (which is nullptr for the
-    /// local filesystem). The scan range must fall within the file bounds (offset >= 0
-    /// and offset + len <= file_length). 'disk_id' is the disk queue to add the range
-    /// to. If 'expected_local' is true, a warning is generated if the read did not
-    /// come from a local disk. 'buffer_opts' specifies buffer management options -
-    /// see the DiskIoMgr class comment and the BufferOpts comments for details.
-    /// 'meta_data' is an arbitrary client-provided pointer for any auxiliary data.
-    void Reset(hdfsFS fs, const char* file, int64_t len, int64_t offset, int disk_id,
-        bool expected_local, const BufferOpts& buffer_opts, void* meta_data = nullptr);
-
-    void* meta_data() const { return meta_data_; }
-    bool try_cache() const { return try_cache_; }
-    bool expected_local() const { return expected_local_; }
-
-    /// Returns the next buffer for this scan range. buffer is an output parameter.
-    /// This function blocks until a buffer is ready or an error occurred. If this is
-    /// called when all buffers have been returned, *buffer is set to nullptr and Status::OK
-    /// is returned.
-    /// Only one thread can be in GetNext() at any time.
-    Status GetNext(std::unique_ptr<BufferDescriptor>* buffer) WARN_UNUSED_RESULT;
-
-    /// Cancel this scan range. This cleans up all queued buffers and
-    /// wakes up any threads blocked on GetNext().
-    /// Status is the reason the range was cancelled. Must not be ok().
-    /// Status is returned to the user in GetNext().
-    void Cancel(const Status& status);
-
-    /// return a descriptive string for debug.
-    std::string DebugString() const;
-
-    int64_t mtime() const { return mtime_; }
-
-   private:
-    friend class DiskIoMgr;
-    friend class DiskIoRequestContext;
-
-    /// Initialize internal fields
-    void InitInternal(DiskIoMgr* io_mgr, DiskIoRequestContext* reader);
-
-    /// Enqueues a buffer for this range. This does not block.
-    /// Returns true if this scan range has hit the queue capacity, false otherwise.
-    /// The caller passes ownership of buffer to the scan range and it is not
-    /// valid to access buffer after this call. The reader lock must be held by the
-    /// caller.
-    bool EnqueueBuffer(const boost::unique_lock<boost::mutex>& reader_lock,
-        std::unique_ptr<BufferDescriptor> buffer);
-
-    /// Cleanup any queued buffers (i.e. due to cancellation). This cannot
-    /// be called with any locks taken.
-    void CleanupQueuedBuffers();
-
-    /// Validates the internal state of this range. lock_ must be taken
-    /// before calling this.
-    bool Validate();
-
-    /// Maximum length in bytes for hdfsRead() calls.
-    int64_t MaxReadChunkSize() const;
-
-    /// Opens the file for this range. This function only modifies state in this range.
-    /// If 'use_file_handle_cache' is true and this is a local hdfs file, then this scan
-    /// range will not maintain an exclusive file handle. It will borrow an hdfs file
-    /// handle from the file handle cache for each Read(), so Open() does nothing.
-    /// If 'use_file_handle_cache' is false or this is a remote hdfs file or this is
-    /// a local OS file, Open() will maintain a file handle on the scan range for
-    /// exclusive use by this scan range. An exclusive hdfs file handle still comes
-    /// from the cache, but it is a newly opened file handle that is held for the
-    /// entire duration of a scan range's lifetime and destroyed in Close().
-    /// All local OS files are opened using normal OS file APIs.
-    Status Open(bool use_file_handle_cache) WARN_UNUSED_RESULT;
-
-    /// Closes the file for this range. This function only modifies state in this range.
-    void Close();
-
-    /// Reads from this range into 'buffer', which has length 'buffer_len' bytes. Returns
-    /// the number of bytes read. The read position in this scan range is updated.
-    Status Read(uint8_t* buffer, int64_t buffer_len, int64_t* bytes_read,
-        bool* eosr) WARN_UNUSED_RESULT;
-
-    /// Gets the read statistics from the HDFS file handle and aggregates them into
-    /// the DiskIoRequestContext. This clears the statistics on this file handle.
-    /// It is safe to pass hdfsFile by value, as hdfsFile's underlying type is a
-    /// pointer.
-    void GetHdfsStatistics(hdfsFile fh);
-
-    /// Reads from the DN cache. On success, sets cached_buffer_ to the DN buffer
-    /// and *read_succeeded to true.
-    /// If the data is not cached, returns ok() and *read_succeeded is set to false.
-    /// Returns a non-ok status if it ran into a non-continuable error.
-    /// The reader lock must be held by the caller.
-    Status ReadFromCache(const boost::unique_lock<boost::mutex>& reader_lock,
-        bool* read_succeeded) WARN_UNUSED_RESULT;
-
-    /// Pointer to caller specified metadata. This is untouched by the io manager
-    /// and the caller can put whatever auxiliary data in here.
-    void* meta_data_ = nullptr;
-
-    /// If true, this scan range is expected to be cached. Note that this might be wrong
-    /// since the block could have been uncached. In that case, the cached path
-    /// will fail and we'll just put the scan range on the normal read path.
-    bool try_cache_ = false;
-
-    /// If true, we expect this scan range to be a local read. Note that if this is
-    /// false, it does not necessarily mean we expect the read to be remote. We never
-    /// create scan ranges where some of the range is expected to be remote and some
-    /// of it local.
-    /// TODO: we can do more with this
-    bool expected_local_ = false;
-
-    /// Total number of bytes read remotely. This is necessary to maintain a count of
-    /// the number of remote scan ranges. Since IO statistics can be collected multiple
-    /// times for a scan range, it is necessary to keep some state about whether this
-    /// scan range has already been counted as remote. There is also a requirement to
-    /// log the number of unexpected remote bytes for a scan range. To solve both
-    /// requirements, maintain num_remote_bytes_ on the ScanRange and push it to the
-    /// reader_ once at the close of the scan range.
-    int64_t num_remote_bytes_;
-
-    DiskIoMgr* io_mgr_ = nullptr;
-
-    /// Reader/owner of the scan range
-    DiskIoRequestContext* reader_ = nullptr;
-
-    /// File handle either to hdfs or local fs (FILE*).
-    /// The hdfs file handle is only stored here in three cases:
-    /// 1. The file handle cache is off (max_cached_file_handles == 0).
-    /// 2. The scan range is using hdfs caching.
-    /// 3. The hdfs file is expected to be remote (expected_local_ == false).
-    /// In each case, the scan range gets a new file handle from the file handle cache
-    /// at Open(), holds it exclusively, and destroys it in Close().
-    union {
-      FILE* local_file_ = nullptr;
-      HdfsFileHandle* exclusive_hdfs_fh_;
-    };
-
-    /// Tagged union that holds a buffer for the cases when a buffer allocated
-    /// externally to the DiskIoMgr is associated with the scan range.
-    enum class ExternalBufferTag { CLIENT_BUFFER, CACHED_BUFFER, NO_BUFFER };
-    ExternalBufferTag external_buffer_tag_;
-    union {
-      /// Valid if the 'external_buffer_tag_' is CLIENT_BUFFER.
-      struct {
-        /// Client-provided buffer to read the whole scan range into.
-        uint8_t* data;
-
-        /// Length of the client-provided buffer.
-        int64_t len;
-      } client_buffer_;
-
-      /// Valid and non-NULL if the external_buffer_tag_ is CACHED_BUFFER, which means
-      /// that a cached read succeeded and all the bytes for the range are in this buffer.
-      struct hadoopRzBuffer* cached_buffer_ = nullptr;
-    };
-
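
The tag-plus-union pattern above is safe only if every access dispatches on the
tag first. A minimal, self-contained C++ sketch of that discipline (standalone
names; not code from this header):

    #include <cstdint>
    #include <cstdio>

    struct Range {
      enum class ExternalBufferTag { CLIENT_BUFFER, CACHED_BUFFER, NO_BUFFER };
      ExternalBufferTag tag = ExternalBufferTag::NO_BUFFER;
      union {
        struct { uint8_t* data; int64_t len; } client_buffer;
        void* cached_buffer;  // stands in for hadoopRzBuffer*
      };

      // Always check the tag before touching a union member.
      void Describe() const {
        switch (tag) {
          case ExternalBufferTag::CLIENT_BUFFER:
            std::printf("client buffer of %lld bytes\n",
                        static_cast<long long>(client_buffer.len));
            break;
          case ExternalBufferTag::CACHED_BUFFER:
            std::printf("buffer from the HDFS cache\n");
            break;
          case ExternalBufferTag::NO_BUFFER:
            std::printf("no external buffer; the IoMgr allocates\n");
            break;
        }
      }
    };

    int main() {
      Range r;
      r.tag = Range::ExternalBufferTag::CLIENT_BUFFER;
      r.client_buffer = {nullptr, 4096};
      r.Describe();
      return 0;
    }
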
-    /// Lock protecting fields below.
-    /// This lock should not be taken during Open()/Read()/Close().
-    /// If DiskIoRequestContext::lock_ and this lock need to be held simultaneously,
-    /// DiskIoRequestContext::lock_ must be taken first.
-    boost::mutex lock_;
-
-    /// Number of bytes read so far for this scan range
-    int bytes_read_;
-
-    /// Status for this range. This is non-ok if is_cancelled_ is true.
-    /// Note: an individual range can fail without the DiskIoRequestContext being
-    /// cancelled. This allows us to skip individual ranges.
-    Status status_;
-
-    /// If true, the last buffer for this scan range has been queued.
-    bool eosr_queued_ = false;
-
-    /// If true, the last buffer for this scan range has been returned.
-    bool eosr_returned_ = false;
-
-    /// If true, this scan range has been removed from the reader's in_flight_ranges
-    /// queue because the ready_buffers_ queue is full.
-    bool blocked_on_queue_ = false;
-
-    /// Condition variable for GetNext().
-    ConditionVariable buffer_ready_cv_;
-
-    /// IO buffers that are queued for this scan range.
-    std::deque<std::unique_ptr<BufferDescriptor>> ready_buffers_;
-
-    /// Lock that should be taken during hdfs calls. Only one thread (the disk reading
-    /// thread) calls into hdfs at a time so this lock does not have performance impact.
-    /// This lock only serves to coordinate cleanup. Specifically it serves to ensure
-    /// that the disk threads are finished with HDFS calls before is_cancelled_ is set
-    /// to true and cleanup starts.
-    /// If this lock and lock_ need to be taken, lock_ must be taken first.
-    boost::mutex hdfs_lock_;
-
-    /// If true, this scan range has been cancelled.
-    bool is_cancelled_ = false;
-
-    /// Last modified time of the file associated with the scan range
-    int64_t mtime_;
-  };
-
-  /// Used to specify data to be written to a file and offset.
-  /// It is the responsibility of the client to ensure that the data to be written is
-  /// valid and that the file to be written to exists until the callback is invoked.
-  /// A callback is invoked to inform the client when the write is done.
-  class WriteRange : public RequestRange {
-   public:
-    /// This callback is invoked on each WriteRange after the write is complete or the
-    /// context is cancelled. The status passed to the callback indicates whether the
-    /// write was successful (i.e. Status::OK), whether there was an error
-    /// (TStatusCode::RUNTIME_ERROR), or whether the context was cancelled
-    /// (TStatusCode::CANCELLED). The callback is only invoked if this WriteRange was
-    /// successfully added (i.e. AddWriteRange() succeeded). No locks are held while
-    /// the callback is invoked.
-    typedef std::function<void(const Status&)> WriteDoneCallback;
-    WriteRange(const std::string& file, int64_t file_offset, int disk_id,
-        WriteDoneCallback callback);
-
-    /// Change the file and offset of this write range. Data and callbacks are unchanged.
-    /// Can only be called when the write is not in flight (i.e. before AddWriteRange()
-    /// is called or after the write callback was called).
-    void SetRange(const std::string& file, int64_t file_offset, int disk_id);
-
-    /// Set the data and number of bytes to be written for this WriteRange.
-    /// Can only be called when the write is not in flight (i.e. before AddWriteRange()
-    /// is called or after the write callback was called).
-    void SetData(const uint8_t* buffer, int64_t len);
-
-    const uint8_t* data() const { return data_; }
-
-   private:
-    friend class DiskIoMgr;
-    friend class DiskIoRequestContext;
-
-    /// Data to be written. RequestRange::len_ contains the length of data
-    /// to be written.
-    const uint8_t* data_;
-
-    /// Callback to invoke after the write is complete.
-    WriteDoneCallback callback_;
-  };
-
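A hedged usage sketch for the class above; 'io_mgr', 'writer_context', 'buffer'
and 'len' are assumed context, and per the comment above both the data and the
WriteRange itself must stay alive until the callback runs:

    DiskIoMgr::WriteRange::WriteDoneCallback callback = [](const Status& status) {
      // Runs after the write completes or the context is cancelled; no locks held.
      if (!status.ok()) LOG(WARNING) << "Write failed: " << status.GetDetail();
    };
    DiskIoMgr::WriteRange write_range("/tmp/scratch.0", /*file_offset=*/0,
        /*disk_id=*/-1, callback);  // -1: let the IoMgr assign a queue
    write_range.SetData(buffer, len);
    RETURN_IF_ERROR(io_mgr->AddWriteRange(writer_context, &write_range));
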
-  /// Create a DiskIoMgr object. This constructor is only used for testing.
-  ///  - num_disks: The number of disks the IoMgr should use. Specify 0 to have the
-  ///    disk IoMgr query the OS for the number of disks.
-  ///  - threads_per_rotational_disk: number of read threads to create per rotational
-  ///    disk. This is also the max queue depth.
-  ///  - threads_per_solid_state_disk: number of read threads to create per solid state
-  ///    disk. This is also the max queue depth.
-  ///  - min_buffer_size: minimum io buffer size (in bytes)
-  ///  - max_buffer_size: maximum io buffer size (in bytes). Also the max read size.
-  DiskIoMgr(int num_disks, int threads_per_rotational_disk,
-      int threads_per_solid_state_disk, int min_buffer_size, int max_buffer_size);
-
-  /// Create DiskIoMgr with default configs.
-  DiskIoMgr();
-
-  /// Clean up all threads and resources. This is mostly useful for testing since
-  /// for impalad, this object is never destroyed.
-  ~DiskIoMgr();
-
-  /// Initialize the IoMgr. Must be called once before any of the other APIs.
-  Status Init(MemTracker* process_mem_tracker) WARN_UNUSED_RESULT;
-
-  /// Allocates tracking structure for a request context.
-  /// Register a new request context and return it to the caller. The caller must call
-  /// UnregisterContext() for each context.
-  /// reader_mem_tracker: Is non-null only for readers. IO buffers
-  ///    used for this reader will be tracked by this. If the limit is exceeded
-  ///    the reader will be cancelled and MEM_LIMIT_EXCEEDED will be returned via
-  ///    GetNext().
-  std::unique_ptr<DiskIoRequestContext> RegisterContext(MemTracker* reader_mem_tracker);
-
-  /// Unregisters context from the disk IoMgr by first cancelling it and then blocking
-  /// until all references to the context are removed from the I/O manager's internal
-  /// data structures. This must be called for every RegisterContext() to ensure that
-  /// the context object can be safely destroyed. It is invalid to add more request
-  /// ranges to 'context' after this call. This call blocks until all the disk threads
-  /// have finished cleaning up.
-  void UnregisterContext(DiskIoRequestContext* context);
-
-  /// This function cancels the context asynchronously. All outstanding requests
-  /// are aborted and tracking structures cleaned up. This does not need to be
-  /// called if the context finishes normally.
-  /// This will also fail any outstanding GetNext()/Read requests.
-  void CancelContext(DiskIoRequestContext* context);
-
-  /// Adds the scan ranges to the queues. This call is non-blocking. The caller must
-  /// not deallocate the scan range pointers before UnregisterContext().
-  /// If 'schedule_immediately' is true, the ranges are immediately put on the read
-  /// queue (i.e. the caller should not/cannot call GetNextRange for these ranges).
-  /// This can be used to do synchronous reads as well as schedule dependent ranges,
-  /// as in the case for columnar formats.
-  Status AddScanRanges(DiskIoRequestContext* reader,
-      const std::vector<ScanRange*>& ranges,
-      bool schedule_immediately = false) WARN_UNUSED_RESULT;
-  Status AddScanRange(DiskIoRequestContext* reader, ScanRange* range,
-      bool schedule_immediately = false) WARN_UNUSED_RESULT;
-
-  /// Add a WriteRange for the writer. This is non-blocking and schedules the context
-  /// on the IoMgr disk queue. Does not create any files.
-  Status AddWriteRange(
-      DiskIoRequestContext* writer, WriteRange* write_range) WARN_UNUSED_RESULT;
-
-  /// Returns the next unstarted scan range for this reader. When the range is returned,
-  /// the disk threads in the IoMgr will already have started reading from it. The
-  /// caller is expected to call ScanRange::GetNext on the returned range.
-  /// If there are no more unstarted ranges, nullptr is returned.
-  /// This call is blocking.
-  Status GetNextRange(DiskIoRequestContext* reader, ScanRange** range) WARN_UNUSED_RESULT;
-
-  /// Reads the range and returns the result in buffer.
-  /// This behaves like the typical synchronous read() API, blocking until the data
-  /// is read. This can be called while there are outstanding ScanRanges and is
-  /// thread safe. Multiple threads can call Read() for the same reader concurrently.
-  /// 'range' *cannot* have already been added via AddScanRanges.
-  /// This can only be used if the scan range fits in a single IO buffer (i.e. is smaller
-  /// than max_read_buffer_size()) or if reading into a client-provided buffer.
-  Status Read(DiskIoRequestContext* reader, ScanRange* range,
-      std::unique_ptr<BufferDescriptor>* buffer) WARN_UNUSED_RESULT;
-
-  /// Returns the buffer to the IoMgr. This must be called for every buffer
-  /// returned by GetNext()/Read() that did not return an error. This is non-blocking.
-  /// After calling this, the buffer descriptor is invalid and cannot be accessed.
-  void ReturnBuffer(std::unique_ptr<BufferDescriptor> buffer);
-
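Taken together, the calls above imply a particular lifecycle for a reader. A
condensed, hedged sketch of that flow (error handling trimmed; 'io_mgr',
'mem_tracker', 'scan_ranges' and ProcessBuffer() are assumed context):

    std::unique_ptr<DiskIoRequestContext> reader =
        io_mgr->RegisterContext(mem_tracker);
    RETURN_IF_ERROR(io_mgr->AddScanRanges(reader.get(), scan_ranges));
    while (true) {
      DiskIoMgr::ScanRange* range;
      RETURN_IF_ERROR(io_mgr->GetNextRange(reader.get(), &range));
      if (range == nullptr) break;  // no more unstarted ranges
      while (true) {
        std::unique_ptr<DiskIoMgr::BufferDescriptor> buffer;
        RETURN_IF_ERROR(range->GetNext(&buffer));
        if (buffer == nullptr) break;  // all buffers for this range returned
        ProcessBuffer(buffer.get());   // hypothetical consumer
        io_mgr->ReturnBuffer(std::move(buffer));
      }
    }
    io_mgr->UnregisterContext(reader.get());
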
-  /// Determine which disk queue this file should be assigned to.  Returns an index into
-  /// disk_queues_.  The disk_id is the volume ID for the local disk that holds the
-  /// files, or -1 if unknown.  Flag expected_local is true iff this impalad is
-  /// co-located with the datanode for this file.
-  int AssignQueue(const char* file, int disk_id, bool expected_local);
-
-  /// TODO: The functions below can be moved to DiskIoRequestContext.
-  /// Returns the current status of the context.
-  Status context_status(DiskIoRequestContext* context) const WARN_UNUSED_RESULT;
-
-  void set_bytes_read_counter(DiskIoRequestContext*, RuntimeProfile::Counter*);
-  void set_read_timer(DiskIoRequestContext*, RuntimeProfile::Counter*);
-  void set_active_read_thread_counter(DiskIoRequestContext*, RuntimeProfile::Counter*);
-  void set_disks_access_bitmap(DiskIoRequestContext*, RuntimeProfile::Counter*);
-
-  int64_t queue_size(DiskIoRequestContext* reader) const;
-  int64_t bytes_read_local(DiskIoRequestContext* reader) const;
-  int64_t bytes_read_short_circuit(DiskIoRequestContext* reader) const;
-  int64_t bytes_read_dn_cache(DiskIoRequestContext* reader) const;
-  int num_remote_ranges(DiskIoRequestContext* reader) const;
-  int64_t unexpected_remote_bytes(DiskIoRequestContext* reader) const;
-  int cached_file_handles_hit_count(DiskIoRequestContext* reader) const;
-  int cached_file_handles_miss_count(DiskIoRequestContext* reader) const;
-
-  /// Returns the read throughput across all readers.
-  /// TODO: should this be a sliding window?  This should report metrics for the
-  /// last minute, hour and since the beginning.
-  int64_t GetReadThroughput();
-
-  /// Returns the maximum read buffer size
-  int max_read_buffer_size() const { return max_buffer_size_; }
-
-  /// Returns the total number of disk queues (both local and remote).
-  int num_total_disks() const { return disk_queues_.size(); }
-
-  /// Returns the total number of remote "disk" queues.
-  int num_remote_disks() const { return REMOTE_NUM_DISKS; }
-
-  /// Returns the number of local disks attached to the system.
-  int num_local_disks() const { return num_total_disks() - num_remote_disks(); }
-
-  /// The disk ID (and therefore disk_queues_ index) used for DFS accesses.
-  int RemoteDfsDiskId() const { return num_local_disks() + REMOTE_DFS_DISK_OFFSET; }
-
-  /// The disk ID (and therefore disk_queues_ index) used for S3 accesses.
-  int RemoteS3DiskId() const { return num_local_disks() + REMOTE_S3_DISK_OFFSET; }
-
-  /// The disk ID (and therefore disk_queues_ index) used for ADLS accesses.
-  int RemoteAdlsDiskId() const { return num_local_disks() + REMOTE_ADLS_DISK_OFFSET; }
-
-  /// Dumps the disk IoMgr queues (for readers and disks)
-  std::string DebugString();
-
-  /// Validates the internal state is consistent. This is intended to only be used
-  /// for debugging.
-  bool Validate() const;
-
-  /// Given a FS handle, name and last modified time of the file, gets an HdfsFileHandle
-  /// from the file handle cache. If 'require_new_handle' is true, the cache will open
-  /// a fresh file handle. On success, records statistics about whether this was
-  /// a cache hit or miss in the 'reader' as well as at the system level. Returns
-  /// nullptr on error.
-  HdfsFileHandle* GetCachedHdfsFileHandle(const hdfsFS& fs,
-      std::string* fname, int64_t mtime, DiskIoRequestContext *reader,
-      bool require_new_handle);
-
-  /// Releases a file handle back to the file handle cache when it is no longer in use.
-  /// If 'destroy_handle' is true, the file handle cache will close the file handle
-  /// immediately.
-  void ReleaseCachedHdfsFileHandle(std::string* fname, HdfsFileHandle* fid,
-      bool destroy_handle);
-
-  /// Reopens a file handle by destroying the file handle and getting a fresh
-  /// file handle from the cache. Returns an error if the file could not be reopened.
-  Status ReopenCachedHdfsFileHandle(const hdfsFS& fs, std::string* fname, int64_t mtime,
-      HdfsFileHandle** fid);
-
-  /// Garbage collect unused I/O buffers up to 'bytes_to_free', or all the buffers if
-  /// 'bytes_to_free' is -1.
-  void GcIoBuffers(int64_t bytes_to_free = -1);
-
-  /// The maximum number of ready buffers that can be queued in a scan range. Having two
-  /// queued buffers (plus the buffer that is returned to the client) gives good
-  /// performance in most scenarios:
-  /// 1. If the consumer is consuming data faster than we can read from disk, then the
-  ///    queue will be empty most of the time because the buffer will be immediately
-  ///    pulled off the queue as soon as it is added. There will always be an I/O request
-  ///    in the disk queue to maximize I/O throughput, which is the bottleneck in this
-  ///    case.
-  /// 2. If we can read from disk faster than the consumer is consuming data, the queue
-  ///    will fill up and there will always be a buffer available for the consumer to
-  ///    read, so the consumer will not block and we maximize consumer throughput, which
-  ///    is the bottleneck in this case.
-  /// 3. If the consumer is consuming data at approximately the same rate as we are
-  ///    reading from disk, then the steady state is that the consumer is processing one
-  ///    buffer and one buffer is in the disk queue. The additional buffer can absorb
-  ///    bursts where the producer runs faster than the consumer or the consumer runs
-  ///    faster than the producer without blocking either the producer or consumer.
-  static const int SCAN_RANGE_READY_BUFFER_LIMIT = 2;
-
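The three scenarios above are the classic bounded producer/consumer argument,
with SCAN_RANGE_READY_BUFFER_LIMIT as the bound. A self-contained sketch of the
queue shape being described (std:: primitives instead of the IoMgr's own types;
illustrative only):

    #include <condition_variable>
    #include <deque>
    #include <mutex>

    // Disk threads push ready buffers; the scanner thread pops them in GetNext().
    template <typename T>
    class ReadyQueue {
     public:
      explicit ReadyQueue(size_t capacity) : capacity_(capacity) {}

      // Producer side: returns true if the queue has hit capacity, mirroring
      // EnqueueBuffer()'s contract above.
      bool Enqueue(T item) {
        std::lock_guard<std::mutex> l(lock_);
        items_.push_back(std::move(item));
        ready_.notify_one();
        return items_.size() >= capacity_;
      }

      // Consumer side: blocks until an item is ready, like ScanRange::GetNext().
      T Dequeue() {
        std::unique_lock<std::mutex> l(lock_);
        while (items_.empty()) ready_.wait(l);
        T item = std::move(items_.front());
        items_.pop_front();
        return item;
      }

     private:
      const size_t capacity_;
      std::mutex lock_;
      std::condition_variable ready_;
      std::deque<T> items_;
    };
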
-  /// "Disk" queue offsets for remote accesses.  Offset 0 corresponds to
-  /// disk ID (i.e. disk_queue_ index) of num_local_disks().
-  enum {
-    REMOTE_DFS_DISK_OFFSET = 0,
-    REMOTE_S3_DISK_OFFSET,
-    REMOTE_ADLS_DISK_OFFSET,
-    REMOTE_NUM_DISKS
-  };
-
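The offsets above are added to num_local_disks() to get absolute indices into
disk_queues_. A tiny worked example, assuming a hypothetical host with 8 local
disks:

    #include <iostream>

    enum {
      REMOTE_DFS_DISK_OFFSET = 0,
      REMOTE_S3_DISK_OFFSET,
      REMOTE_ADLS_DISK_OFFSET,
      REMOTE_NUM_DISKS
    };

    int main() {
      const int num_local_disks = 8;  // hypothetical; queried from the OS in practice
      // disk_queues_[0..7] would be the local disks; the remote queues follow.
      std::cout << "DFS queue index:   " << num_local_disks + REMOTE_DFS_DISK_OFFSET
                << "\n";  // 8
      std::cout << "S3 queue index:    " << num_local_disks + REMOTE_S3_DISK_OFFSET
                << "\n";  // 9
      std::cout << "ADLS queue index:  " << num_local_disks + REMOTE_ADLS_DISK_OFFSET
                << "\n";  // 10
      std::cout << "total disk queues: " << num_local_disks + REMOTE_NUM_DISKS
                << "\n";  // 11
      return 0;
    }
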
- private:
-  friend class BufferDescriptor;
-  friend class DiskIoRequestContext;
-  struct DiskQueue;
-
-  friend class DiskIoMgrTest_Buffers_Test;
-  friend class DiskIoMgrTest_VerifyNumThreadsParameter_Test;
-
-  /// Memory tracker for unused I/O buffers owned by DiskIoMgr.
-  boost::scoped_ptr<MemTracker> free_buffer_mem_tracker_;
-
-  /// Memory tracker for I/O buffers where the DiskIoRequestContext has no MemTracker.
-  /// TODO: once IMPALA-3200 is fixed, there should be no more cases where readers don't
-  /// provide a MemTracker.
-  boost::scoped_ptr<MemTracker> unowned_buffer_mem_tracker_;
-
-  /// Number of worker(read) threads per rotational disk. Also the max depth of queued
-  /// work to the disk.
-  const int num_io_threads_per_rotational_disk_;
-
-  /// Number of worker(read) threads per solid state disk. Also the max depth of queued
-  /// work to the disk.
-  const int num_io_threads_per_solid_state_disk_;
-
-  /// Maximum read size. This is also the maximum size of each allocated buffer.
-  const int max_buffer_size_;
-
-  /// The minimum size of each read buffer.
-  const int min_buffer_size_;
-
-  /// Thread group containing all the worker threads.
-  ThreadGroup disk_thread_group_;
-
-  /// Options object for cached hdfs reads. Set on startup and never modified.
-  struct hadoopRzOptions* cached_read_options_ = nullptr;
-
-  /// True if the IoMgr should be torn down. Worker threads watch for this to know
-  /// when to terminate. This variable is read and written by different threads.
-  volatile bool shut_down_;
-
-  /// Total bytes read by the IoMgr.
-  RuntimeProfile::Counter total_bytes_read_counter_;
-
-  /// Total time spent in hdfs reading
-  RuntimeProfile::Counter read_timer_;
-
-  /// Protects free_buffers_
-  boost::mutex free_buffers_lock_;
-
-  /// Free buffers that can be handed out to clients. There is one list for each buffer
-  /// size, indexed by the Log2 of the buffer size in units of min_buffer_size_. The
-  /// maximum buffer size is max_buffer_size_, so the maximum index is
-  /// Log2(max_buffer_size_ / min_buffer_size_).
-  //
-  /// E.g. if min_buffer_size_ = 1024 bytes:
-  ///  free_buffers_[0]  => list of free buffers with size 1024 B
-  ///  free_buffers_[1]  => list of free buffers with size 2048 B
-  ///  free_buffers_[10] => list of free buffers with size 1 MB
-  ///  free_buffers_[13] => list of free buffers with size 8 MB
-  ///  free_buffers_[n]  => list of free buffers with size 2^n * 1024 B
-  std::vector<std::deque<uint8_t*>> free_buffers_;
-
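The indexing scheme above is the base-2 logarithm of the buffer size in units of
min_buffer_size_. A self-contained sketch of that computation (the real
free_buffers_idx() below may differ in rounding details):

    #include <cstdint>
    #include <iostream>

    static int FreeBuffersIdx(int64_t buffer_size, int64_t min_buffer_size) {
      int idx = 0;
      int64_t size = min_buffer_size;
      while (size < buffer_size) {
        size <<= 1;
        ++idx;
      }
      return idx;
    }

    int main() {
      // With min_buffer_size = 1024, this matches the table in the comment above.
      std::cout << FreeBuffersIdx(1024, 1024) << "\n";     // 0
      std::cout << FreeBuffersIdx(2048, 1024) << "\n";     // 1
      std::cout << FreeBuffersIdx(1 << 20, 1024) << "\n";  // 10 (1 MB)
      std::cout << FreeBuffersIdx(8 << 20, 1024) << "\n";  // 13 (8 MB)
      return 0;
    }
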
-  /// Total number of allocated buffers, used for debugging.
-  AtomicInt32 num_allocated_buffers_;
-
-  /// Total number of buffers in readers
-  AtomicInt32 num_buffers_in_readers_;
-
-  /// Per disk queues. This is static and created once at Init() time.  One queue is
-  /// allocated for each local disk on the system and for each remote filesystem type.
-  /// It is indexed by disk id.
-  std::vector<DiskQueue*> disk_queues_;
-
-  /// The next disk queue to write to if the actual 'disk_id_' is unknown (i.e. the file
-  /// is not associated with a particular local disk or remote queue). Used to implement
-  /// round-robin assignment for that case.
-  static AtomicInt32 next_disk_id_;
-
-  // Number of file handle cache partitions to use
-  static const size_t NUM_FILE_HANDLE_CACHE_PARTITIONS = 16;
-
-  // Caching structure that maps file names to cached file handles. The cache has an upper
-  // limit of entries defined by FLAGS_max_cached_file_handles. Evicted cached file
-  // handles are closed.
-  FileHandleCache<NUM_FILE_HANDLE_CACHE_PARTITIONS> file_handle_cache_;
-
-  /// Returns the index into free_buffers_ for a given buffer size
-  int free_buffers_idx(int64_t buffer_size);
-
-  /// Returns a buffer to read into with size between 'buffer_size' and
-  /// 'max_buffer_size_'. If there is an appropriately-sized free buffer in
-  /// 'free_buffers_', that is returned, otherwise a new one is allocated.
-  /// The buffer memory is tracked against the reader's mem tracker, or
-  /// 'unowned_buffer_mem_tracker_' if the reader does not have one.
-  std::unique_ptr<BufferDescriptor> GetFreeBuffer(
-      DiskIoRequestContext* reader, ScanRange* range, int64_t buffer_size);
-
-  /// Disassociates the desc->buffer_ memory from 'desc' (which cannot be nullptr), either
-  /// freeing it or returning it to 'free_buffers_'. Memory tracking is updated to
-  /// reflect the transfer of ownership from desc->mem_tracker_ to the disk I/O mgr.
-  void FreeBufferMemory(BufferDescriptor* desc);
-
-  /// Disk worker thread loop. This function retrieves the next range to process on
-  /// the disk queue and invokes ReadRange() or Write() depending on the type of the
-  /// range. There can be multiple threads per disk running this loop.
-  void WorkLoop(DiskQueue* queue);
-
-  /// This is called from the disk thread to get the next range to process. It will
-  /// wait until a scan range and buffer are available, or a write range is available.
-  /// This function returns the range to process.
-  /// Only returns false if the disk thread should be shut down.
-  /// No locks should be taken before this function call and none are left taken after.
-  bool GetNextRequestRange(DiskQueue* disk_queue, RequestRange** range,
-      DiskIoRequestContext** request_context);
-
-  /// Updates disk queue and reader state after a read is complete. The read result
-  /// is captured in the buffer descriptor.
-  void HandleReadFinished(DiskQueue* disk_queue, DiskIoRequestContext* reader,
-      std::unique_ptr<BufferDescriptor> buffer);
-
-  /// Invokes write_range->callback_ after the range has been written and
-  /// updates per-disk state and handle state. The status of the write (OK,
-  /// RUNTIME_ERROR, etc.) is passed to the callback via 'write_status'.
-  /// The write_status does not affect writer->status_. That is, a write error does
-  /// not cancel the writer context - that decision is left to the callback handler.
-  /// TODO: On the read path, consider not cancelling the reader context on error.
-  void HandleWriteFinished(
-      DiskIoRequestContext* writer, WriteRange* write_range, const Status& write_status);
-
-  /// Validates that range is correctly initialized
-  Status ValidateScanRange(ScanRange* range) WARN_UNUSED_RESULT;
-
-  /// Writes the specified range to disk and calls HandleWriteFinished when done.
-  /// Responsible for opening and closing the file that is written.
-  void Write(DiskIoRequestContext* writer_context, WriteRange* write_range);
-
-  /// Helper method to write a range using the specified FILE handle. Returns
-  /// Status::OK if the write succeeded, or a RUNTIME_ERROR with an appropriate
-  /// message otherwise.
-  /// Does not open or close the file that is written.
-  Status WriteRangeHelper(FILE* file_handle, WriteRange* write_range) WARN_UNUSED_RESULT;
-
-  /// Reads the specified scan range and calls HandleReadFinished when done.
-  void ReadRange(DiskQueue* disk_queue, DiskIoRequestContext* reader, ScanRange* range);
-
-  /// Try to allocate the next buffer for the scan range, returning the new buffer
-  /// if successful. If 'reader' is cancelled, cancels the range and returns nullptr.
-  /// If there is memory pressure and buffers are already queued, adds the range
-  /// to the blocked ranges and returns nullptr.
-  std::unique_ptr<BufferDescriptor> TryAllocateNextBufferForRange(DiskQueue* disk_queue,
-      DiskIoRequestContext* reader, ScanRange* range, int64_t buffer_size);
-};
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/exec-env.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/exec-env.cc b/be/src/runtime/exec-env.cc
index 999b56a..6fb572c 100644
--- a/be/src/runtime/exec-env.cc
+++ b/be/src/runtime/exec-env.cc
@@ -36,7 +36,7 @@
 #include "runtime/client-cache.h"
 #include "runtime/coordinator.h"
 #include "runtime/data-stream-mgr.h"
-#include "runtime/disk-io-mgr.h"
+#include "runtime/io/disk-io-mgr.h"
 #include "runtime/hbase-table-factory.h"
 #include "runtime/hdfs-fs-cache.h"
 #include "runtime/krpc-data-stream-mgr.h"
@@ -164,7 +164,7 @@ ExecEnv::ExecEnv(const string& hostname, int backend_port, int krpc_port,
             FLAGS_catalog_client_rpc_timeout_ms, FLAGS_catalog_client_rpc_timeout_ms, "",
             !FLAGS_ssl_client_ca_certificate.empty())),
     htable_factory_(new HBaseTableFactory()),
-    disk_io_mgr_(new DiskIoMgr()),
+    disk_io_mgr_(new io::DiskIoMgr()),
     webserver_(new Webserver(webserver_port)),
     pool_mem_trackers_(new PoolMemTrackerRegistry),
     thread_mgr_(new ThreadResourceMgr),

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/exec-env.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/exec-env.h b/be/src/runtime/exec-env.h
index 8fafdc5..193fdde 100644
--- a/be/src/runtime/exec-env.h
+++ b/be/src/runtime/exec-env.h
@@ -43,7 +43,6 @@ class BufferPool;
 class CallableThreadPool;
 class DataStreamMgrBase;
 class DataStreamMgr;
-class DiskIoMgr;
 class QueryExecMgr;
 class Frontend;
 class HBaseTableFactory;
@@ -65,6 +64,10 @@ class ThreadResourceMgr;
 class TmpFileMgr;
 class Webserver;
 
+namespace io {
+  class DiskIoMgr;
+}
+
 /// Execution environment for Impala daemon. Contains all required global structures, and
 /// handles to singleton services. Clients must call StartServices() exactly once to
 /// properly initialise service state.
@@ -116,7 +119,7 @@ class ExecEnv {
     return catalogd_client_cache_.get();
   }
   HBaseTableFactory* htable_factory() { return htable_factory_.get(); }
-  DiskIoMgr* disk_io_mgr() { return disk_io_mgr_.get(); }
+  io::DiskIoMgr* disk_io_mgr() { return disk_io_mgr_.get(); }
   Webserver* webserver() { return webserver_.get(); }
   MetricGroup* metrics() { return metrics_.get(); }
   MemTracker* process_mem_tracker() { return mem_tracker_.get(); }
@@ -174,7 +177,7 @@ class ExecEnv {
   boost::scoped_ptr<ImpalaBackendClientCache> impalad_client_cache_;
   boost::scoped_ptr<CatalogServiceClientCache> catalogd_client_cache_;
   boost::scoped_ptr<HBaseTableFactory> htable_factory_;
-  boost::scoped_ptr<DiskIoMgr> disk_io_mgr_;
+  boost::scoped_ptr<io::DiskIoMgr> disk_io_mgr_;
   boost::scoped_ptr<Webserver> webserver_;
   boost::scoped_ptr<MemTracker> mem_tracker_;
   boost::scoped_ptr<PoolMemTrackerRegistry> pool_mem_trackers_;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/CMakeLists.txt b/be/src/runtime/io/CMakeLists.txt
new file mode 100644
index 0000000..ae89509
--- /dev/null
+++ b/be/src/runtime/io/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# where to put generated libraries
+set(LIBRARY_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime/io")
+
+# where to put generated binaries
+set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime/io")
+
+add_library(Io
+  disk-io-mgr.cc
+  disk-io-mgr-stress.cc
+  request-context.cc
+  scan-range.cc
+)
+add_dependencies(Io gen-deps)
+
+# This test runs forever so should not be part of 'make test'
+add_executable(disk-io-mgr-stress-test disk-io-mgr-stress-test.cc)
+target_link_libraries(disk-io-mgr-stress-test ${IMPALA_TEST_LINK_LIBS})
+
+ADD_BE_TEST(disk-io-mgr-test)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/disk-io-mgr-internal.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr-internal.h b/be/src/runtime/io/disk-io-mgr-internal.h
new file mode 100644
index 0000000..3fc3895
--- /dev/null
+++ b/be/src/runtime/io/disk-io-mgr-internal.h
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_DISK_IO_MGR_INTERNAL_H
+#define IMPALA_RUNTIME_DISK_IO_MGR_INTERNAL_H
+
+#include <unistd.h>
+#include <queue>
+#include <boost/thread/locks.hpp>
+#include <gutil/strings/substitute.h>
+
+#include "common/logging.h"
+#include "runtime/io/request-context.h"
+#include "runtime/io/disk-io-mgr.h"
+#include "runtime/mem-tracker.h"
+#include "runtime/thread-resource-mgr.h"
+#include "util/condition-variable.h"
+#include "util/cpu-info.h"
+#include "util/debug-util.h"
+#include "util/disk-info.h"
+#include "util/filesystem-util.h"
+#include "util/hdfs-util.h"
+#include "util/impalad-metrics.h"
+
+/// This file contains internal structures shared between submodules of the IoMgr. Users
+/// of the IoMgr do not need to include this file.
+namespace impala {
+namespace io {
+
+/// Per disk state
+struct DiskIoMgr::DiskQueue {
+  /// Disk id (0-based)
+  int disk_id;
+
+  /// Lock that protects access to 'request_contexts' and 'work_available'
+  boost::mutex lock;
+
+  /// Condition variable to signal the disk threads that there is work to do or the
+  /// thread should shut down.  A disk thread will be woken up when there is a reader
+  /// added to the queue. A reader is only on the queue when it has at least one
+  /// scan range that is not blocked on available buffers.
+  ConditionVariable work_available;
+
+  /// list of all request contexts that have work queued on this disk
+  std::list<RequestContext*> request_contexts;
+
+  /// Enqueue the request context to the disk queue.  The DiskQueue lock must not be taken.
+  inline void EnqueueContext(RequestContext* worker) {
+    {
+      boost::unique_lock<boost::mutex> disk_lock(lock);
+      // Check that the reader is not already on the queue.
+      DCHECK(find(request_contexts.begin(), request_contexts.end(), worker) ==
+          request_contexts.end());
+      request_contexts.push_back(worker);
+    }
+    work_available.NotifyAll();
+  }
+
+  DiskQueue(int id) : disk_id(id) {}
+};
+}
+}
+
+#endif
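
EnqueueContext() above is the producer half of this queue; the disk threads are
the consumer half, blocking on 'work_available'. Notifying after the lock is
released avoids waking a thread only to have it block immediately on the mutex.
A standalone sketch of the consumer loop, using std:: primitives in place of
boost (names hypothetical; simplified relative to what WorkLoop() and
GetNextRequestRange() actually do):

    #include <condition_variable>
    #include <list>
    #include <mutex>

    struct Context;  // stands in for RequestContext

    struct Queue {
      std::mutex lock;
      std::condition_variable work_available;
      std::list<Context*> request_contexts;
      bool shut_down = false;

      // Disk thread: wait until some context has work or shutdown is requested.
      Context* NextContext() {
        std::unique_lock<std::mutex> l(lock);
        while (request_contexts.empty() && !shut_down) work_available.wait(l);
        if (request_contexts.empty()) return nullptr;  // shutting down
        Context* ctx = request_contexts.front();
        request_contexts.pop_front();  // re-enqueued later if it has more work
        return ctx;
      }
    };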

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/disk-io-mgr-stress-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr-stress-test.cc b/be/src/runtime/io/disk-io-mgr-stress-test.cc
new file mode 100644
index 0000000..45b36ed
--- /dev/null
+++ b/be/src/runtime/io/disk-io-mgr-stress-test.cc
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/io/disk-io-mgr-stress.h"
+#include "util/cpu-info.h"
+#include "util/string-parser.h"
+
+#include "common/names.h"
+
+using namespace impala;
+using namespace impala::io;
+
+// Simple utility to run the disk io stress test.  An optional second parameter
+// can be passed to control how long to run this test (0 for forever).
+
+// TODO: make these configurable once we decide how to run BE tests with args
+const int DEFAULT_DURATION_SEC = 1;
+const int NUM_DISKS = 5;
+const int NUM_THREADS_PER_DISK = 5;
+const int NUM_CLIENTS = 10;
+const bool TEST_CANCELLATION = true;
+
+int main(int argc, char** argv) {
+  google::InitGoogleLogging(argv[0]);
+  CpuInfo::Init();
+  OsInfo::Init();
+  impala::InitThreading();
+  int duration_sec = DEFAULT_DURATION_SEC;
+
+  if (argc == 2) {
+    StringParser::ParseResult status;
+    duration_sec = StringParser::StringToInt<int>(argv[1], strlen(argv[1]), &status);
+    if (status != StringParser::PARSE_SUCCESS) {
+      printf("Invalid arg: %s\n", argv[1]);
+      return 1;
+    }
+  }
+  if (duration_sec != 0) {
+    printf("Running stress test for %d seconds.\n", duration_sec);
+  } else {
+    printf("Running stress test indefinitely.\n");
+  }
+  DiskIoMgrStress test(NUM_DISKS, NUM_THREADS_PER_DISK, NUM_CLIENTS, TEST_CANCELLATION);
+  test.Run(duration_sec);
+
+  return 0;
+}
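
As the argument handling above shows, the binary takes one optional positional
argument: a duration in seconds, with 0 meaning run until killed. For example,
'disk-io-mgr-stress-test 60' runs the stress test for a minute, while
'disk-io-mgr-stress-test 0' runs it indefinitely.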

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/disk-io-mgr-stress.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr-stress.cc b/be/src/runtime/io/disk-io-mgr-stress.cc
new file mode 100644
index 0000000..8815357
--- /dev/null
+++ b/be/src/runtime/io/disk-io-mgr-stress.cc
@@ -0,0 +1,247 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <boost/thread/mutex.hpp>
+
+#include "runtime/io/disk-io-mgr-stress.h"
+
+#include "runtime/io/request-context.h"
+#include "util/time.h"
+
+#include "common/names.h"
+
+using namespace impala;
+using namespace impala::io;
+
+static const float ABORT_CHANCE = .10f;
+static const int MIN_READ_LEN = 1;
+static const int MAX_READ_LEN = 20;
+
+static const int MIN_FILE_LEN = 10;
+static const int MAX_FILE_LEN = 1024;
+
+// Make sure this is between MIN_FILE_LEN and MAX_FILE_LEN to test more cases
+static const int MIN_READ_BUFFER_SIZE = 64;
+static const int MAX_READ_BUFFER_SIZE = 128;
+
+static const int CANCEL_READER_PERIOD_MS = 20;  // in ms
+
+static void CreateTempFile(const char* filename, const char* data) {
+  FILE* file = fopen(filename, "w");
+  CHECK(file != NULL);
+  fwrite(data, 1, strlen(data), file);
+  fclose(file);
+}
+
+string GenerateRandomData() {
+  int rand_len = rand() % (MAX_FILE_LEN - MIN_FILE_LEN) + MIN_FILE_LEN;
+  stringstream ss;
+  for (int i = 0; i < rand_len; ++i) {
+    char c = rand() % 26 + 'a';
+    ss << c;
+  }
+  return ss.str();
+}
+
+struct DiskIoMgrStress::Client {
+  boost::mutex lock;
+  unique_ptr<RequestContext> reader;
+  int file_idx;
+  vector<ScanRange*> scan_ranges;
+  int abort_at_byte;
+  int files_processed;
+};
+
+DiskIoMgrStress::DiskIoMgrStress(int num_disks, int num_threads_per_disk,
+     int num_clients, bool includes_cancellation) :
+    num_clients_(num_clients),
+    includes_cancellation_(includes_cancellation) {
+
+  time_t rand_seed = time(NULL);
+  LOG(INFO) << "Running with rand seed: " << rand_seed;
+  srand(rand_seed);
+
+  io_mgr_.reset(new DiskIoMgr(num_disks, num_threads_per_disk, num_threads_per_disk,
+      MIN_READ_BUFFER_SIZE, MAX_READ_BUFFER_SIZE));
+  Status status = io_mgr_->Init(&mem_tracker_);
+  CHECK(status.ok());
+
+  // Initialize some data files.  It doesn't really matter how many there are.
+  files_.resize(num_clients * 2);
+  for (int i = 0; i < files_.size(); ++i) {
+    stringstream ss;
+    ss << "/tmp/disk_io_mgr_stress_file" << i;
+    files_[i].filename = ss.str();
+    files_[i].data = GenerateRandomData();
+    CreateTempFile(files_[i].filename.c_str(), files_[i].data.c_str());
+  }
+
+  clients_ = new Client[num_clients_];
+  client_mem_trackers_.resize(num_clients_);
+  for (int i = 0; i < num_clients_; ++i) {
+    NewClient(i);
+  }
+}
+
+void DiskIoMgrStress::ClientThread(int client_id) {
+  Client* client = &clients_[client_id];
+  Status status;
+  char read_buffer[MAX_FILE_LEN];
+
+  while (!shutdown_) {
+    bool eos = false;
+    int bytes_read = 0;
+
+    const string& expected = files_[client->file_idx].data;
+
+    while (!eos) {
+      ScanRange* range;
+      status = io_mgr_->GetNextRange(client->reader.get(), &range);
+      CHECK(status.ok() || status.IsCancelled());
+      if (range == NULL) break;
+
+      while (true) {
+        unique_ptr<BufferDescriptor> buffer;
+        status = range->GetNext(&buffer);
+        CHECK(status.ok() || status.IsCancelled());
+        if (buffer == NULL) break;
+
+        int64_t scan_range_offset = buffer->scan_range_offset();
+        int len = buffer->len();
+        CHECK_GE(scan_range_offset, 0);
+        CHECK_LT(scan_range_offset, expected.size());
+        CHECK_GT(len, 0);
+
+        // We get scan ranges back in arbitrary order, so translate the scan
+        // range offset to the file offset.
+        int64_t file_offset = scan_range_offset + range->offset();
+
+        // Validate the bytes read
+        CHECK_LE(file_offset + len, expected.size());
+        CHECK_EQ(strncmp(reinterpret_cast<char*>(buffer->buffer()),
+                     &expected.c_str()[file_offset], len), 0);
+
+        // Copy the bytes from this read into the result buffer.
+        memcpy(read_buffer + file_offset, buffer->buffer(), buffer->len());
+        io_mgr_->ReturnBuffer(move(buffer));
+        bytes_read += len;
+
+        CHECK_GE(bytes_read, 0);
+        CHECK_LE(bytes_read, expected.size());
+
+        if (bytes_read > client->abort_at_byte) {
+          eos = true;
+          break;
+        }
+      } // End of buffer
+    } // End of scan range
+
+    if (bytes_read == expected.size()) {
+      // This entire file was read without being cancelled, validate the entire result
+      CHECK(status.ok());
+      CHECK_EQ(strncmp(read_buffer, expected.c_str(), bytes_read), 0);
+    }
+
+    // Unregister the old client and get a new one
+    unique_lock<mutex> lock(client->lock);
+    io_mgr_->UnregisterContext(client->reader.get());
+    NewClient(client_id);
+  }
+
+  unique_lock<mutex> lock(client->lock);
+  io_mgr_->UnregisterContext(client->reader.get());
+  client->reader = NULL;
+}
+
+// Cancel a random reader
+void DiskIoMgrStress::CancelRandomReader() {
+  if (!includes_cancellation_) return;
+
+  int rand_client = rand() % num_clients_;
+
+  unique_lock<mutex> lock(clients_[rand_client].lock);
+  io_mgr_->CancelContext(clients_[rand_client].reader.get());
+}
+
+void DiskIoMgrStress::Run(int sec) {
+  shutdown_ = false;
+  for (int i = 0; i < num_clients_; ++i) {
+    readers_.add_thread(
+        new thread(&DiskIoMgrStress::ClientThread, this, i));
+  }
+
+  // Sleep and let the clients do their thing for 'sec'
+  for (int loop_count = 1; sec == 0 || loop_count <= sec; ++loop_count) {
+    int iter = 1000 / CANCEL_READER_PERIOD_MS;
+    for (int i = 0; i < iter; ++i) {
+      SleepForMs(CANCEL_READER_PERIOD_MS);
+      CancelRandomReader();
+    }
+    LOG(ERROR) << "Finished iteration: " << loop_count;
+  }
+
+  // Signal shutdown for the client threads
+  shutdown_ = true;
+
+  for (int i = 0; i < num_clients_; ++i) {
+    unique_lock<mutex> lock(clients_[i].lock);
+    if (clients_[i].reader != NULL) io_mgr_->CancelContext(clients_[i].reader.get());
+  }
+
+  readers_.join_all();
+}
+
+// Initialize a client to read one of the files at random.  The scan ranges are
+// assigned randomly.
+void DiskIoMgrStress::NewClient(int i) {
+  Client& client = clients_[i];
+  ++client.files_processed;
+  client.file_idx = rand() % files_.size();
+  int file_len = files_[client.file_idx].data.size();
+
+  client.abort_at_byte = file_len;
+
+  if (includes_cancellation_) {
+    float rand_value = rand() / (float)RAND_MAX;
+    if (rand_value < ABORT_CHANCE) {
+      // Abort at a random byte inside the file
+      client.abort_at_byte = rand() % file_len;
+    }
+  }
+
+  for (int j = 0; j < client.scan_ranges.size(); ++j) {
+    delete client.scan_ranges[j];
+  }
+  client.scan_ranges.clear();
+
+  int assigned_len = 0;
+  while (assigned_len < file_len) {
+    int range_len = rand() % (MAX_READ_LEN - MIN_READ_LEN) + MIN_READ_LEN;
+    range_len = min(range_len, file_len - assigned_len);
+
+    ScanRange* range = new ScanRange();
+    range->Reset(NULL, files_[client.file_idx].filename.c_str(), range_len, assigned_len,
+        0, false, BufferOpts::Uncached());
+    client.scan_ranges.push_back(range);
+    assigned_len += range_len;
+  }
+
+  client_mem_trackers_[i].reset(new MemTracker(-1, "", &mem_tracker_));
+  client.reader = io_mgr_->RegisterContext(client_mem_trackers_[i].get());
+  Status status = io_mgr_->AddScanRanges(client.reader.get(), client.scan_ranges);
+  CHECK(status.ok());
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/disk-io-mgr-stress.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/disk-io-mgr-stress.h b/be/src/runtime/io/disk-io-mgr-stress.h
new file mode 100644
index 0000000..b872694
--- /dev/null
+++ b/be/src/runtime/io/disk-io-mgr-stress.h
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+#ifndef IMPALA_RUNTIME_DISK_IO_MGR_STRESS_H
+#define IMPALA_RUNTIME_DISK_IO_MGR_STRESS_H
+
+#include <memory>
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+#include <boost/thread/thread.hpp>
+
+#include "runtime/io/disk-io-mgr.h"
+#include "runtime/mem-tracker.h"
+#include "runtime/thread-resource-mgr.h"
+
+namespace impala {
+namespace io {
+
+/// Test utility to stress the disk io mgr.  It allows for a configurable
+/// number of clients.  The clients continuously issue work to the io mgr and
+/// asynchronously get cancelled.  The stress test can be run forever or for
+/// a fixed duration.  The unit test runs this for a fixed duration.
+class DiskIoMgrStress {
+ public:
+  DiskIoMgrStress(int num_disks, int num_threads_per_disk, int num_clients,
+      bool includes_cancellation);
+
+  /// Run the test for 'sec'.  If 0, run forever
+  void Run(int sec);
+
+ private:
+  struct Client;
+
+  struct File {
+    std::string filename;
+    std::string data;  // the data in the file, used to validate
+  };
+
+
+  /// Files used for testing.  These are created at startup and recycled
+  /// during the test
+  std::vector<File> files_;
+
+  /// Root mem tracker.
+  MemTracker mem_tracker_;
+
+  /// io manager
+  boost::scoped_ptr<DiskIoMgr> io_mgr_;
+
+  /// Thread group for reader threads
+  boost::thread_group readers_;
+
+  /// Array of clients
+  int num_clients_;
+  Client* clients_;
+
+  /// Client MemTrackers, one per client.
+  std::vector<std::unique_ptr<MemTracker>> client_mem_trackers_;
+
+  /// If true, tests cancelling readers
+  bool includes_cancellation_;
+
+  /// Flag to signal that client reader threads should exit
+  volatile bool shutdown_;
+
+  /// Helper to initialize a new reader client, registering a new reader with the
+  /// io mgr and initializing the scan ranges
+  void NewClient(int i);
+
+  /// Thread running the reader.  When the current reader is done (either normally
+  /// or cancelled), it picks up a new reader
+  void ClientThread(int client_id);
+
+  /// Possibly cancels a random reader.
+  void CancelRandomReader();
+};
+}
+}
+
+#endif


[09/16] incubator-impala git commit: IMPALA-4835 (prep only): create io subfolder and namespace

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/handle-cache.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/handle-cache.inline.h b/be/src/runtime/io/handle-cache.inline.h
new file mode 100644
index 0000000..10db49e
--- /dev/null
+++ b/be/src/runtime/io/handle-cache.inline.h
@@ -0,0 +1,232 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <tuple>
+
+#include "runtime/io/handle-cache.h"
+#include "util/hash-util.h"
+#include "util/time.h"
+
+#ifndef IMPALA_RUNTIME_DISK_IO_MGR_HANDLE_CACHE_INLINE_H
+#define IMPALA_RUNTIME_DISK_IO_MGR_HANDLE_CACHE_INLINE_H
+
+namespace impala {
+namespace io {
+
+HdfsFileHandle::HdfsFileHandle(const hdfsFS& fs, const char* fname,
+    int64_t mtime)
+    : fs_(fs), hdfs_file_(hdfsOpenFile(fs, fname, O_RDONLY, 0, 0, 0)), mtime_(mtime) {
+  ImpaladMetrics::IO_MGR_NUM_CACHED_FILE_HANDLES->Increment(1L);
+  VLOG_FILE << "hdfsOpenFile() file=" << fname << " fid=" << hdfs_file_;
+}
+
+HdfsFileHandle::~HdfsFileHandle() {
+  if (hdfs_file_ != nullptr && fs_ != nullptr) {
+    ImpaladMetrics::IO_MGR_NUM_CACHED_FILE_HANDLES->Increment(-1L);
+    VLOG_FILE << "hdfsCloseFile() fid=" << hdfs_file_;
+    hdfsCloseFile(fs_, hdfs_file_);
+  }
+  fs_ = nullptr;
+  hdfs_file_ = nullptr;
+}
+
+template <size_t NUM_PARTITIONS>
+  FileHandleCache<NUM_PARTITIONS>::FileHandleCache(size_t capacity,
+      uint64_t unused_handle_timeout_secs)
+  : unused_handle_timeout_secs_(unused_handle_timeout_secs) {
+  DCHECK_GT(NUM_PARTITIONS, 0);
+  size_t remainder = capacity % NUM_PARTITIONS;
+  size_t base_capacity = capacity / NUM_PARTITIONS;
+  size_t partition_capacity = (remainder > 0 ? base_capacity + 1 : base_capacity);
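+  // Worked example (hypothetical numbers): capacity = 100 with NUM_PARTITIONS = 16
+  // gives base_capacity = 6 and remainder = 4, so every partition gets capacity 7.
+  // Rounding up over-provisions slightly (16 * 7 = 112 >= 100) rather than
+  // starving any partition.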
+  for (FileHandleCachePartition& p : cache_partitions_) {
+    p.size = 0;
+    p.capacity = partition_capacity;
+  }
+}
+
+template <size_t NUM_PARTITIONS>
+FileHandleCache<NUM_PARTITIONS>::LruListEntry::LruListEntry(
+    typename MapType::iterator map_entry_in)
+     : map_entry(map_entry_in), timestamp_seconds(MonotonicSeconds()) {}
+
+template <size_t NUM_PARTITIONS>
+FileHandleCache<NUM_PARTITIONS>::~FileHandleCache() {
+  shut_down_promise_.Set(true);
+  if (eviction_thread_ != nullptr) eviction_thread_->Join();
+}
+
+template <size_t NUM_PARTITIONS>
+Status FileHandleCache<NUM_PARTITIONS>::Init() {
+  return Thread::Create("disk-io-mgr-handle-cache", "File Handle Timeout",
+      &FileHandleCache<NUM_PARTITIONS>::EvictHandlesLoop, this, &eviction_thread_);
+}
+
+template <size_t NUM_PARTITIONS>
+HdfsFileHandle* FileHandleCache<NUM_PARTITIONS>::GetFileHandle(
+    const hdfsFS& fs, std::string* fname, int64_t mtime, bool require_new_handle,
+    bool* cache_hit) {
+  // Hash the key and get appropriate partition
+  int index = HashUtil::Hash(fname->data(), fname->size(), 0) % NUM_PARTITIONS;
+  FileHandleCachePartition& p = cache_partitions_[index];
+  boost::lock_guard<SpinLock> g(p.lock);
+  std::pair<typename MapType::iterator, typename MapType::iterator> range =
+    p.cache.equal_range(*fname);
+
+  // If this requires a new handle, skip to the creation codepath. Otherwise,
+  // find an unused entry with the same mtime
+  FileHandleEntry* ret_elem = nullptr;
+  if (!require_new_handle) {
+    while (range.first != range.second) {
+      FileHandleEntry* elem = &range.first->second;
+      if (!elem->in_use && elem->fh->mtime() == mtime) {
+        // This element is currently in the lru_list, which means that lru_entry must
+        // be an iterator pointing into the lru_list.
+        DCHECK(elem->lru_entry != p.lru_list.end());
+        // Remove the element from the lru_list and designate that it is not on
+        // the lru_list by resetting its iterator to point to the end of the list.
+        p.lru_list.erase(elem->lru_entry);
+        elem->lru_entry = p.lru_list.end();
+        ret_elem = elem;
+        *cache_hit = true;
+        break;
+      }
+      ++range.first;
+    }
+  }
+
+  // There was no free entry, or the caller asked for a new handle.
+  if (!ret_elem) {
+    *cache_hit = false;
+    // Create a new entry and move it into the map
+    HdfsFileHandle* new_fh = new HdfsFileHandle(fs, fname->data(), mtime);
+    if (!new_fh->ok()) {
+      delete new_fh;
+      return nullptr;
+    }
+    FileHandleEntry entry(new_fh, p.lru_list);
+    typename MapType::iterator new_it = p.cache.emplace_hint(range.second,
+        *fname, std::move(entry));
+    ret_elem = &new_it->second;
+    ++p.size;
+    if (p.size > p.capacity) EvictHandles(p);
+  }
+
+  DCHECK(ret_elem->fh.get() != nullptr);
+  DCHECK(!ret_elem->in_use);
+  ret_elem->in_use = true;
+  ImpaladMetrics::IO_MGR_NUM_FILE_HANDLES_OUTSTANDING->Increment(1L);
+  return ret_elem->fh.get();
+}
+
+template <size_t NUM_PARTITIONS>
+void FileHandleCache<NUM_PARTITIONS>::ReleaseFileHandle(std::string* fname,
+    HdfsFileHandle* fh, bool destroy_handle) {
+  DCHECK(fh != nullptr);
+  // Hash the key and get appropriate partition
+  int index = HashUtil::Hash(fname->data(), fname->size(), 0) % NUM_PARTITIONS;
+  FileHandleCachePartition& p = cache_partitions_[index];
+  boost::lock_guard<SpinLock> g(p.lock);
+  std::pair<typename MapType::iterator, typename MapType::iterator> range =
+    p.cache.equal_range(*fname);
+
+  // TODO: This can be optimized by maintaining some state in the file handle about
+  // its location in the map.
+  typename MapType::iterator release_it = range.first;
+  while (release_it != range.second) {
+    FileHandleEntry* elem = &release_it->second;
+    if (elem->fh.get() == fh) break;
+    ++release_it;
+  }
+  DCHECK(release_it != range.second);
+
+  // This file handle is no longer referenced
+  FileHandleEntry* release_elem = &release_it->second;
+  DCHECK(release_elem->in_use);
+  release_elem->in_use = false;
+  ImpaladMetrics::IO_MGR_NUM_FILE_HANDLES_OUTSTANDING->Increment(-1L);
+  if (destroy_handle) {
+    --p.size;
+    p.cache.erase(release_it);
+    return;
+  }
+  // Hdfs can use some memory for readahead buffering. Calling unbuffer reduces
+  // this buffering so that the file handle takes up less memory when in the cache.
+  // If unbuffering is not supported, then hdfsUnbufferFile() will return a non-zero
+  // return code, and we close the file handle and remove it from the cache.
+  if (hdfsUnbufferFile(release_elem->fh->file()) == 0) {
+    // This FileHandleEntry must not be in the lru list already, because it was
+    // in use. Verify this by checking that the lru_entry is pointing to the end,
+    // which cannot be true for any element in the lru list.
+    DCHECK(release_elem->lru_entry == p.lru_list.end());
+    // Add this to the lru list, establishing links in both directions.
+    // The FileHandleEntry has an iterator to the LruListEntry and the
+    // LruListEntry has an iterator to the location of the FileHandleEntry in
+    // the cache.
+    release_elem->lru_entry = p.lru_list.emplace(p.lru_list.end(), release_it);
+    if (p.size > p.capacity) EvictHandles(p);
+  } else {
+    VLOG_FILE << "FS does not support file handle unbuffering, closing file="
+              << fname;
+    --p.size;
+    p.cache.erase(release_it);
+  }
+}
+
+template <size_t NUM_PARTITIONS>
+void FileHandleCache<NUM_PARTITIONS>::EvictHandlesLoop() {
+  while (true) {
+    for (FileHandleCachePartition& p : cache_partitions_) {
+      boost::lock_guard<SpinLock> g(p.lock);
+      EvictHandles(p);
+    }
+    // This Get() will time out until shutdown, when the promise is set.
+    bool timed_out;
+    shut_down_promise_.Get(EVICT_HANDLES_PERIOD_MS, &timed_out);
+    if (!timed_out) break;
+  }
+  // The promise must be set to true.
+  DCHECK(shut_down_promise_.IsSet());
+  DCHECK(shut_down_promise_.Get());
+}
+
+template <size_t NUM_PARTITIONS>
+void FileHandleCache<NUM_PARTITIONS>::EvictHandles(
+    FileHandleCache<NUM_PARTITIONS>::FileHandleCachePartition& p) {
+  uint64_t now = MonotonicSeconds();
+  uint64_t oldest_allowed_timestamp =
+      now > unused_handle_timeout_secs_ ? now - unused_handle_timeout_secs_ : 0;
+  while (p.lru_list.size() > 0) {
+    // Peek at the oldest element
+    LruListEntry oldest_entry = p.lru_list.front();
+    typename MapType::iterator oldest_entry_map_it = oldest_entry.map_entry;
+    uint64_t oldest_entry_timestamp = oldest_entry.timestamp_seconds;
+    // If the oldest element does not need to be aged out and the cache is not over
+    // capacity, then we are done and there is nothing to evict.
+    if (p.size <= p.capacity && (unused_handle_timeout_secs_ == 0 ||
+        oldest_entry_timestamp >= oldest_allowed_timestamp)) {
+      return;
+    }
+    // Evict the oldest element
+    DCHECK(!oldest_entry_map_it->second.in_use);
+    p.cache.erase(oldest_entry_map_it);
+    p.lru_list.pop_front();
+    --p.size;
+  }
+}
+}
+}
+#endif

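The file handle cache above pairs each partition's multimap with an LRU list,
where the map entry and the LRU node each hold an iterator into the other.
Below is a minimal, editorial sketch of that eviction loop for a single
partition with no locking; the names, timestamps, and capacity are
illustrative, not part of the patch:

  #include <cstdint>
  #include <list>
  #include <map>
  #include <string>
  #include <utility>

  struct Handle { int fd; bool in_use; };
  using Cache = std::multimap<std::string, Handle>;

  int main() {
    Cache cache;
    // Front = oldest. Each LRU node remembers the map entry it shadows and
    // when that entry became unused, enabling age-based eviction.
    std::list<std::pair<Cache::iterator, uint64_t>> lru;
    lru.emplace_back(cache.emplace("f1", Handle{42, false}), /*timestamp=*/100);

    const size_t capacity = 0;          // force a capacity-based eviction
    const uint64_t oldest_allowed = 0;  // disable age-based eviction this run
    while (!lru.empty()) {
      auto& oldest = lru.front();
      // Stop when under capacity and the oldest entry is young enough.
      if (cache.size() <= capacity && oldest.second >= oldest_allowed) break;
      cache.erase(oldest.first);        // erase the map entry via its iterator
      lru.pop_front();
    }
    return cache.empty() ? 0 : 1;
  }
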
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/request-context.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/request-context.cc b/be/src/runtime/io/request-context.cc
new file mode 100644
index 0000000..287f53a
--- /dev/null
+++ b/be/src/runtime/io/request-context.cc
@@ -0,0 +1,293 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/io/disk-io-mgr-internal.h"
+
+#include "common/names.h"
+
+using namespace impala;
+using namespace impala::io;
+
+void RequestContext::Cancel(const Status& status) {
+  DCHECK(!status.ok());
+
+  // Callbacks are collected in this vector and invoked while no lock is held.
+  vector<WriteRange::WriteDoneCallback> write_callbacks;
+  {
+    lock_guard<mutex> lock(lock_);
+    DCHECK(Validate()) << endl << DebugString();
+
+    // Already being cancelled
+    if (state_ == RequestContext::Cancelled) return;
+
+    DCHECK(status_.ok());
+    status_ = status;
+
+    // The reader is put into the cancelled state until all cleanup is complete.
+    state_ = RequestContext::Cancelled;
+
+    // Cancel all scan ranges for this reader. Each range could be on one of
+    // four queues.
+    for (int i = 0; i < disk_states_.size(); ++i) {
+      RequestContext::PerDiskState& state = disk_states_[i];
+      RequestRange* range = NULL;
+      while ((range = state.in_flight_ranges()->Dequeue()) != NULL) {
+        if (range->request_type() == RequestType::READ) {
+          static_cast<ScanRange*>(range)->Cancel(status);
+        } else {
+          DCHECK(range->request_type() == RequestType::WRITE);
+          write_callbacks.push_back(static_cast<WriteRange*>(range)->callback_);
+        }
+      }
+
+      ScanRange* scan_range;
+      while ((scan_range = state.unstarted_scan_ranges()->Dequeue()) != NULL) {
+        scan_range->Cancel(status);
+      }
+      WriteRange* write_range;
+      while ((write_range = state.unstarted_write_ranges()->Dequeue()) != NULL) {
+        write_callbacks.push_back(write_range->callback_);
+      }
+    }
+
+    ScanRange* range = NULL;
+    while ((range = ready_to_start_ranges_.Dequeue()) != NULL) {
+      range->Cancel(status);
+    }
+    while ((range = blocked_ranges_.Dequeue()) != NULL) {
+      range->Cancel(status);
+    }
+    while ((range = cached_ranges_.Dequeue()) != NULL) {
+      range->Cancel(status);
+    }
+
+    // Schedule reader on all disks. The disks will notice it is cancelled and do any
+    // required cleanup
+    for (int i = 0; i < disk_states_.size(); ++i) {
+      RequestContext::PerDiskState& state = disk_states_[i];
+      state.ScheduleContext(this, i);
+    }
+  }
+
+  for (const WriteRange::WriteDoneCallback& write_callback: write_callbacks) {
+    write_callback(status_);
+  }
+
+  // Signal reader and unblock the GetNext/Read thread.  That read will fail with
+  // a cancelled status.
+  ready_to_start_ranges_cv_.NotifyAll();
+}
+
+void RequestContext::CancelAndMarkInactive() {
+  Cancel(Status::CANCELLED);
+
+  boost::unique_lock<boost::mutex> l(lock_);
+  DCHECK_NE(state_, Inactive);
+  DCHECK(Validate()) << endl << DebugString();
+
+  // Wait until the ranges finish up.
+  while (num_disks_with_ranges_ > 0) disks_complete_cond_var_.Wait(l);
+
+  // Validate that no buffers were leaked from this context.
+  DCHECK_EQ(num_buffers_in_reader_.Load(), 0) << endl << DebugString();
+  DCHECK_EQ(num_used_buffers_.Load(), 0) << endl << DebugString();
+  DCHECK(Validate()) << endl << DebugString();
+  state_ = Inactive;
+}
+
+void RequestContext::AddRequestRange(
+    RequestRange* range, bool schedule_immediately) {
+  // DCHECK(lock_.is_locked()); // TODO: boost should have this API
+  RequestContext::PerDiskState& state = disk_states_[range->disk_id()];
+  if (state.done()) {
+    DCHECK_EQ(state.num_remaining_ranges(), 0);
+    state.set_done(false);
+    ++num_disks_with_ranges_;
+  }
+
+  bool schedule_context;
+  if (range->request_type() == RequestType::READ) {
+    ScanRange* scan_range = static_cast<ScanRange*>(range);
+    if (schedule_immediately) {
+      ScheduleScanRange(scan_range);
+    } else {
+      state.unstarted_scan_ranges()->Enqueue(scan_range);
+      num_unstarted_scan_ranges_.Add(1);
+    }
+    // If next_scan_range_to_start is NULL, schedule this RequestContext so that it will
+    // be set. If it's not NULL, this context will be scheduled when GetNextRange() is
+    // invoked.
+    schedule_context = state.next_scan_range_to_start() == NULL;
+  } else {
+    DCHECK(range->request_type() == RequestType::WRITE);
+    DCHECK(!schedule_immediately);
+    WriteRange* write_range = static_cast<WriteRange*>(range);
+    state.unstarted_write_ranges()->Enqueue(write_range);
+
+    // ScheduleContext() has no effect if the context is already scheduled,
+    // so this is safe.
+    schedule_context = true;
+  }
+
+  if (schedule_context) state.ScheduleContext(this, range->disk_id());
+  ++state.num_remaining_ranges();
+}
+
+RequestContext::RequestContext(
+    DiskIoMgr* parent, int num_disks, MemTracker* tracker)
+  : parent_(parent), mem_tracker_(tracker), disk_states_(num_disks) {}
+
+// Dumps out request context information. Lock should be taken by caller
+string RequestContext::DebugString() const {
+  stringstream ss;
+  ss << endl << "  RequestContext: " << (void*)this << " (state=";
+  if (state_ == RequestContext::Inactive) ss << "Inactive";
+  if (state_ == RequestContext::Cancelled) ss << "Cancelled";
+  if (state_ == RequestContext::Active) ss << "Active";
+  if (state_ != RequestContext::Inactive) {
+    ss << " status_=" << (status_.ok() ? "OK" : status_.GetDetail())
+       << " #ready_buffers=" << num_ready_buffers_.Load()
+       << " #used_buffers=" << num_used_buffers_.Load()
+       << " #num_buffers_in_reader=" << num_buffers_in_reader_.Load()
+       << " #finished_scan_ranges=" << num_finished_ranges_.Load()
+       << " #disk_with_ranges=" << num_disks_with_ranges_
+       << " #disks=" << num_disks_with_ranges_;
+    for (int i = 0; i < disk_states_.size(); ++i) {
+      ss << endl << "   " << i << ": "
+         << "is_on_queue=" << disk_states_[i].is_on_queue()
+         << " done=" << disk_states_[i].done()
+         << " #num_remaining_scan_ranges=" << disk_states_[i].num_remaining_ranges()
+         << " #in_flight_ranges=" << disk_states_[i].in_flight_ranges()->size()
+         << " #unstarted_scan_ranges=" << disk_states_[i].unstarted_scan_ranges()->size()
+         << " #unstarted_write_ranges="
+         << disk_states_[i].unstarted_write_ranges()->size()
+         << " #reading_threads=" << disk_states_[i].num_threads_in_op();
+    }
+  }
+  ss << ")";
+  return ss.str();
+}
+
+bool RequestContext::Validate() const {
+  if (state_ == RequestContext::Inactive) {
+    LOG(WARNING) << "state_ == RequestContext::Inactive";
+    return false;
+  }
+
+  if (num_used_buffers_.Load() < 0) {
+    LOG(WARNING) << "num_used_buffers_ < 0: #used=" << num_used_buffers_.Load();
+    return false;
+  }
+
+  if (num_ready_buffers_.Load() < 0) {
+    LOG(WARNING) << "num_ready_buffers_ < 0: #used=" << num_ready_buffers_.Load();
+    return false;
+  }
+
+  int total_unstarted_ranges = 0;
+  for (int i = 0; i < disk_states_.size(); ++i) {
+    const PerDiskState& state = disk_states_[i];
+    bool on_queue = state.is_on_queue();
+    int num_reading_threads = state.num_threads_in_op();
+
+    total_unstarted_ranges += state.unstarted_scan_ranges()->size();
+
+    if (num_reading_threads < 0) {
+      LOG(WARNING) << "disk_id=" << i
+                   << "state.num_threads_in_read < 0: #threads="
+                   << num_reading_threads;
+      return false;
+    }
+
+    if (state_ != RequestContext::Cancelled) {
+      if (state.unstarted_scan_ranges()->size() + state.in_flight_ranges()->size() >
+          state.num_remaining_ranges()) {
+        LOG(WARNING) << "disk_id=" << i
+                     << " state.unstarted_ranges.size() + state.in_flight_ranges.size()"
+                     << " > state.num_remaining_ranges:"
+                     << " #unscheduled=" << state.unstarted_scan_ranges()->size()
+                     << " #in_flight=" << state.in_flight_ranges()->size()
+                     << " #remaining=" << state.num_remaining_ranges();
+        return false;
+      }
+
+      // If we have an in_flight range, the reader must be on the queue or have a
+      // thread actively reading for it.
+      if (!state.in_flight_ranges()->empty() && !on_queue && num_reading_threads == 0) {
+        LOG(WARNING) << "disk_id=" << i
+                     << " reader has inflight ranges but is not on the disk queue."
+                     << " #in_flight_ranges=" << state.in_flight_ranges()->size()
+                     << " #reading_threads=" << num_reading_threads
+                     << " on_queue=" << on_queue;
+        return false;
+      }
+
+      if (state.done() && num_reading_threads > 0) {
+        LOG(WARNING) << "disk_id=" << i
+                     << " state set to done but there are still threads working."
+                     << " #reading_threads=" << num_reading_threads;
+        return false;
+      }
+    } else {
+      // Is Cancelled
+      if (!state.in_flight_ranges()->empty()) {
+        LOG(WARNING) << "disk_id=" << i
+                     << "Reader cancelled but has in flight ranges.";
+        return false;
+      }
+      if (!state.unstarted_scan_ranges()->empty()) {
+        LOG(WARNING) << "disk_id=" << i
+                     << "Reader cancelled but has unstarted ranges.";
+        return false;
+      }
+    }
+
+    if (state.done() && on_queue) {
+      LOG(WARNING) << "disk_id=" << i
+                   << " state set to done but the reader is still on the disk queue."
+                   << " state.done=true and state.is_on_queue=true";
+      return false;
+    }
+  }
+
+  if (state_ != RequestContext::Cancelled) {
+    if (total_unstarted_ranges != num_unstarted_scan_ranges_.Load()) {
+      LOG(WARNING) << "total_unstarted_ranges=" << total_unstarted_ranges
+                   << " sum_in_states=" << num_unstarted_scan_ranges_.Load();
+      return false;
+    }
+  } else {
+    if (!ready_to_start_ranges_.empty()) {
+      LOG(WARNING) << "Reader cancelled but has ready to start ranges.";
+      return false;
+    }
+    if (!blocked_ranges_.empty()) {
+      LOG(WARNING) << "Reader cancelled but has blocked ranges.";
+      return false;
+    }
+  }
+
+  return true;
+}
+
+void RequestContext::PerDiskState::ScheduleContext(
+    RequestContext* context, int disk_id) {
+  if (!is_on_queue_ && !done_) {
+    is_on_queue_ = true;
+    context->parent_->disk_queues_[disk_id]->EnqueueContext(context);
+  }
+}

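RequestContext::Cancel() above uses a classic locking discipline: the write
callbacks are collected into a local vector while the context lock is held,
then invoked only after the lock is released, so a callback can never re-enter
the context and deadlock. A minimal, editorial sketch of just that pattern
(class and names are illustrative, not from the patch):

  #include <functional>
  #include <mutex>
  #include <utility>
  #include <vector>

  class Cancellable {
   public:
    void AddCallback(std::function<void()> cb) {
      std::lock_guard<std::mutex> l(lock_);
      callbacks_.push_back(std::move(cb));
    }

    void Cancel() {
      std::vector<std::function<void()>> to_run;
      {
        std::lock_guard<std::mutex> l(lock_);
        if (cancelled_) return;   // already being cancelled
        cancelled_ = true;
        to_run.swap(callbacks_);  // steal the pending callbacks
      }
      // No lock held here: callbacks may safely call back into this object.
      for (auto& cb : to_run) cb();
    }

   private:
    std::mutex lock_;
    bool cancelled_ = false;
    std::vector<std::function<void()>> callbacks_;
  };

  int main() {
    Cancellable c;
    c.AddCallback([] { /* notify a client */ });
    c.Cancel();
    return 0;
  }
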
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/request-context.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/request-context.h b/be/src/runtime/io/request-context.h
new file mode 100644
index 0000000..9807805
--- /dev/null
+++ b/be/src/runtime/io/request-context.h
@@ -0,0 +1,403 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_IO_REQUEST_CONTEXT_H
+#define IMPALA_RUNTIME_IO_REQUEST_CONTEXT_H
+
+#include "runtime/io/disk-io-mgr.h"
+#include "util/condition-variable.h"
+
+namespace impala {
+namespace io {
+/// A request context is used to group together I/O requests belonging to a client of the
+/// I/O manager for management and scheduling. For most I/O manager clients it is an
+/// opaque pointer, but some clients may need to include this header, e.g. to make the
+/// unique_ptr<RequestContext> destructor work correctly.
+///
+/// Implementation Details
+/// ======================
+/// This object maintains a lot of state that is carefully synchronized. The context
+/// maintains state across all disks as well as per disk state.
+/// The unit for an IO request is a RequestRange, which may be a ScanRange or a
+/// WriteRange.
+/// A scan range for the reader is in one of five states:
+/// 1) PerDiskState's unstarted_ranges: This range has only been queued
+///    and nothing has been read from it.
+/// 2) RequestContext's ready_to_start_ranges_: This range is about to be started.
+///    As soon as the reader picks it up, it will move to the in_flight_ranges
+///    queue.
+/// 3) PerDiskState's in_flight_ranges: This range is being processed and will
+///    be read from the next time a disk thread picks it up in GetNextRequestRange()
+/// 4) ScanRange's outgoing ready buffers is full. We can't read for this range
+///    anymore. We need the caller to pull a buffer off which will put this in
+///    the in_flight_ranges queue. These ranges are in the RequestContext's
+///    blocked_ranges_ queue.
+/// 5) ScanRange is cached and in the cached_ranges_ queue.
+//
+/// If the scan range is read and does not get blocked on the outgoing queue, the
+/// transitions are: 1 -> 2 -> 3.
+/// If the scan range does get blocked, the transitions are
+/// 1 -> 2 -> 3 -> (4 -> 3)*
+//
+/// In the case of a cached scan range, the range is immediately put in cached_ranges_.
+/// When the caller asks for the next range to process, we first pull ranges from
+/// the cached_ranges_ queue. If the range was cached, the range is removed and
+/// done (ranges are either entirely cached or not at all). If the cached read attempt
+/// fails, we put the range in state 1.
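+///
+/// Summarized as an editorial sketch of the transitions above:
+///   (1) unstarted -> (2) ready_to_start -> (3) in_flight -> done,
+///   with (3) <-> (4) blocked repeating while the ready-buffer queue fills
+///   and drains, and (5) cached either completing immediately or falling
+///   back to (1) if the cached read attempt fails.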
+//
+/// A write range for a context may be in one of two lists:
+/// 1) unstarted_write_ranges_ : Ranges that have been queued but not processed.
+/// 2) in_flight_ranges_: The write range is ready to be processed by the next disk thread
+///    that picks it up in GetNextRequestRange().
+//
+/// AddWriteRange() adds WriteRanges for a disk.
+/// It is the responsibility of the client to pin the data to be written via a WriteRange
+/// in memory. After a WriteRange has been written, a callback is invoked to inform the
+/// client that the write has completed.
+//
+/// An important assumption is that write does not exceed the maximum read size and that
+/// the entire range is written when the write request is handled. (In other words, writes
+/// are not broken up.)
+//
+/// When a RequestContext is processed by a disk thread in GetNextRequestRange(),
+/// a write range is always removed from the list of unstarted write ranges and appended
+/// to the in_flight_ranges_ queue. This is done to alternate reads and writes - a read
+/// that is scheduled (by calling GetNextRange()) is always followed by a write (if one
+/// exists).  And since at most one WriteRange can be present in in_flight_ranges_ at any
+/// time (once a write range is returned from GetNextRequestRange() it is completed and
+/// not re-enqueued), a scan range scheduled via a call to GetNextRange() can be queued up
+/// behind at most one write range.
+class RequestContext {
+ public:
+  ~RequestContext() { DCHECK_EQ(state_, Inactive) << "Must be unregistered."; }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(RequestContext);
+  friend class DiskIoMgr;
+  friend class ScanRange;
+
+  class PerDiskState;
+
+  enum State {
+    /// Reader is initialized and maps to a client
+    Active,
+
+    /// Reader is in the process of being cancelled.  Cancellation is coordinated between
+    /// different threads and when they are all complete, the reader context is moved to
+    /// the inactive state.
+    Cancelled,
+
+    /// Reader context does not map to a client.  Accessing memory in this context
+    /// is invalid (i.e. it is equivalent to a dangling pointer).
+    Inactive,
+  };
+
+  RequestContext(DiskIoMgr* parent, int num_disks, MemTracker* tracker);
+
+  /// Decrements the number of active disks for this reader.  If the disk count
+  /// goes to 0, the disk complete condition variable is signaled.
+  /// Reader lock must be taken before this call.
+  void DecrementDiskRefCount() {
+    // boost doesn't let us dcheck that the reader lock is taken
+    DCHECK_GT(num_disks_with_ranges_, 0);
+    if (--num_disks_with_ranges_ == 0) {
+      disks_complete_cond_var_.NotifyAll();
+    }
+    DCHECK(Validate()) << std::endl << DebugString();
+  }
+
+  /// Reader & Disk Scheduling: Readers that currently can't do work are not on
+  /// the disk's queue. These readers are ones that don't have any ranges in the
+  /// in_flight_queue AND have not prepared a range by setting next_range_to_start.
+  /// The rule to make sure readers are scheduled correctly is to ensure anytime a
+  /// range is put on the in_flight_queue or anytime next_range_to_start is set to
+  /// NULL, the reader is scheduled.
+
+  /// Adds range to in_flight_ranges, scheduling this reader on the disk threads
+  /// if necessary.
+  /// Reader lock must be taken before this.
+  void ScheduleScanRange(ScanRange* range) {
+    DCHECK_EQ(state_, Active);
+    DCHECK(range != NULL);
+    RequestContext::PerDiskState& state = disk_states_[range->disk_id()];
+    state.in_flight_ranges()->Enqueue(range);
+    state.ScheduleContext(this, range->disk_id());
+  }
+
+  /// Cancels the context with status code 'status'
+  void Cancel(const Status& status);
+
+  /// Cancel the context if not already cancelled, wait for all scan ranges to finish
+  /// and mark the context as inactive, after which it cannot be used.
+  void CancelAndMarkInactive();
+
+  /// Adds request range to disk queue for this request context. Currently,
+  /// schedule_immediately must be false if the RequestRange is a write range.
+  void AddRequestRange(RequestRange* range, bool schedule_immediately);
+
+  /// Validates invariants of reader.  Reader lock must be taken beforehand.
+  bool Validate() const;
+
+  /// Dumps out reader information.  Lock should be taken by caller
+  std::string DebugString() const;
+
+  /// Parent object
+  DiskIoMgr* const parent_;
+
+  /// Memory used for this reader.  This is unowned by this object.
+  MemTracker* const mem_tracker_;
+
+  /// Total bytes read for this reader
+  RuntimeProfile::Counter* bytes_read_counter_ = nullptr;
+
+  /// Total time spent in hdfs reading
+  RuntimeProfile::Counter* read_timer_ = nullptr;
+
+  /// Number of active read threads
+  RuntimeProfile::Counter* active_read_thread_counter_ = nullptr;
+
+  /// Disk access bitmap. The counter's bit[i] is set if disk id i has been accessed.
+  /// TODO: we can only support up to 64 disks with this bitmap but it lets us use a
+  /// builtin atomic instruction. Probably good enough for now.
+  RuntimeProfile::Counter* disks_accessed_bitmap_ = nullptr;
+
+  /// Total number of bytes read locally, updated at end of each range scan
+  AtomicInt64 bytes_read_local_{0};
+
+  /// Total number of bytes read via short circuit read, updated at end of each range scan
+  AtomicInt64 bytes_read_short_circuit_{0};
+
+  /// Total number of bytes read from data node cache, updated at end of each range scan
+  AtomicInt64 bytes_read_dn_cache_{0};
+
+  /// Total number of bytes from remote reads that were expected to be local.
+  AtomicInt64 unexpected_remote_bytes_{0};
+
+  /// The number of buffers that have been returned to the reader (via GetNext) that the
+  /// reader has not returned. Only included for debugging and diagnostics.
+  AtomicInt32 num_buffers_in_reader_{0};
+
+  /// The number of scan ranges that have been completed for this reader.
+  AtomicInt32 num_finished_ranges_{0};
+
+  /// The number of scan ranges that required a remote read, updated at the end of each
+  /// range scan. Only used for diagnostics.
+  AtomicInt32 num_remote_ranges_{0};
+
+  /// The total number of scan ranges that have not been started. Only used for
+  /// diagnostics. This is the sum of all unstarted_scan_ranges across all disks.
+  AtomicInt32 num_unstarted_scan_ranges_{0};
+
+  /// Total number of file handle opens where the file handle was present in the cache
+  AtomicInt32 cached_file_handles_hit_count_{0};
+
+  /// Total number of file handle opens where the file handle was not in the cache
+  AtomicInt32 cached_file_handles_miss_count_{0};
+
+  /// The number of buffers that are being used for this reader. This is the sum
+  /// of all buffers in ScanRange queues and buffers currently being read into (i.e. about
+  /// to be queued). This includes both IOMgr-allocated buffers and client-provided
+  /// buffers.
+  AtomicInt32 num_used_buffers_{0};
+
+  /// The total number of ready buffers across all ranges.  Ready buffers are buffers
+  /// that have been read from disk but not retrieved by the caller.
+  /// This is the sum of all queued buffers in all ranges for this reader context.
+  AtomicInt32 num_ready_buffers_{0};
+
+  /// All fields below are accessed by multiple threads and the lock needs to be
+  /// taken before accessing them. Must be acquired before ScanRange::lock_ if both
+  /// are held simultaneously.
+  boost::mutex lock_;
+
+  /// Current state of the reader
+  State state_ = Active;
+
+  /// Status of this reader.  Set to non-ok if cancelled.
+  Status status_;
+
+  /// The number of disks with scan ranges remaining (i.e. the number of per-disk
+  /// states that are not done).
+  int num_disks_with_ranges_ = 0;
+
+  /// This is the list of ranges that are expected to be cached on the DN.
+  /// When the reader asks for a new range (GetNextScanRange()), we first
+  /// return ranges from this list.
+  InternalQueue<ScanRange> cached_ranges_;
+
+  /// A list of ranges that should be returned in subsequent calls to
+  /// GetNextRange.
+  /// There is a trade-off with when to populate this list.  Populating it on
+  /// demand means consumers need to wait (happens in DiskIoMgr::GetNextRange()).
+  /// Populating it preemptively means we make worse scheduling decisions.
+  /// We currently populate one range per disk.
+  /// TODO: think about this some more.
+  InternalQueue<ScanRange> ready_to_start_ranges_;
+  ConditionVariable ready_to_start_ranges_cv_; // used with lock_
+
+  /// Ranges that are blocked due to back pressure on outgoing buffers.
+  InternalQueue<ScanRange> blocked_ranges_;
+
+  /// Condition variable for UnregisterContext() to wait for all disks to complete
+  ConditionVariable disks_complete_cond_var_;
+
+  /// Struct containing state per disk. See comments in the disk read loop on how
+  /// they are used.
+  class PerDiskState {
+   public:
+    bool done() const { return done_; }
+    void set_done(bool b) { done_ = b; }
+
+    int num_remaining_ranges() const { return num_remaining_ranges_; }
+    int& num_remaining_ranges() { return num_remaining_ranges_; }
+
+    ScanRange* next_scan_range_to_start() { return next_scan_range_to_start_; }
+    void set_next_scan_range_to_start(ScanRange* range) {
+      next_scan_range_to_start_ = range;
+    }
+
+    /// We need to have a memory barrier to prevent this load from being reordered
+    /// with num_threads_in_op(), since these variables are set without the reader
+    /// lock taken.
+    bool is_on_queue() const {
+      bool b = is_on_queue_;
+      __sync_synchronize();
+      return b;
+    }
+
+    int num_threads_in_op() const {
+      int v = num_threads_in_op_.Load();
+      // TODO: determine whether this barrier is necessary for any callsites.
+      AtomicUtil::MemoryBarrier();
+      return v;
+    }
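+
+    /// (Editorial note: with C++11 atomics, the two accessors above could be
+    /// written as std::atomic loads; the explicit fences here emulate that
+    /// ordering.)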
+
+    const InternalQueue<ScanRange>* unstarted_scan_ranges() const {
+      return &unstarted_scan_ranges_;
+    }
+    const InternalQueue<WriteRange>* unstarted_write_ranges() const {
+      return &unstarted_write_ranges_;
+    }
+    const InternalQueue<RequestRange>* in_flight_ranges() const {
+      return &in_flight_ranges_;
+    }
+
+    InternalQueue<ScanRange>* unstarted_scan_ranges() { return &unstarted_scan_ranges_; }
+    InternalQueue<WriteRange>* unstarted_write_ranges() {
+      return &unstarted_write_ranges_;
+    }
+    InternalQueue<RequestRange>* in_flight_ranges() { return &in_flight_ranges_; }
+
+    /// Schedules the request context on this disk if it's not already on the queue.
+    /// Context lock must be taken before this.
+    void ScheduleContext(RequestContext* context, int disk_id);
+
+    /// Increment the ref count on the reader and mark it as dequeued. We need to
+    /// track the number of threads per reader per disk that are in the unlocked hdfs
+    /// read code section. This is updated by multiple threads without a lock so we
+    /// need to use an atomic int.
+    void IncrementRequestThreadAndDequeue() {
+      num_threads_in_op_.Add(1);
+      is_on_queue_ = false;
+    }
+
+    void DecrementRequestThread() { num_threads_in_op_.Add(-1); }
+
+    /// Decrement request thread count and do final cleanup if this is the last
+    /// thread. RequestContext lock must be taken before this.
+    void DecrementRequestThreadAndCheckDone(RequestContext* context) {
+      num_threads_in_op_.Add(-1); // Also acts as a barrier.
+      if (!is_on_queue_ && num_threads_in_op_.Load() == 0 && !done_) {
+        // This thread is the last one for this reader on this disk, do final cleanup
+        context->DecrementDiskRefCount();
+        done_ = true;
+      }
+    }
+
+   private:
+    /// If true, this disk is all done for this request context, including any cleanup.
+    /// If done is true, it means that this request must not be on this disk's queue
+    /// *AND* there are no threads currently working on this context. To satisfy
+    /// this, only the last thread (per disk) can set this to true.
+    bool done_ = true;
+
+    /// For each disk, keeps track if the context is on this disk's queue, indicating
+    /// the disk must do some work for this context. The disk needs to do work in 4 cases:
+    ///  1) in_flight_ranges is not empty, the disk needs to read for this reader.
+    ///  2) next_range_to_start is NULL, the disk needs to prepare a scan range to be
+    ///     read next.
+    ///  3) the reader has been cancelled and this disk needs to participate in the
+    ///     cleanup.
+    ///  4) A write range is added to queue.
+    /// In general, we only want to put a context on the disk queue if there is something
+    /// useful that can be done. If there's nothing useful, the disk queue will wake up
+    /// and then remove the reader from the queue. Doing this causes thrashing of the
+    /// threads.
+    bool is_on_queue_ = false;
+
+    /// For each disk, the number of request ranges that have not been fully read.
+    /// In the non-cancellation path, this will hit 0, and done will be set to true
+    /// by the disk thread. This is undefined in the cancellation path (the various
+    /// threads notice by looking at the RequestContext's state_).
+    int num_remaining_ranges_ = 0;
+
+    /// Queue of ranges that have not started being read.  This list is exclusive
+    /// with in_flight_ranges.
+    InternalQueue<ScanRange> unstarted_scan_ranges_;
+
+    /// Queue of pending IO requests for this disk in the order that they will be
+    /// processed. A ScanRange is added to this queue when it is returned in
+    /// GetNextRange(), or when it is added with schedule_immediately = true.
+    /// A WriteRange is added to this queue from unstarted_write_ranges_ for each
+    /// invocation of GetNextRequestRange() in WorkLoop().
+    /// The size of this queue is always less than or equal to num_remaining_ranges.
+    InternalQueue<RequestRange> in_flight_ranges_;
+
+    /// The next range to start for this reader on this disk. Each disk (for each reader)
+    /// picks the next range to start. The range is set here and also added to the
+    /// ready_to_start_ranges_ queue. The reader pulls from the queue in FIFO order,
+    /// so the ranges from different disks are round-robined. When the range is pulled
+    /// off the ready_to_start_ranges_ queue, it sets this variable to NULL, so the disk
+    /// knows to populate it again and add it to ready_to_start_ranges_, i.e. it is used
+    /// as a flag by DiskIoMgr::GetNextScanRange to determine if it needs to add another
+    /// range to ready_to_start_ranges_.
+    ScanRange* next_scan_range_to_start_ = nullptr;
+
+    /// For each disk, the number of threads issuing the underlying read/write on behalf
+    /// of this context. There are a few places where we release the context lock, do some
+    /// work, and then grab the lock again.  Because we don't hold the lock for the
+    /// entire operation, we need this ref count to keep track of which thread should do
+    /// final resource cleanup during cancellation.
+    /// Only the thread that sees the count at 0 should do the final cleanup.
+    AtomicInt32 num_threads_in_op_{0};
+
+    /// Queue of write ranges to process for this disk. A write range is always added
+    /// to in_flight_ranges_ in GetNextRequestRange(). There is a separate
+    /// unstarted_scan_ranges_ and unstarted_write_ranges_ to alternate between reads
+    /// and writes. (Otherwise, since next_scan_range_to_start is set
+    /// in GetNextRequestRange() whenever it is null, repeated calls to
+    /// GetNextRequestRange() and GetNextRange() may result in only reads being processed)
+    InternalQueue<WriteRange> unstarted_write_ranges_;
+  };
+
+  /// Per disk states to synchronize multiple disk threads accessing the same request
+  /// context.
+  std::vector<PerDiskState> disk_states_;
+};
+}
+}
+
+#endif

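PerDiskState above encodes a "last thread out does the cleanup" rule: each
disk thread bumps num_threads_in_op_ while it works, and only the thread that
observes the count back at zero (with the context off the queue and not yet
done) runs the final cleanup. A minimal, editorial sketch of that rule using
std::atomic in place of Impala's AtomicInt32, assuming, as the real method
does, that the context lock is held by the caller:

  #include <atomic>

  struct DiskState {
    std::atomic<int> num_threads_in_op{0};
    bool is_on_queue = false;
    bool done = false;

    // Called by each disk thread when it finishes an operation. Only the
    // thread that sees the count at zero performs the final cleanup.
    template <typename CleanupFn>
    void DecrementAndCheckDone(CleanupFn cleanup) {
      num_threads_in_op.fetch_sub(1);  // also acts as a barrier
      if (!is_on_queue && num_threads_in_op.load() == 0 && !done) {
        cleanup();
        done = true;
      }
    }
  };

  int main() {
    DiskState s;
    s.num_threads_in_op.fetch_add(1);
    s.DecrementAndCheckDone([] { /* e.g. context->DecrementDiskRefCount() */ });
    return s.done ? 0 : 1;
  }
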
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/request-ranges.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/request-ranges.h b/be/src/runtime/io/request-ranges.h
new file mode 100644
index 0000000..c1b3bbe
--- /dev/null
+++ b/be/src/runtime/io/request-ranges.h
@@ -0,0 +1,471 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_IO_REQUEST_RANGES_H
+#define IMPALA_RUNTIME_IO_REQUEST_RANGES_H
+
+#include <cstdint>
+#include <deque>
+
+#include <boost/thread/mutex.hpp>
+
+#include "common/hdfs.h"
+#include "common/status.h"
+#include "util/condition-variable.h"
+#include "util/internal-queue.h"
+
+namespace impala {
+class MemTracker;
+
+namespace io {
+class DiskIoMgr;
+class RequestContext;
+class HdfsFileHandle;
+class ScanRange;
+
+/// Buffer struct that is used by the caller and IoMgr to pass read buffers.
+/// It is expected that only one thread has ownership of this object at a
+/// time.
+class BufferDescriptor {
+ public:
+  ~BufferDescriptor() {
+    DCHECK(buffer_ == nullptr); // Check we didn't leak a buffer.
+  }
+
+  ScanRange* scan_range() { return scan_range_; }
+  uint8_t* buffer() { return buffer_; }
+  int64_t buffer_len() { return buffer_len_; }
+  int64_t len() { return len_; }
+  bool eosr() { return eosr_; }
+
+  /// Returns the offset within the scan range that this buffer starts at
+  int64_t scan_range_offset() const { return scan_range_offset_; }
+
+  /// Transfer ownership of buffer memory from 'mem_tracker_' to 'dst' and set
+  /// 'mem_tracker_' to 'dst'. 'mem_tracker_' and 'dst' must be non-NULL. Does not
+  /// check memory limits on 'dst': the caller should check the memory limit if a
+  /// different memory limit may apply to 'dst'. If the buffer was a client-provided
+  /// buffer, transferring is not allowed.
+  /// TODO: IMPALA-3209: revisit this as part of scanner memory usage revamp.
+  void TransferOwnership(MemTracker* dst);
+
+ private:
+  friend class DiskIoMgr;
+  friend class ScanRange;
+  friend class RequestContext;
+
+  /// Create a buffer descriptor for a new reader, range and data buffer. The buffer
+  /// memory should already be accounted against 'mem_tracker'.
+  BufferDescriptor(DiskIoMgr* io_mgr, RequestContext* reader,
+      ScanRange* scan_range, uint8_t* buffer, int64_t buffer_len,
+      MemTracker* mem_tracker);
+
+  /// Return true if this is a cached buffer owned by HDFS.
+  bool is_cached() const;
+
+  /// Return true if this is a buffer owned by the client that was provided when
+  /// constructing the scan range.
+  bool is_client_buffer() const;
+
+  DiskIoMgr* const io_mgr_;
+
+  /// Reader that this buffer is for.
+  RequestContext* const reader_;
+
+  /// The current tracker this buffer is associated with. After initialisation,
+  /// NULL for cached buffers and non-NULL for all other buffers.
+  MemTracker* mem_tracker_;
+
+  /// Scan range that this buffer is for. Non-NULL when initialised.
+  ScanRange* const scan_range_;
+
+  /// buffer with the read contents
+  uint8_t* buffer_;
+
+  /// length of buffer_. For buffers from cached reads, the length is 0.
+  const int64_t buffer_len_;
+
+  /// length of read contents
+  int64_t len_ = 0;
+
+  /// true if the current scan range is complete
+  bool eosr_ = false;
+
+  /// Status of the read to this buffer. If status is not ok, 'buffer' is nullptr.
+  Status status_;
+
+  int64_t scan_range_offset_ = 0;
+};
+
+/// The request type, read or write associated with a request range.
+struct RequestType {
+  enum type {
+    READ,
+    WRITE,
+  };
+};
+
+/// Represents a contiguous sequence of bytes in a single file.
+/// This is the common base class for read and write IO requests - ScanRange and
+/// WriteRange. Each disk thread processes exactly one RequestRange at a time.
+class RequestRange : public InternalQueue<RequestRange>::Node {
+ public:
+  hdfsFS fs() const { return fs_; }
+  const char* file() const { return file_.c_str(); }
+  std::string* file_string() { return &file_; }
+  int64_t offset() const { return offset_; }
+  int64_t len() const { return len_; }
+  int disk_id() const { return disk_id_; }
+  RequestType::type request_type() const { return request_type_; }
+
+ protected:
+  RequestRange(RequestType::type request_type)
+    : fs_(nullptr), offset_(-1), len_(-1), disk_id_(-1), request_type_(request_type) {}
+
+  /// Hadoop filesystem that contains file_, or set to nullptr for local filesystem.
+  hdfsFS fs_;
+
+  /// Path to file being read or written.
+  std::string file_;
+
+  /// Offset within file_ being read or written.
+  int64_t offset_;
+
+  /// Length of data read or written.
+  int64_t len_;
+
+  /// Id of disk containing byte range.
+  int disk_id_;
+
+  /// The type of IO request, READ or WRITE.
+  RequestType::type request_type_;
+};
+
+/// Param struct for different combinations of buffering.
+struct BufferOpts {
+ public:
+  /// Set options for a read into an IoMgr-allocated or HDFS-cached buffer. Caching is
+  /// enabled if 'try_cache' is true, the file is in the HDFS cache and 'mtime' matches
+  /// the modified time of the cached file in the HDFS cache.
+  BufferOpts(bool try_cache, int64_t mtime)
+    : try_cache_(try_cache),
+      mtime_(mtime),
+      client_buffer_(nullptr),
+      client_buffer_len_(-1) {}
+
+  /// Set options for an uncached read into an IoMgr-allocated buffer.
+  static BufferOpts Uncached() {
+    return BufferOpts(false, NEVER_CACHE, nullptr, -1);
+  }
+
+  /// Set options to read the entire scan range into 'client_buffer'. The length of the
+  /// buffer, 'client_buffer_len', must fit the entire scan range. HDFS caching is not
+  /// enabled in this case.
+  static BufferOpts ReadInto(uint8_t* client_buffer, int64_t client_buffer_len) {
+    return BufferOpts(false, NEVER_CACHE, client_buffer, client_buffer_len);
+  }
+
+ private:
+  friend class ScanRange;
+
+  BufferOpts(
+      bool try_cache, int64_t mtime, uint8_t* client_buffer, int64_t client_buffer_len)
+    : try_cache_(try_cache),
+      mtime_(mtime),
+      client_buffer_(client_buffer),
+      client_buffer_len_(client_buffer_len) {}
+
+  /// If 'mtime_' is set to NEVER_CACHE, the file handle will never be cached, because
+  /// the modification time won't match.
+  const static int64_t NEVER_CACHE = -1;
+
+  /// If true, read from HDFS cache if possible.
+  const bool try_cache_;
+
+  /// Last modified time of the file associated with the scan range. If set to
+  /// NEVER_CACHE, caching is disabled.
+  const int64_t mtime_;
+
+  /// A destination buffer provided by the client, nullptr and -1 if no buffer.
+  uint8_t* const client_buffer_;
+  const int64_t client_buffer_len_;
+};
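+
+/// Example (editorial illustration, not part of the patch): a range that may
+/// be served from the HDFS cache vs. one that always uses an IoMgr-allocated
+/// buffer:
+///   range.Reset(fs, path, len, offset, disk_id, /*expected_local=*/true,
+///       BufferOpts(/*try_cache=*/true, mtime));
+///   range.Reset(fs, path, len, offset, disk_id, /*expected_local=*/false,
+///       BufferOpts::Uncached());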
+
+/// ScanRange description. The caller must call Reset() to initialize the fields
+/// before calling AddScanRanges(). The private fields are used internally by
+/// the IoMgr.
+class ScanRange : public RequestRange {
+ public:
+  ScanRange();
+
+  virtual ~ScanRange();
+
+  /// Resets this scan range object with the scan range description. The scan range
+  /// is for bytes [offset, offset + len) in 'file' on 'fs' (which is nullptr for the
+  /// local filesystem). The scan range must fall within the file bounds (offset >= 0
+  /// and offset + len <= file_length). 'disk_id' is the disk queue to add the range
+  /// to. If 'expected_local' is true, a warning is generated if the read did not
+  /// come from a local disk. 'buffer_opts' specifies buffer management options -
+  /// see the DiskIoMgr class comment and the BufferOpts comments for details.
+  /// 'meta_data' is an arbitrary client-provided pointer for any auxiliary data.
+  void Reset(hdfsFS fs, const char* file, int64_t len, int64_t offset, int disk_id,
+      bool expected_local, const BufferOpts& buffer_opts, void* meta_data = nullptr);
+
+  void* meta_data() const { return meta_data_; }
+  bool try_cache() const { return try_cache_; }
+  bool expected_local() const { return expected_local_; }
+
+  /// Returns the next buffer for this scan range. 'buffer' is an output parameter.
+  /// This function blocks until a buffer is ready or an error occurs. If this is
+  /// called when all buffers have been returned, *buffer is set to nullptr and Status::OK
+  /// is returned.
+  /// Only one thread can be in GetNext() at any time.
+  Status GetNext(std::unique_ptr<BufferDescriptor>* buffer) WARN_UNUSED_RESULT;
+
+  /// Cancel this scan range. This cleans up all queued buffers and
+  /// wakes up any threads blocked on GetNext().
+  /// Status is the reason the range was cancelled. Must not be ok().
+  /// Status is returned to the user in GetNext().
+  void Cancel(const Status& status);
+
+  /// return a descriptive string for debug.
+  std::string DebugString() const;
+
+  int64_t mtime() const { return mtime_; }
+
+ private:
+  friend class BufferDescriptor;
+  friend class DiskIoMgr;
+  friend class RequestContext;
+
+  /// Initialize internal fields
+  void InitInternal(DiskIoMgr* io_mgr, RequestContext* reader);
+
+  /// Enqueues a buffer for this range. This does not block.
+  /// Returns true if this scan range has hit the queue capacity, false otherwise.
+  /// The caller passes ownership of buffer to the scan range and it is not
+  /// valid to access buffer after this call. The reader lock must be held by the
+  /// caller.
+  bool EnqueueBuffer(const boost::unique_lock<boost::mutex>& reader_lock,
+      std::unique_ptr<BufferDescriptor> buffer);
+
+  /// Cleanup any queued buffers (i.e. due to cancellation). This cannot
+  /// be called with any locks taken.
+  void CleanupQueuedBuffers();
+
+  /// Validates the internal state of this range. lock_ must be taken
+  /// before calling this.
+  bool Validate();
+
+  /// Maximum length in bytes for hdfsRead() calls.
+  int64_t MaxReadChunkSize() const;
+
+  /// Opens the file for this range. This function only modifies state in this range.
+  /// If 'use_file_handle_cache' is true and this is a local hdfs file, then this scan
+  /// range will not maintain an exclusive file handle. It will borrow an hdfs file
+  /// handle from the file handle cache for each Read(), so Open() does nothing.
+  /// If 'use_file_handle_cache' is false or this is a remote hdfs file or this is
+  /// a local OS file, Open() will maintain a file handle on the scan range for
+  /// exclusive use by this scan range. An exclusive hdfs file handle still comes
+  /// from the cache, but it is a newly opened file handle that is held for the
+  /// entire duration of a scan range's lifetime and destroyed in Close().
+  /// All local OS files are opened using normal OS file APIs.
+  Status Open(bool use_file_handle_cache) WARN_UNUSED_RESULT;
+
+  /// Closes the file for this range. This function only modifies state in this range.
+  void Close();
+
+  /// Reads from this range into 'buffer', which has length 'buffer_len' bytes. Returns
+  /// the number of bytes read. The read position in this scan range is updated.
+  Status Read(uint8_t* buffer, int64_t buffer_len, int64_t* bytes_read,
+      bool* eosr) WARN_UNUSED_RESULT;
+
+  /// Get the read statistics from the Hdfs file handle and aggregate them to
+  /// the RequestContext. This clears the statistics on this file handle.
+  /// It is safe to pass hdfsFile by value, as hdfsFile's underlying type is a
+  /// pointer.
+  void GetHdfsStatistics(hdfsFile fh);
+
+  /// Reads from the DN cache. On success, sets cached_buffer_ to the DN buffer
+  /// and *read_succeeded to true.
+  /// If the data is not cached, returns ok() and *read_succeeded is set to false.
+  /// Returns a non-ok status if it ran into a non-continuable error.
+  ///  The reader lock must be held by the caller.
+  Status ReadFromCache(const boost::unique_lock<boost::mutex>& reader_lock,
+      bool* read_succeeded) WARN_UNUSED_RESULT;
+
+  /// Pointer to caller specified metadata. This is untouched by the io manager
+  /// and the caller can put whatever auxiliary data in here.
+  void* meta_data_ = nullptr;
+
+  /// If true, this scan range is expected to be cached. Note that this might be wrong
+  /// since the block could have been uncached. In that case, the cached path
+  /// will fail and we'll just put the scan range on the normal read path.
+  bool try_cache_ = false;
+
+  /// If true, we expect this scan range to be a local read. Note that if this is false,
+  /// it does not necessarily mean we expect the read to be remote. Note also that we never
+  /// create scan ranges where some of the range is expected to be remote and some of it
+  /// local.
+  /// TODO: we can do more with this
+  bool expected_local_ = false;
+
+  /// Total number of bytes read remotely. This is necessary to maintain a count of
+  /// the number of remote scan ranges. Since IO statistics can be collected multiple
+  /// times for a scan range, it is necessary to keep some state about whether this
+  /// scan range has already been counted as remote. There is also a requirement to
+  /// log the number of unexpected remote bytes for a scan range. To solve both
+  /// requirements, maintain num_remote_bytes_ on the ScanRange and push it to the
+  /// reader_ once at the close of the scan range.
+  int64_t num_remote_bytes_;
+
+  DiskIoMgr* io_mgr_ = nullptr;
+
+  /// Reader/owner of the scan range
+  RequestContext* reader_ = nullptr;
+
+  /// File handle either to hdfs or local fs (FILE*)
+  /// The hdfs file handle is only stored here in three cases:
+  /// 1. The file handle cache is off (max_cached_file_handles == 0).
+  /// 2. The scan range is using hdfs caching.
+  /// 3. The hdfs file is expected to be remote (expected_local_ == false).
+  /// In each case, the scan range gets a new file handle from the file handle cache
+  /// at Open(), holds it exclusively, and destroys it in Close().
+  union {
+    FILE* local_file_ = nullptr;
+    HdfsFileHandle* exclusive_hdfs_fh_;
+  };
+
+  /// Tagged union that holds a buffer for the cases when there is a buffer allocated
+  /// externally from DiskIoMgr that is associated with the scan range.
+  enum class ExternalBufferTag { CLIENT_BUFFER, CACHED_BUFFER, NO_BUFFER };
+  ExternalBufferTag external_buffer_tag_;
+  union {
+    /// Valid if the 'external_buffer_tag_' is CLIENT_BUFFER.
+    struct {
+      /// Client-provided buffer to read the whole scan range into.
+      uint8_t* data;
+
+      /// Length of the client-provided buffer.
+      int64_t len;
+    } client_buffer_;
+
+    /// Valid and non-NULL if the external_buffer_tag_ is CACHED_BUFFER, which means
+    /// that a cached read succeeded and all the bytes for the range are in this buffer.
+    struct hadoopRzBuffer* cached_buffer_ = nullptr;
+  };
+
+  /// Lock protecting fields below.
+  /// This lock should not be taken during Open()/Read()/Close().
+  /// If RequestContext::lock_ and this lock need to be held simultaneously,
+  /// RequestContext::lock_ must be taken first.
+  boost::mutex lock_;
+
+  /// Number of bytes read so far for this scan range
+  int bytes_read_;
+
+  /// Status for this range. This is non-ok if is_cancelled_ is true.
+  /// Note: an individual range can fail without the RequestContext being
+  /// cancelled. This allows us to skip individual ranges.
+  Status status_;
+
+  /// If true, the last buffer for this scan range has been queued.
+  bool eosr_queued_ = false;
+
+  /// If true, the last buffer for this scan range has been returned.
+  bool eosr_returned_ = false;
+
+  /// If true, this scan range has been removed from the reader's in_flight_ranges
+  /// queue because the ready_buffers_ queue is full.
+  bool blocked_on_queue_ = false;
+
+  /// IO buffers that are queued for this scan range.
+  /// Condition variable for GetNext
+  ConditionVariable buffer_ready_cv_;
+  std::deque<std::unique_ptr<BufferDescriptor>> ready_buffers_;
+
+  /// Lock that should be taken during hdfs calls. Only one thread (the disk reading
+  /// thread) calls into hdfs at a time so this lock does not have performance impact.
+  /// This lock only serves to coordinate cleanup. Specifically it serves to ensure
+  /// that the disk threads are finished with HDFS calls before is_cancelled_ is set
+  /// to true and cleanup starts.
+  /// If this lock and lock_ need to be taken, lock_ must be taken first.
+  boost::mutex hdfs_lock_;
+
+  /// If true, this scan range has been cancelled.
+  bool is_cancelled_ = false;
+
+  /// Last modified time of the file associated with the scan range
+  int64_t mtime_;
+};
+
+/// Used to specify data to be written to a file and offset.
+/// It is the responsibility of the client to ensure that the data to be written is
+/// valid and that the file to be written to exists until the callback is invoked.
+/// A callback is invoked to inform the client when the write is done.
+class WriteRange : public RequestRange {
+ public:
+  /// This callback is invoked on each WriteRange after the write is complete or the
+  /// context is cancelled. The status passed to the callback indicates whether
+  /// the write was successful (i.e. Status::OK), there was an error
+  /// (i.e. TStatusCode::RUNTIME_ERROR), or the context was cancelled
+  /// (TStatusCode::CANCELLED). The callback is only invoked if this WriteRange was
+  /// successfully added (i.e. AddWriteRange() succeeded). No locks are held while
+  /// the callback is invoked.
+  typedef std::function<void(const Status&)> WriteDoneCallback;
+  WriteRange(const std::string& file, int64_t file_offset, int disk_id,
+      WriteDoneCallback callback);
+
+  /// Change the file and offset of this write range. Data and callbacks are unchanged.
+  /// Can only be called when the write is not in flight (i.e. before AddWriteRange()
+  /// is called or after the write callback was called).
+  void SetRange(const std::string& file, int64_t file_offset, int disk_id);
+
+  /// Set the data and number of bytes to be written for this WriteRange.
+  /// Can only be called when the write is not in flight (i.e. before AddWriteRange()
+  /// is called or after the write callback was called).
+  void SetData(const uint8_t* buffer, int64_t len);
+
+  const uint8_t* data() const { return data_; }
+
+ private:
+  friend class DiskIoMgr;
+  friend class RequestContext;
+  friend class ScanRange;
+
+  /// Data to be written. RequestRange::len_ contains the length of data
+  /// to be written.
+  const uint8_t* data_;
+
+  /// Callback to invoke after the write is complete.
+  WriteDoneCallback callback_;
+};
+
+inline bool BufferDescriptor::is_cached() const {
+  return scan_range_->external_buffer_tag_
+      == ScanRange::ExternalBufferTag::CACHED_BUFFER;
+}
+
+inline bool BufferDescriptor::is_client_buffer() const {
+  return scan_range_->external_buffer_tag_
+      == ScanRange::ExternalBufferTag::CLIENT_BUFFER;
+}
+}
+}
+
+#endif

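Given the API declared above, a client drains a scan range by looping on
GetNext() until it receives an eosr buffer (or a null buffer once everything
has already been returned). An editorial sketch, assuming the surrounding
Impala headers; ConsumeRange() and ProcessBytes() are hypothetical:

  Status ConsumeRange(DiskIoMgr* io_mgr, ScanRange* range) {
    while (true) {
      std::unique_ptr<BufferDescriptor> buffer;
      RETURN_IF_ERROR(range->GetNext(&buffer));  // blocks for data or error
      if (buffer == nullptr) break;              // all buffers already returned
      ProcessBytes(buffer->buffer(), buffer->len());
      bool eosr = buffer->eosr();
      io_mgr->ReturnBuffer(std::move(buffer));   // hand memory back to the IoMgr
      if (eosr) break;                           // scan range fully consumed
    }
    return Status::OK();
  }
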
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/io/scan-range.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/io/scan-range.cc b/be/src/runtime/io/scan-range.cc
new file mode 100644
index 0000000..b7655a8
--- /dev/null
+++ b/be/src/runtime/io/scan-range.cc
@@ -0,0 +1,593 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/io/disk-io-mgr.h"
+#include "runtime/io/disk-io-mgr-internal.h"
+#include "util/error-util.h"
+#include "util/hdfs-util.h"
+
+#include "common/names.h"
+
+using namespace impala;
+using namespace impala::io;
+
+DEFINE_bool(use_hdfs_pread, false, "Enables using hdfsPread() instead of hdfsRead() "
+    "when performing HDFS read operations. This is necessary to use HDFS hedged reads "
+    "(assuming the HDFS client is configured to do so).");
+
+// TODO: Run perf tests and empirically settle on the optimal default value for the
+// read buffer size. Currently setting it to 128k for the same reason as for S3, i.e.
+// due to JNI array allocation and memcpy overhead, 128k was empirically found to have
+// the least overhead.
+DEFINE_int64(adls_read_chunk_size, 128 * 1024, "The maximum read chunk size to use when "
+    "reading from ADLS.");
+
+// Implementation of the ScanRange functionality. Each ScanRange contains a queue
+// of ready buffers. For each ScanRange, there is only a single producer and
+// consumer thread, i.e. only one disk thread will push to a scan range at
+// any time and only one thread will remove from the queue. This is to guarantee
+// that buffers are queued and read in file order.
+
+bool ScanRange::EnqueueBuffer(
+    const unique_lock<mutex>& reader_lock, unique_ptr<BufferDescriptor> buffer) {
+  DCHECK(reader_lock.mutex() == &reader_->lock_ && reader_lock.owns_lock());
+  {
+    unique_lock<mutex> scan_range_lock(lock_);
+    DCHECK(Validate()) << DebugString();
+    DCHECK(!eosr_returned_);
+    DCHECK(!eosr_queued_);
+    if (is_cancelled_) {
+      // Return the buffer, this range has been cancelled
+      if (buffer->buffer_ != nullptr) {
+        io_mgr_->num_buffers_in_readers_.Add(1);
+        reader_->num_buffers_in_reader_.Add(1);
+      }
+      reader_->num_used_buffers_.Add(-1);
+      io_mgr_->ReturnBuffer(move(buffer));
+      return false;
+    }
+    reader_->num_ready_buffers_.Add(1);
+    eosr_queued_ = buffer->eosr();
+    ready_buffers_.emplace_back(move(buffer));
+
+    DCHECK_LE(ready_buffers_.size(), DiskIoMgr::SCAN_RANGE_READY_BUFFER_LIMIT);
+    blocked_on_queue_ = ready_buffers_.size() == DiskIoMgr::SCAN_RANGE_READY_BUFFER_LIMIT;
+  }
+
+  buffer_ready_cv_.NotifyOne();
+
+  return blocked_on_queue_;
+}
+
+Status ScanRange::GetNext(unique_ptr<BufferDescriptor>* buffer) {
+  DCHECK(*buffer == nullptr);
+  bool eosr;
+  {
+    unique_lock<mutex> scan_range_lock(lock_);
+    if (eosr_returned_) return Status::OK();
+    DCHECK(Validate()) << DebugString();
+
+    while (ready_buffers_.empty() && !is_cancelled_) {
+      buffer_ready_cv_.Wait(scan_range_lock);
+    }
+
+    if (is_cancelled_) {
+      DCHECK(!status_.ok());
+      return status_;
+    }
+
+    // Remove the first ready buffer from the queue and return it
+    DCHECK(!ready_buffers_.empty());
+    DCHECK_LE(ready_buffers_.size(), DiskIoMgr::SCAN_RANGE_READY_BUFFER_LIMIT);
+    *buffer = move(ready_buffers_.front());
+    ready_buffers_.pop_front();
+    eosr_returned_ = (*buffer)->eosr();
+    eosr = (*buffer)->eosr();
+  }
+
+  // Update tracking counters. The buffer has now moved from the IoMgr to the
+  // caller.
+  io_mgr_->num_buffers_in_readers_.Add(1);
+  reader_->num_buffers_in_reader_.Add(1);
+  reader_->num_ready_buffers_.Add(-1);
+  reader_->num_used_buffers_.Add(-1);
+  if (eosr) reader_->num_finished_ranges_.Add(1);
+
+  Status status = (*buffer)->status_;
+  if (!status.ok()) {
+    io_mgr_->ReturnBuffer(move(*buffer));
+    return status;
+  }
+
+  unique_lock<mutex> reader_lock(reader_->lock_);
+
+  DCHECK(reader_->Validate()) << endl << reader_->DebugString();
+  if (reader_->state_ == RequestContext::Cancelled) {
+    reader_->blocked_ranges_.Remove(this);
+    Cancel(reader_->status_);
+    io_mgr_->ReturnBuffer(move(*buffer));
+    return status_;
+  }
+
+  {
+    // Check to see if we can re-schedule a blocked range. Note that EnqueueBuffer()
+    // may have been called after we released 'lock_' above so we need to re-check
+    // whether the queue is full.
+    unique_lock<mutex> scan_range_lock(lock_);
+    if (blocked_on_queue_
+        && ready_buffers_.size() < DiskIoMgr::SCAN_RANGE_READY_BUFFER_LIMIT
+        && !eosr_queued_) {
+      blocked_on_queue_ = false;
+      // This scan range was blocked and is no longer, add it to the reader
+      // queue again.
+      reader_->blocked_ranges_.Remove(this);
+      reader_->ScheduleScanRange(this);
+    }
+  }
+  return Status::OK();
+}
+
+void ScanRange::Cancel(const Status& status) {
+  // Ignore cancellation of a range that was never started.
+  if (io_mgr_ == nullptr) return;
+
+  DCHECK(!status.ok());
+  {
+    // Grab both locks to make sure that all working threads see is_cancelled_.
+    unique_lock<mutex> scan_range_lock(lock_);
+    unique_lock<mutex> hdfs_lock(hdfs_lock_);
+    DCHECK(Validate()) << DebugString();
+    if (is_cancelled_) return;
+    is_cancelled_ = true;
+    status_ = status;
+  }
+  buffer_ready_cv_.NotifyAll();
+  CleanupQueuedBuffers();
+
+  // For cached buffers, we can't close the range until the cached buffer is returned.
+  // Close() is called from DiskIoMgr::ReturnBuffer().
+  if (external_buffer_tag_ != ExternalBufferTag::CACHED_BUFFER) Close();
+}
+
+void ScanRange::CleanupQueuedBuffers() {
+  DCHECK(is_cancelled_);
+  io_mgr_->num_buffers_in_readers_.Add(ready_buffers_.size());
+  reader_->num_buffers_in_reader_.Add(ready_buffers_.size());
+  reader_->num_used_buffers_.Add(-ready_buffers_.size());
+  reader_->num_ready_buffers_.Add(-ready_buffers_.size());
+
+  while (!ready_buffers_.empty()) {
+    io_mgr_->ReturnBuffer(move(ready_buffers_.front()));
+    ready_buffers_.pop_front();
+  }
+}
+
+string ScanRange::DebugString() const {
+  stringstream ss;
+  ss << "file=" << file_ << " disk_id=" << disk_id_ << " offset=" << offset_
+     << " len=" << len_ << " bytes_read=" << bytes_read_
+     << " buffer_queue=" << ready_buffers_.size()
+     << " hdfs_file=" << exclusive_hdfs_fh_;
+  return ss.str();
+}
+
+bool ScanRange::Validate() {
+  if (bytes_read_ > len_) {
+    LOG(WARNING) << "Bytes read tracking is wrong. Shouldn't read past the scan range."
+                 << " bytes_read_=" << bytes_read_ << " len_=" << len_;
+    return false;
+  }
+  if (eosr_returned_ && !eosr_queued_) {
+    LOG(WARNING) << "Returned eosr to reader before finishing reading the scan range"
+                 << " eosr_returned_=" << eosr_returned_
+                 << " eosr_queued_=" << eosr_queued_;
+    return false;
+  }
+  return true;
+}
+
+ScanRange::ScanRange()
+  : RequestRange(RequestType::READ),
+    num_remote_bytes_(0),
+    external_buffer_tag_(ExternalBufferTag::NO_BUFFER),
+    mtime_(-1) {}
+
+ScanRange::~ScanRange() {
+  DCHECK(exclusive_hdfs_fh_ == nullptr) << "File was not closed.";
+  DCHECK(external_buffer_tag_ != ExternalBufferTag::CACHED_BUFFER)
+      << "Cached buffer was not released.";
+}
+
+void ScanRange::Reset(hdfsFS fs, const char* file, int64_t len, int64_t offset,
+    int disk_id, bool expected_local, const BufferOpts& buffer_opts, void* meta_data) {
+  DCHECK(ready_buffers_.empty());
+  DCHECK(file != nullptr);
+  DCHECK_GE(len, 0);
+  DCHECK_GE(offset, 0);
+  // Compare against the new 'len' argument; the 'len_' member is stale here.
+  DCHECK(buffer_opts.client_buffer_ == nullptr ||
+         buffer_opts.client_buffer_len_ >= len);
+  fs_ = fs;
+  file_ = file;
+  len_ = len;
+  offset_ = offset;
+  disk_id_ = disk_id;
+  try_cache_ = buffer_opts.try_cache_;
+  mtime_ = buffer_opts.mtime_;
+  expected_local_ = expected_local;
+  num_remote_bytes_ = 0;
+  meta_data_ = meta_data;
+  if (buffer_opts.client_buffer_ != nullptr) {
+    external_buffer_tag_ = ExternalBufferTag::CLIENT_BUFFER;
+    client_buffer_.data = buffer_opts.client_buffer_;
+    client_buffer_.len = buffer_opts.client_buffer_len_;
+  } else {
+    external_buffer_tag_ = ExternalBufferTag::NO_BUFFER;
+  }
+  io_mgr_ = nullptr;
+  reader_ = nullptr;
+  exclusive_hdfs_fh_ = nullptr;
+}
+
+void ScanRange::InitInternal(DiskIoMgr* io_mgr, RequestContext* reader) {
+  DCHECK(exclusive_hdfs_fh_ == nullptr);
+  DCHECK(local_file_ == nullptr);
+  // The reader must provide a MemTracker or a client buffer.
+  DCHECK(external_buffer_tag_ == ExternalBufferTag::CLIENT_BUFFER
+      || reader->mem_tracker_ != nullptr);
+  io_mgr_ = io_mgr;
+  reader_ = reader;
+  local_file_ = nullptr;
+  exclusive_hdfs_fh_ = nullptr;
+  bytes_read_ = 0;
+  is_cancelled_ = false;
+  eosr_queued_ = false;
+  eosr_returned_ = false;
+  blocked_on_queue_ = false;
+  DCHECK(Validate()) << DebugString();
+}
+
+Status ScanRange::Open(bool use_file_handle_cache) {
+  unique_lock<mutex> hdfs_lock(hdfs_lock_);
+  if (is_cancelled_) return Status::CANCELLED;
+
+  if (fs_ != nullptr) {
+    if (exclusive_hdfs_fh_ != nullptr) return Status::OK();
+    // With file handle caching, the scan range does not maintain its own
+    // hdfs file handle. File handle caching is only used for local files,
+    // so S3 and other remote filesystems should obtain an exclusive file handle
+    // for each scan range.
+    if (use_file_handle_cache && expected_local_) return Status::OK();
+    // Get a new exclusive file handle.
+    exclusive_hdfs_fh_ = io_mgr_->GetCachedHdfsFileHandle(fs_, file_string(),
+        mtime(), reader_, true);
+    if (exclusive_hdfs_fh_ == nullptr) {
+      return Status(TErrorCode::DISK_IO_ERROR,
+          GetHdfsErrorMsg("Failed to open HDFS file ", file_));
+    }
+
+    if (hdfsSeek(fs_, exclusive_hdfs_fh_->file(), offset_) != 0) {
+      // Destroy the file handle and remove it from the cache.
+      io_mgr_->ReleaseCachedHdfsFileHandle(file_string(), exclusive_hdfs_fh_, true);
+      exclusive_hdfs_fh_ = nullptr;
+      return Status(TErrorCode::DISK_IO_ERROR,
+          Substitute("Error seeking to $0 in file: $1 $2", offset_, file_,
+          GetHdfsErrorMsg("")));
+    }
+  } else {
+    if (local_file_ != nullptr) return Status::OK();
+
+    local_file_ = fopen(file(), "r");
+    if (local_file_ == nullptr) {
+      return Status(TErrorCode::DISK_IO_ERROR, Substitute("Could not open file: $0: $1",
+            file_, GetStrErrMsg()));
+    }
+    if (fseek(local_file_, offset_, SEEK_SET) == -1) {
+      fclose(local_file_);
+      local_file_ = nullptr;
+      return Status(TErrorCode::DISK_IO_ERROR, Substitute("Could not seek to $0 "
+          "for file: $1: $2", offset_, file_, GetStrErrMsg()));
+    }
+  }
+  if (ImpaladMetrics::IO_MGR_NUM_OPEN_FILES != nullptr) {
+    ImpaladMetrics::IO_MGR_NUM_OPEN_FILES->Increment(1L);
+  }
+  return Status::OK();
+}
+
+void ScanRange::Close() {
+  unique_lock<mutex> hdfs_lock(hdfs_lock_);
+  bool closed_file = false;
+  if (fs_ != nullptr) {
+    if (exclusive_hdfs_fh_ != nullptr) {
+      GetHdfsStatistics(exclusive_hdfs_fh_->file());
+
+      if (external_buffer_tag_ == ExternalBufferTag::CACHED_BUFFER) {
+        hadoopRzBufferFree(exclusive_hdfs_fh_->file(), cached_buffer_);
+        cached_buffer_ = nullptr;
+        external_buffer_tag_ = ExternalBufferTag::NO_BUFFER;
+      }
+
+      // Destroy the file handle and remove it from the cache.
+      io_mgr_->ReleaseCachedHdfsFileHandle(file_string(), exclusive_hdfs_fh_, true);
+      exclusive_hdfs_fh_ = nullptr;
+      closed_file = true;
+    }
+
+    if (FLAGS_use_hdfs_pread) {
+      // Update Hedged Read Metrics.
+      // We call it only if the --use_hdfs_pread flag is set, to avoid having the
+      // libhdfs client malloc and free an hdfsHedgedReadMetrics object unnecessarily
+      // otherwise. 'hedged_metrics' is only set upon success.
+      struct hdfsHedgedReadMetrics* hedged_metrics;
+      int success = hdfsGetHedgedReadMetrics(fs_, &hedged_metrics);
+      if (success == 0) {
+        ImpaladMetrics::HEDGED_READ_OPS->set_value(hedged_metrics->hedgedReadOps);
+        ImpaladMetrics::HEDGED_READ_OPS_WIN->set_value(hedged_metrics->hedgedReadOpsWin);
+        hdfsFreeHedgedReadMetrics(hedged_metrics);
+      }
+    }
+
+    if (num_remote_bytes_ > 0) {
+      reader_->num_remote_ranges_.Add(1L);
+      if (expected_local_) {
+        reader_->unexpected_remote_bytes_.Add(num_remote_bytes_);
+        VLOG_FILE << "Unexpected remote HDFS read of "
+                  << PrettyPrinter::Print(num_remote_bytes_, TUnit::BYTES)
+                  << " for file '" << file_ << "'";
+      }
+    }
+  } else {
+    if (local_file_ == nullptr) return;
+    fclose(local_file_);
+    local_file_ = nullptr;
+    closed_file = true;
+  }
+  if (closed_file && ImpaladMetrics::IO_MGR_NUM_OPEN_FILES != nullptr) {
+    ImpaladMetrics::IO_MGR_NUM_OPEN_FILES->Increment(-1L);
+  }
+}
+
+int64_t ScanRange::MaxReadChunkSize() const {
+  // S3 InputStreams don't support DIRECT_READ (i.e. java.nio.ByteBuffer read()
+  // interface).  So, hdfsRead() needs to allocate a Java byte[] and copy the data out.
+  // Profiles show that both the JNI array allocation and the memcpy add much more
+  // overhead for larger buffers, so limit the size of each read request.  128K was
+  // chosen empirically by trying values between 4K and 8M and optimizing for lower CPU
+  // utilization and higher S3 throughput.
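+  // For example, with this limit a 1MB scan range against S3 is issued as eight
+  // sequential 128KB hdfsRead() calls within a single Read() invocation.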
+  if (disk_id_ == io_mgr_->RemoteS3DiskId()) {
+    DCHECK(IsS3APath(file()));
+    return 128 * 1024;
+  }
+  if (disk_id_ == io_mgr_->RemoteAdlsDiskId()) {
+    DCHECK(IsADLSPath(file()));
+    return FLAGS_adls_read_chunk_size;
+  }
+  // The length argument of hdfsRead() is an int. Ensure we don't overflow it.
+  return numeric_limits<int>::max();
+}
+
+// TODO: How do we best use the disk here? E.g. is it good to break up a
+// 1MB read into 8 128KB reads?
+// TODO: Look at Linux disk scheduling.
+Status ScanRange::Read(
+    uint8_t* buffer, int64_t buffer_len, int64_t* bytes_read, bool* eosr) {
+  unique_lock<mutex> hdfs_lock(hdfs_lock_);
+  if (is_cancelled_) return Status::CANCELLED;
+
+  *eosr = false;
+  *bytes_read = 0;
+  // Read until the end of the scan range or the end of the buffer.
+  int bytes_to_read = min(len_ - bytes_read_, buffer_len);
+  DCHECK_GE(bytes_to_read, 0);
+
+  if (fs_ != nullptr) {
+    HdfsFileHandle* borrowed_hdfs_fh = nullptr;
+    hdfsFile hdfs_file;
+
+    // If the scan range has an exclusive file handle, use it. Otherwise, borrow
+    // a file handle from the cache.
+    if (exclusive_hdfs_fh_ != nullptr) {
+      hdfs_file = exclusive_hdfs_fh_->file();
+    } else {
+      borrowed_hdfs_fh = io_mgr_->GetCachedHdfsFileHandle(fs_, file_string(),
+          mtime(), reader_, false);
+      if (borrowed_hdfs_fh == nullptr) {
+        return Status(TErrorCode::DISK_IO_ERROR,
+            GetHdfsErrorMsg("Failed to open HDFS file ", file_));
+      }
+      hdfs_file = borrowed_hdfs_fh->file();
+    }
+
+    int64_t max_chunk_size = MaxReadChunkSize();
+    Status status = Status::OK();
+    while (*bytes_read < bytes_to_read) {
+      int chunk_size = min(bytes_to_read - *bytes_read, max_chunk_size);
+      DCHECK_GE(chunk_size, 0);
+      // The hdfsRead() length argument is an int.
+      DCHECK_LE(chunk_size, numeric_limits<int>::max());
+      int current_bytes_read = -1;
+      // bytes_read_ is only updated after the while loop
+      int64_t position_in_file = offset_ + bytes_read_ + *bytes_read;
+      int num_retries = 0;
+      while (true) {
+        status = Status::OK();
+        // For file handles from the cache, any of the below file operations may fail
+        // due to a bad file handle. In each case, record the error, but allow for a
+        // retry to fix it.
+        if (FLAGS_use_hdfs_pread) {
+          current_bytes_read = hdfsPread(fs_, hdfs_file, position_in_file,
+              buffer + *bytes_read, chunk_size);
+          if (current_bytes_read == -1) {
+            status = Status(TErrorCode::DISK_IO_ERROR,
+                GetHdfsErrorMsg("Error reading from HDFS file: ", file_));
+          }
+        } else {
+          // If the file handle is borrowed, it may not be at the appropriate
+          // location. Seek to the appropriate location.
+          bool seek_failed = false;
+          if (borrowed_hdfs_fh != nullptr) {
+            if (hdfsSeek(fs_, hdfs_file, position_in_file) != 0) {
+              status = Status(TErrorCode::DISK_IO_ERROR, Substitute("Error seeking to $0"
+                  " in file: $1: $2", position_in_file, file_, GetHdfsErrorMsg("")));
+              seek_failed = true;
+            }
+          }
+          if (!seek_failed) {
+            current_bytes_read = hdfsRead(fs_, hdfs_file, buffer + *bytes_read,
+                chunk_size);
+            if (current_bytes_read == -1) {
+              status = Status(TErrorCode::DISK_IO_ERROR,
+                  GetHdfsErrorMsg("Error reading from HDFS file: ", file_));
+            }
+          }
+        }
+
+        // Do not retry:
+        // - if read was successful (current_bytes_read != -1)
+        // - or if already retried once
+        // - or if this is not using a borrowed file handle
+        DCHECK_LE(num_retries, 1);
+        if (current_bytes_read != -1 || borrowed_hdfs_fh == nullptr ||
+            num_retries == 1) {
+          break;
+        }
+        // The error may be due to a bad file handle. Reopen the file handle and retry.
+        ++num_retries;
+        RETURN_IF_ERROR(io_mgr_->ReopenCachedHdfsFileHandle(fs_, file_string(),
+            mtime(), &borrowed_hdfs_fh));
+        hdfs_file = borrowed_hdfs_fh->file();
+      }
+      if (!status.ok()) break;
+      if (current_bytes_read == 0) {
+        // No more bytes in the file. The scan range went past the end.
+        *eosr = true;
+        break;
+      }
+      *bytes_read += current_bytes_read;
+
+      // Collect and accumulate statistics
+      GetHdfsStatistics(hdfs_file);
+    }
+
+    if (borrowed_hdfs_fh != nullptr) {
+      io_mgr_->ReleaseCachedHdfsFileHandle(file_string(), borrowed_hdfs_fh, false);
+    }
+    if (!status.ok()) return status;
+  } else {
+    DCHECK(local_file_ != nullptr);
+    *bytes_read = fread(buffer, 1, bytes_to_read, local_file_);
+    DCHECK_GE(*bytes_read, 0);
+    DCHECK_LE(*bytes_read, bytes_to_read);
+    if (*bytes_read < bytes_to_read) {
+      if (ferror(local_file_) != 0) {
+        return Status(TErrorCode::DISK_IO_ERROR, Substitute("Error reading from $0 "
+            "at byte offset: $1: $2", file_, offset_ + bytes_read_, GetStrErrMsg()));
+      } else {
+        // On Linux, we should only get partial reads from block devices on error or eof.
+        DCHECK(feof(local_file_) != 0);
+        *eosr = true;
+      }
+    }
+  }
+  bytes_read_ += *bytes_read;
+  DCHECK_LE(bytes_read_, len_);
+  if (bytes_read_ == len_) *eosr = true;
+  return Status::OK();
+}
+
+Status ScanRange::ReadFromCache(
+    const unique_lock<mutex>& reader_lock, bool* read_succeeded) {
+  DCHECK(reader_lock.mutex() == &reader_->lock_ && reader_lock.owns_lock());
+  DCHECK(try_cache_);
+  DCHECK_EQ(bytes_read_, 0);
+  *read_succeeded = false;
+  Status status = Open(false);
+  if (!status.ok()) return status;
+
+  // Cached reads not supported on local filesystem.
+  if (fs_ == nullptr) return Status::OK();
+
+  {
+    unique_lock<mutex> hdfs_lock(hdfs_lock_);
+    if (is_cancelled_) return Status::CANCELLED;
+
+    DCHECK(exclusive_hdfs_fh_ != nullptr);
+    DCHECK(external_buffer_tag_ == ExternalBufferTag::NO_BUFFER);
+    cached_buffer_ =
+      hadoopReadZero(exclusive_hdfs_fh_->file(), io_mgr_->cached_read_options_, len());
+    if (cached_buffer_ != nullptr) {
+      external_buffer_tag_ = ExternalBufferTag::CACHED_BUFFER;
+    }
+  }
+  // Data was not cached; the caller will fall back to the normal read path.
+  if (external_buffer_tag_ != ExternalBufferTag::CACHED_BUFFER) {
+    VLOG_QUERY << "Cache read failed for scan range: " << DebugString()
+               << ". Switching to disk read path.";
+    // Clean up the scan range state before re-issuing it.
+    Close();
+    return Status::OK();
+  }
+
+  // Cached read returned a buffer, verify we read the correct amount of data.
+  void* buffer = const_cast<void*>(hadoopRzBufferGet(cached_buffer_));
+  int32_t bytes_read = hadoopRzBufferLength(cached_buffer_);
+  // A partial read can happen when files are truncated.
+  // TODO: If HDFS ever supports partially cached blocks, we'll have to distinguish
+  // between errors and partially cached blocks here.
+  if (bytes_read < len()) {
+    VLOG_QUERY << "Error reading file from HDFS cache: " << file_ << ". Expected "
+      << len() << " bytes, but read " << bytes_read << ". Switching to disk read path.";
+    // Close the scan range. 'read_succeeded' is still false, so the caller will fall back
+    // to a non-cached read of this scan range.
+    Close();
+    return Status::OK();
+  }
+
+  // Create a single buffer desc for the entire scan range and enqueue that.
+  // 'mem_tracker' is nullptr because the memory is owned by the HDFS java client,
+  // not the Impala backend.
+  unique_ptr<BufferDescriptor> desc = unique_ptr<BufferDescriptor>(new BufferDescriptor(
+      io_mgr_, reader_, this, reinterpret_cast<uint8_t*>(buffer), 0, nullptr));
+  desc->len_ = bytes_read;
+  desc->scan_range_offset_ = 0;
+  desc->eosr_ = true;
+  bytes_read_ = bytes_read;
+  EnqueueBuffer(reader_lock, move(desc));
+  if (reader_->bytes_read_counter_ != nullptr) {
+    COUNTER_ADD(reader_->bytes_read_counter_, bytes_read);
+  }
+  *read_succeeded = true;
+  reader_->num_used_buffers_.Add(1);
+  return Status::OK();
+}
+
+void ScanRange::GetHdfsStatistics(hdfsFile hdfs_file) {
+  struct hdfsReadStatistics* stats;
+  if (IsHdfsPath(file())) {
+    int success = hdfsFileGetReadStatistics(hdfs_file, &stats);
+    if (success == 0) {
+      reader_->bytes_read_local_.Add(stats->totalLocalBytesRead);
+      reader_->bytes_read_short_circuit_.Add(stats->totalShortCircuitBytesRead);
+      reader_->bytes_read_dn_cache_.Add(stats->totalZeroCopyBytesRead);
+      if (stats->totalLocalBytesRead != stats->totalBytesRead) {
+        num_remote_bytes_ += stats->totalBytesRead - stats->totalLocalBytesRead;
+      }
+      hdfsFileFreeReadStatistics(stats);
+    }
+    hdfsFileClearReadStatistics(hdfs_file);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/row-batch.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.h b/be/src/runtime/row-batch.h
index d246024..2c08f30 100644
--- a/be/src/runtime/row-batch.h
+++ b/be/src/runtime/row-batch.h
@@ -29,7 +29,7 @@
 #include "kudu/util/slice.h"
 #include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/descriptors.h"
-#include "runtime/disk-io-mgr.h"
+#include "runtime/io/disk-io-mgr.h"
 #include "runtime/mem-pool.h"
 
 namespace kudu {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/runtime-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-state.cc b/be/src/runtime/runtime-state.cc
index 308b2c4..37219cc 100644
--- a/be/src/runtime/runtime-state.cc
+++ b/be/src/runtime/runtime-state.cc
@@ -260,7 +260,7 @@ CatalogServiceClientCache* RuntimeState::catalogd_client_cache() {
   return exec_env_->catalogd_client_cache();
 }
 
-DiskIoMgr* RuntimeState::io_mgr() {
+io::DiskIoMgr* RuntimeState::io_mgr() {
   return exec_env_->disk_io_mgr();
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/runtime-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-state.h b/be/src/runtime/runtime-state.h
index 74c27e5..4eb3e10 100644
--- a/be/src/runtime/runtime-state.h
+++ b/be/src/runtime/runtime-state.h
@@ -35,7 +35,6 @@ namespace impala {
 class BufferPool;
 class DataStreamRecvr;
 class DescriptorTbl;
-class DiskIoMgr;
 class Expr;
 class LlvmCodeGen;
 class MemTracker;
@@ -53,6 +52,10 @@ class TPlanFragmentCtx;
 class TPlanFragmentInstanceCtx;
 class QueryState;
 
+namespace io {
+  class DiskIoMgr;
+}
+
 /// TODO: move the typedefs into a separate .h (and fix the includes for that)
 
 /// Counts how many rows an INSERT query has added to a particular partition
@@ -124,7 +127,7 @@ class RuntimeState {
   HBaseTableFactory* htable_factory();
   ImpalaBackendClientCache* impalad_client_cache();
   CatalogServiceClientCache* catalogd_client_cache();
-  DiskIoMgr* io_mgr();
+  io::DiskIoMgr* io_mgr();
   MemTracker* instance_mem_tracker() { return instance_mem_tracker_.get(); }
   MemTracker* query_mem_tracker();  // reference to the query_state_'s memtracker
   ReservationTracker* instance_buffer_reservation() {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/test-env.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/test-env.h b/be/src/runtime/test-env.h
index e721510..5fb9a1c 100644
--- a/be/src/runtime/test-env.h
+++ b/be/src/runtime/test-env.h
@@ -18,7 +18,7 @@
 #ifndef IMPALA_RUNTIME_TEST_ENV
 #define IMPALA_RUNTIME_TEST_ENV
 
-#include "runtime/disk-io-mgr.h"
+#include "runtime/io/disk-io-mgr.h"
 #include "runtime/exec-env.h"
 #include "runtime/fragment-instance-state.h"
 #include "runtime/mem-tracker.h"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b840137c/be/src/runtime/tmp-file-mgr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr-test.cc b/be/src/runtime/tmp-file-mgr-test.cc
index dde6348..fbc0a36 100644
--- a/be/src/runtime/tmp-file-mgr-test.cc
+++ b/be/src/runtime/tmp-file-mgr-test.cc
@@ -47,6 +47,8 @@ DECLARE_int32(stress_scratch_write_delay_ms);
 
 namespace impala {
 
+using namespace io;
+
 class TmpFileMgrTest : public ::testing::Test {
  public:
   virtual void SetUp() {
@@ -130,7 +132,7 @@ class TmpFileMgrTest : public ::testing::Test {
     group->next_allocation_index_ = value;
   }
 
-  /// Helper to cancel the FileGroup DiskIoRequestContext.
+  /// Helper to cancel the FileGroup RequestContext.
   static void CancelIoContext(TmpFileMgr::FileGroup* group) {
     group->io_mgr_->CancelContext(group->io_ctx_.get());
   }
@@ -404,7 +406,7 @@ TEST_F(TmpFileMgrTest, TestScratchRangeRecycling) {
       std::iota(data[i].begin(), data[i].end(), i);
     }
 
-    DiskIoMgr::WriteRange::WriteDoneCallback callback =
+    WriteRange::WriteDoneCallback callback =
         bind(mem_fn(&TmpFileMgrTest::SignalCallback), this, _1);
     vector<unique_ptr<TmpFileMgr::WriteHandle>> handles(BLOCKS);
     // 'file_group' should allocate extra scratch bytes for this 'alloc_size'.
@@ -449,7 +451,7 @@ TEST_F(TmpFileMgrTest, TestProcessMemLimitExceeded) {
   CancelIoContext(&file_group);
 
   // After this error, writing via the file group should fail.
-  DiskIoMgr::WriteRange::WriteDoneCallback callback =
+  WriteRange::WriteDoneCallback callback =
       bind(mem_fn(&TmpFileMgrTest::SignalCallback), this, _1);
   unique_ptr<TmpFileMgr::WriteHandle> handle;
   Status status = file_group.Write(MemRange(data.data(), DATA_SIZE), callback, &handle);
@@ -483,7 +485,7 @@ TEST_F(TmpFileMgrTest, TestEncryptionDuringCancellation) {
 
   // Start a write in flight, which should encrypt the data and write it to disk.
   unique_ptr<TmpFileMgr::WriteHandle> handle;
-  DiskIoMgr::WriteRange::WriteDoneCallback callback =
+  WriteRange::WriteDoneCallback callback =
       bind(mem_fn(&TmpFileMgrTest::SignalCallback), this, _1);
   ASSERT_OK(file_group.Write(data_mem_range, callback, &handle));
   string file_path = handle->TmpFilePath();