You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/08/05 03:18:12 UTC

[01/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Repository: incubator-impala
Updated Branches:
  refs/heads/master d5b0c6b93 -> a98b90bd3


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test b/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
index 58fe1bf..920195b 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
@@ -21,7 +21,7 @@ Per-Host Resources: mem-estimate=24.00MB mem-reservation=1.06MB
 |  hash predicates: c_nationkey = n_nationkey
 |  fk/pk conjuncts: c_nationkey = n_nationkey
 |  runtime filters: RF000 <- n_nationkey
-|  mem-estimate=3.15KB mem-reservation=1.06MB
+|  mem-estimate=3.15KB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=0,1 row-size=355B cardinality=150000
 |
 |--03:EXCHANGE [BROADCAST]
@@ -66,7 +66,7 @@ Per-Host Resources: mem-estimate=48.01MB mem-reservation=2.12MB
 |  hash predicates: c_nationkey = n_nationkey
 |  fk/pk conjuncts: c_nationkey = n_nationkey
 |  runtime filters: RF000 <- n_nationkey
-|  mem-estimate=3.15KB mem-reservation=1.06MB
+|  mem-estimate=3.15KB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=0,1 row-size=355B cardinality=150000
 |
 |--F03:PLAN FRAGMENT [RANDOM] hosts=1 instances=2
@@ -104,7 +104,7 @@ select straight_join *
 from tpch_parquet.lineitem
     left join tpch_parquet.orders on l_orderkey = o_orderkey
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=420.41MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -117,11 +117,11 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1N row-size=454B cardinality=6001215
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=380.41MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=380.41MB mem-reservation=34.00MB
 02:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  mem-estimate=300.41MB mem-reservation=136.00MB
+|  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1N row-size=454B cardinality=6001215
 |
 |--03:EXCHANGE [BROADCAST]
@@ -146,7 +146,7 @@ Per-Host Resources: mem-estimate=380.41MB mem-reservation=136.00MB
    mem-estimate=80.00MB mem-reservation=0B
    tuple-ids=0 row-size=263B cardinality=6001215
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
 Per-Host Resource Estimates: Memory=840.83MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -159,12 +159,12 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1N row-size=454B cardinality=6001215
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=760.83MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=760.83MB mem-reservation=68.00MB
 02:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
 |  hash-table-id=00
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
-|  mem-estimate=300.41MB mem-reservation=136.00MB
+|  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1N row-size=454B cardinality=6001215
 |
 |--F03:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -219,7 +219,7 @@ Per-Host Resources: mem-estimate=18.69MB mem-reservation=34.00MB
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
 |  runtime filters: RF000 <- c_custkey
-|  mem-estimate=18.69MB mem-reservation=34.00MB
+|  mem-estimate=18.69MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
 |--04:EXCHANGE [HASH(c_custkey)]
@@ -270,7 +270,7 @@ Per-Host Resources: mem-estimate=18.69MB mem-reservation=34.00MB
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
 |  runtime filters: RF000 <- c_custkey
-|  mem-estimate=9.35MB mem-reservation=17.00MB
+|  mem-estimate=9.35MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
 |--F04:PLAN FRAGMENT [HASH(o_custkey)] hosts=1 instances=2
@@ -314,7 +314,7 @@ select straight_join *
 from tpch_parquet.orders
     join /*+broadcast*/ tpch_parquet.customer on o_custkey = c_custkey
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=68.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=101.38MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -327,12 +327,12 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=2
-Per-Host Resources: mem-estimate=77.38MB mem-reservation=68.00MB
+Per-Host Resources: mem-estimate=77.38MB mem-reservation=34.00MB
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
 |  runtime filters: RF000 <- c_custkey
-|  mem-estimate=37.38MB mem-reservation=68.00MB
+|  mem-estimate=37.38MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
 |--03:EXCHANGE [BROADCAST]
@@ -358,7 +358,7 @@ Per-Host Resources: mem-estimate=77.38MB mem-reservation=68.00MB
    mem-estimate=40.00MB mem-reservation=0B
    tuple-ids=0 row-size=191B cardinality=1500000
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=68.00MB
 Per-Host Resource Estimates: Memory=202.76MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -371,13 +371,13 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
-Per-Host Resources: mem-estimate=154.76MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=154.76MB mem-reservation=68.00MB
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash-table-id=00
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
 |  runtime filters: RF000 <- c_custkey
-|  mem-estimate=37.38MB mem-reservation=68.00MB
+|  mem-estimate=37.38MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=428B cardinality=1500000
 |
 |--F03:PLAN FRAGMENT [RANDOM] hosts=1 instances=2
@@ -415,7 +415,7 @@ select straight_join *
 from functional_parquet.alltypes
     left join functional_parquet.alltypestiny on alltypes.id = alltypestiny.id
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=2.03GB
 WARNING: The following tables are missing relevant table and/or column statistics.
 functional_parquet.alltypes, functional_parquet.alltypestiny
@@ -430,11 +430,11 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1N row-size=176B cardinality=unavailable
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=2.02GB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=2.02GB mem-reservation=34.00MB
 02:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
 |  hash predicates: alltypes.id = alltypestiny.id
 |  fk/pk conjuncts: assumed fk/pk
-|  mem-estimate=2.00GB mem-reservation=136.00MB
+|  mem-estimate=2.00GB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1N row-size=176B cardinality=unavailable
 |
 |--03:EXCHANGE [BROADCAST]
@@ -459,7 +459,7 @@ Per-Host Resources: mem-estimate=2.02GB mem-reservation=136.00MB
    mem-estimate=16.00MB mem-reservation=0B
    tuple-ids=0 row-size=88B cardinality=unavailable
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
 Per-Host Resource Estimates: Memory=4.06GB
 WARNING: The following tables are missing relevant table and/or column statistics.
 functional_parquet.alltypestiny
@@ -474,12 +474,12 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1N row-size=176B cardinality=unavailable
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=4.03GB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=4.03GB mem-reservation=68.00MB
 02:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
 |  hash-table-id=00
 |  hash predicates: alltypes.id = alltypestiny.id
 |  fk/pk conjuncts: assumed fk/pk
-|  mem-estimate=2.00GB mem-reservation=136.00MB
+|  mem-estimate=2.00GB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1N row-size=176B cardinality=unavailable
 |
 |--F03:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
@@ -516,7 +516,7 @@ select c_nationkey, avg(c_acctbal)
 from tpch_parquet.customer
 group by c_nationkey
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=2.12MB
+Per-Host Resource Reservation: Memory=1.12MB
 Per-Host Resource Estimates: Memory=44.00MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -529,11 +529,11 @@ PLAN-ROOT SINK
 |  tuple-ids=2 row-size=10B cardinality=25
 |
 F01:PLAN FRAGMENT [HASH(c_nationkey)] hosts=1 instances=1
-Per-Host Resources: mem-estimate=10.00MB mem-reservation=2.12MB
+Per-Host Resources: mem-estimate=10.00MB mem-reservation=1.12MB
 03:AGGREGATE [FINALIZE]
 |  output: avg:merge(c_acctbal)
 |  group by: c_nationkey
-|  mem-estimate=10.00MB mem-reservation=2.12MB
+|  mem-estimate=10.00MB mem-reservation=1.12MB spill-buffer=64.00KB
 |  tuple-ids=2 row-size=10B cardinality=25
 |
 02:EXCHANGE [HASH(c_nationkey)]
@@ -545,7 +545,7 @@ Per-Host Resources: mem-estimate=34.00MB mem-reservation=0B
 01:AGGREGATE [STREAMING]
 |  output: avg(c_acctbal)
 |  group by: c_nationkey
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=10B cardinality=25
 |
 00:SCAN HDFS [tpch_parquet.customer, RANDOM]
@@ -556,7 +556,7 @@ Per-Host Resources: mem-estimate=34.00MB mem-reservation=0B
    mem-estimate=24.00MB mem-reservation=0B
    tuple-ids=0 row-size=10B cardinality=150000
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=4.25MB
+Per-Host Resource Reservation: Memory=2.25MB
 Per-Host Resource Estimates: Memory=88.00MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -569,11 +569,11 @@ PLAN-ROOT SINK
 |  tuple-ids=2 row-size=10B cardinality=25
 |
 F01:PLAN FRAGMENT [HASH(c_nationkey)] hosts=1 instances=2
-Per-Host Resources: mem-estimate=20.00MB mem-reservation=4.25MB
+Per-Host Resources: mem-estimate=20.00MB mem-reservation=2.25MB
 03:AGGREGATE [FINALIZE]
 |  output: avg:merge(c_acctbal)
 |  group by: c_nationkey
-|  mem-estimate=10.00MB mem-reservation=2.12MB
+|  mem-estimate=10.00MB mem-reservation=1.12MB spill-buffer=64.00KB
 |  tuple-ids=2 row-size=10B cardinality=25
 |
 02:EXCHANGE [HASH(c_nationkey)]
@@ -585,7 +585,7 @@ Per-Host Resources: mem-estimate=68.00MB mem-reservation=0B
 01:AGGREGATE [STREAMING]
 |  output: avg(c_acctbal)
 |  group by: c_nationkey
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=10B cardinality=25
 |
 00:SCAN HDFS [tpch_parquet.customer, RANDOM]
@@ -603,7 +603,7 @@ from tpch_parquet.lineitem
 group by 1, 2
 having count(*) = 1
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=83.00MB
+Per-Host Resource Reservation: Memory=51.00MB
 Per-Host Resource Estimates: Memory=205.28MB
 
 F04:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -616,12 +616,12 @@ PLAN-ROOT SINK
 |  tuple-ids=2 row-size=33B cardinality=4690314
 |
 F03:PLAN FRAGMENT [HASH(l_orderkey,o_orderstatus)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=18.04MB mem-reservation=66.00MB
+Per-Host Resources: mem-estimate=18.04MB mem-reservation=34.00MB
 07:AGGREGATE [FINALIZE]
 |  output: count:merge(*)
 |  group by: l_orderkey, o_orderstatus
 |  having: count(*) = 1
-|  mem-estimate=18.04MB mem-reservation=66.00MB
+|  mem-estimate=18.04MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=2 row-size=33B cardinality=4690314
 |
 06:EXCHANGE [HASH(l_orderkey,o_orderstatus)]
@@ -633,14 +633,14 @@ Per-Host Resources: mem-estimate=67.24MB mem-reservation=17.00MB
 03:AGGREGATE [STREAMING]
 |  output: count(*)
 |  group by: l_orderkey, o_orderstatus
-|  mem-estimate=54.12MB mem-reservation=0B
+|  mem-estimate=54.12MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=2 row-size=33B cardinality=4690314
 |
 02:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=13.11MB mem-reservation=17.00MB
+|  mem-estimate=13.11MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=0,1 row-size=33B cardinality=5757710
 |
 |--05:EXCHANGE [HASH(o_orderkey)]
@@ -672,7 +672,7 @@ Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
    mem-estimate=80.00MB mem-reservation=0B
    tuple-ids=0 row-size=8B cardinality=6001215
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=83.00MB
+Per-Host Resource Reservation: Memory=51.00MB
 Per-Host Resource Estimates: Memory=327.24MB
 
 F04:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -685,12 +685,12 @@ PLAN-ROOT SINK
 |  tuple-ids=2 row-size=33B cardinality=4690314
 |
 F03:PLAN FRAGMENT [HASH(l_orderkey,o_orderstatus)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=20.00MB mem-reservation=66.00MB
+Per-Host Resources: mem-estimate=20.00MB mem-reservation=34.00MB
 07:AGGREGATE [FINALIZE]
 |  output: count:merge(*)
 |  group by: l_orderkey, o_orderstatus
 |  having: count(*) = 1
-|  mem-estimate=10.00MB mem-reservation=33.00MB
+|  mem-estimate=10.00MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=2 row-size=33B cardinality=4690314
 |
 06:EXCHANGE [HASH(l_orderkey,o_orderstatus)]
@@ -702,7 +702,7 @@ Per-Host Resources: mem-estimate=67.24MB mem-reservation=17.00MB
 03:AGGREGATE [STREAMING]
 |  output: count(*)
 |  group by: l_orderkey, o_orderstatus
-|  mem-estimate=27.06MB mem-reservation=0B
+|  mem-estimate=27.06MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=2 row-size=33B cardinality=4690314
 |
 02:HASH JOIN [INNER JOIN, PARTITIONED]
@@ -710,7 +710,7 @@ Per-Host Resources: mem-estimate=67.24MB mem-reservation=17.00MB
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=6.56MB mem-reservation=8.50MB
+|  mem-estimate=6.56MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=0,1 row-size=33B cardinality=5757710
 |
 |--F05:PLAN FRAGMENT [HASH(l_orderkey)] hosts=2 instances=4
@@ -753,7 +753,7 @@ Per-Host Resources: mem-estimate=160.00MB mem-reservation=0B
 select distinct *
 from tpch_parquet.lineitem
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=264.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=3.31GB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -766,10 +766,10 @@ PLAN-ROOT SINK
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 F01:PLAN FRAGMENT [HASH(tpch_parquet.lineitem.l_orderkey,tpch_parquet.lineitem.l_partkey,tpch_parquet.lineitem.l_suppkey,tpch_parquet.lineitem.l_linenumber,tpch_parquet.lineitem.l_quantity,tpch_parquet.lineitem.l_extendedprice,tpch_parquet.lineitem.l_discount,tpch_parquet.lineitem.l_tax,tpch_parquet.lineitem.l_returnflag,tpch_parquet.lineitem.l_linestatus,tpch_parquet.lineitem.l_shipdate,tpch_parquet.lineitem.l_commitdate,tpch_parquet.lineitem.l_receiptdate,tpch_parquet.lineitem.l_shipinstruct,tpch_parquet.lineitem.l_shipmode,tpch_parquet.lineitem.l_comment)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=1.62GB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=1.62GB mem-reservation=34.00MB
 03:AGGREGATE [FINALIZE]
 |  group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_partkey, tpch_parquet.lineitem.l_suppkey, tpch_parquet.lineitem.l_linenumber, tpch_parquet.lineitem.l_quantity, tpch_parquet.lineitem.l_extendedprice, tpch_parquet.lineitem.l_discount, tpch_parquet.lineitem.l_tax, tpch_parquet.lineitem.l_returnflag, tpch_parquet.lineitem.l_linestatus, tpch_parquet.lineitem.l_shipdate, tpch_parquet.lineitem.l_commitdate, tpch_parquet.lineitem.l_receiptdate, tpch_parquet.lineitem.l_shipinstruct, tpch_parquet.lineitem.l_shipmode, tpch_parquet.lineitem.l_comment
-|  mem-estimate=1.62GB mem-reservation=264.00MB
+|  mem-estimate=1.62GB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 02:EXCHANGE [HASH(tpch_parquet.lineitem.l_orderkey,tpch_parquet.lineitem.l_partkey,tpch_parquet.lineitem.l_suppkey,tpch_parquet.lineitem.l_linenumber,tpch_parquet.lineitem.l_quantity,tpch_parquet.lineitem.l_extendedprice,tpch_parquet.lineitem.l_discount,tpch_parquet.lineitem.l_tax,tpch_parquet.lineitem.l_returnflag,tpch_parquet.lineitem.l_linestatus,tpch_parquet.lineitem.l_shipdate,tpch_parquet.lineitem.l_commitdate,tpch_parquet.lineitem.l_receiptdate,tpch_parquet.lineitem.l_shipinstruct,tpch_parquet.lineitem.l_shipmode,tpch_parquet.lineitem.l_comment)]
@@ -780,7 +780,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
 Per-Host Resources: mem-estimate=1.69GB mem-reservation=0B
 01:AGGREGATE [STREAMING]
 |  group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_partkey, tpch_parquet.lineitem.l_suppkey, tpch_parquet.lineitem.l_linenumber, tpch_parquet.lineitem.l_quantity, tpch_parquet.lineitem.l_extendedprice, tpch_parquet.lineitem.l_discount, tpch_parquet.lineitem.l_tax, tpch_parquet.lineitem.l_returnflag, tpch_parquet.lineitem.l_linestatus, tpch_parquet.lineitem.l_shipdate, tpch_parquet.lineitem.l_commitdate, tpch_parquet.lineitem.l_receiptdate, tpch_parquet.lineitem.l_shipinstruct, tpch_parquet.lineitem.l_shipmode, tpch_parquet.lineitem.l_comment
-|  mem-estimate=1.62GB mem-reservation=0B
+|  mem-estimate=1.62GB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -791,7 +791,7 @@ Per-Host Resources: mem-estimate=1.69GB mem-reservation=0B
    mem-estimate=80.00MB mem-reservation=0B
    tuple-ids=0 row-size=263B cardinality=6001215
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=528.00MB
+Per-Host Resource Reservation: Memory=68.00MB
 Per-Host Resource Estimates: Memory=6.62GB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -804,10 +804,10 @@ PLAN-ROOT SINK
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 F01:PLAN FRAGMENT [HASH(tpch_parquet.lineitem.l_orderkey,tpch_parquet.lineitem.l_partkey,tpch_parquet.lineitem.l_suppkey,tpch_parquet.lineitem.l_linenumber,tpch_parquet.lineitem.l_quantity,tpch_parquet.lineitem.l_extendedprice,tpch_parquet.lineitem.l_discount,tpch_parquet.lineitem.l_tax,tpch_parquet.lineitem.l_returnflag,tpch_parquet.lineitem.l_linestatus,tpch_parquet.lineitem.l_shipdate,tpch_parquet.lineitem.l_commitdate,tpch_parquet.lineitem.l_receiptdate,tpch_parquet.lineitem.l_shipinstruct,tpch_parquet.lineitem.l_shipmode,tpch_parquet.lineitem.l_comment)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=3.23GB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=3.23GB mem-reservation=68.00MB
 03:AGGREGATE [FINALIZE]
 |  group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_partkey, tpch_parquet.lineitem.l_suppkey, tpch_parquet.lineitem.l_linenumber, tpch_parquet.lineitem.l_quantity, tpch_parquet.lineitem.l_extendedprice, tpch_parquet.lineitem.l_discount, tpch_parquet.lineitem.l_tax, tpch_parquet.lineitem.l_returnflag, tpch_parquet.lineitem.l_linestatus, tpch_parquet.lineitem.l_shipdate, tpch_parquet.lineitem.l_commitdate, tpch_parquet.lineitem.l_receiptdate, tpch_parquet.lineitem.l_shipinstruct, tpch_parquet.lineitem.l_shipmode, tpch_parquet.lineitem.l_comment
-|  mem-estimate=1.62GB mem-reservation=264.00MB
+|  mem-estimate=1.62GB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 02:EXCHANGE [HASH(tpch_parquet.lineitem.l_orderkey,tpch_parquet.lineitem.l_partkey,tpch_parquet.lineitem.l_suppkey,tpch_parquet.lineitem.l_linenumber,tpch_parquet.lineitem.l_quantity,tpch_parquet.lineitem.l_extendedprice,tpch_parquet.lineitem.l_discount,tpch_parquet.lineitem.l_tax,tpch_parquet.lineitem.l_returnflag,tpch_parquet.lineitem.l_linestatus,tpch_parquet.lineitem.l_shipdate,tpch_parquet.lineitem.l_commitdate,tpch_parquet.lineitem.l_receiptdate,tpch_parquet.lineitem.l_shipinstruct,tpch_parquet.lineitem.l_shipmode,tpch_parquet.lineitem.l_comment)]
@@ -818,7 +818,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
 Per-Host Resources: mem-estimate=3.39GB mem-reservation=0B
 01:AGGREGATE [STREAMING]
 |  group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_partkey, tpch_parquet.lineitem.l_suppkey, tpch_parquet.lineitem.l_linenumber, tpch_parquet.lineitem.l_quantity, tpch_parquet.lineitem.l_extendedprice, tpch_parquet.lineitem.l_discount, tpch_parquet.lineitem.l_tax, tpch_parquet.lineitem.l_returnflag, tpch_parquet.lineitem.l_linestatus, tpch_parquet.lineitem.l_shipdate, tpch_parquet.lineitem.l_commitdate, tpch_parquet.lineitem.l_receiptdate, tpch_parquet.lineitem.l_shipinstruct, tpch_parquet.lineitem.l_shipmode, tpch_parquet.lineitem.l_comment
-|  mem-estimate=1.62GB mem-reservation=0B
+|  mem-estimate=1.62GB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -834,7 +834,7 @@ select string_col, count(*)
 from functional_parquet.alltypestiny
 group by string_col
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=264.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=272.00MB
 WARNING: The following tables are missing relevant table and/or column statistics.
 functional_parquet.alltypestiny
@@ -849,11 +849,11 @@ PLAN-ROOT SINK
 |  tuple-ids=1 row-size=24B cardinality=unavailable
 |
 F01:PLAN FRAGMENT [HASH(string_col)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=128.00MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=128.00MB mem-reservation=34.00MB
 03:AGGREGATE [FINALIZE]
 |  output: count:merge(*)
 |  group by: string_col
-|  mem-estimate=128.00MB mem-reservation=264.00MB
+|  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=24B cardinality=unavailable
 |
 02:EXCHANGE [HASH(string_col)]
@@ -865,7 +865,7 @@ Per-Host Resources: mem-estimate=144.00MB mem-reservation=0B
 01:AGGREGATE [STREAMING]
 |  output: count(*)
 |  group by: string_col
-|  mem-estimate=128.00MB mem-reservation=0B
+|  mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=24B cardinality=unavailable
 |
 00:SCAN HDFS [functional_parquet.alltypestiny, RANDOM]
@@ -876,7 +876,7 @@ Per-Host Resources: mem-estimate=144.00MB mem-reservation=0B
    mem-estimate=16.00MB mem-reservation=0B
    tuple-ids=0 row-size=16B cardinality=unavailable
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=528.00MB
+Per-Host Resource Reservation: Memory=68.00MB
 Per-Host Resource Estimates: Memory=544.00MB
 WARNING: The following tables are missing relevant table and/or column statistics.
 functional_parquet.alltypestiny
@@ -891,11 +891,11 @@ PLAN-ROOT SINK
 |  tuple-ids=1 row-size=24B cardinality=unavailable
 |
 F01:PLAN FRAGMENT [HASH(string_col)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=256.00MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=256.00MB mem-reservation=68.00MB
 03:AGGREGATE [FINALIZE]
 |  output: count:merge(*)
 |  group by: string_col
-|  mem-estimate=128.00MB mem-reservation=264.00MB
+|  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=24B cardinality=unavailable
 |
 02:EXCHANGE [HASH(string_col)]
@@ -907,7 +907,7 @@ Per-Host Resources: mem-estimate=288.00MB mem-reservation=0B
 01:AGGREGATE [STREAMING]
 |  output: count(*)
 |  group by: string_col
-|  mem-estimate=128.00MB mem-reservation=0B
+|  mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=24B cardinality=unavailable
 |
 00:SCAN HDFS [functional_parquet.alltypestiny, RANDOM]

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test b/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
index 868d6ca..4c208a4 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
@@ -154,14 +154,14 @@ select id from functional.alltypes t1 where exists (
   where t1.id = t2.id)
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=160.00MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=160.00MB mem-reservation=1.06MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 02:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: t1.id = t2.id
 |  runtime filters: RF000 <- t2.id
-|  mem-estimate=44B mem-reservation=136.00MB
+|  mem-estimate=44B mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=0 row-size=4B cardinality=10
 |
 |--01:SCAN HDFS [functional.alltypessmall t2]

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test b/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
index e697914..27459ef 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
@@ -1583,18 +1583,20 @@ from functional.alltypestiny order by id
 int, bigint, bigint, double
 ====
 ---- QUERY
-# Regression test for IMPALA-2265, IMPALA-2559. The max_block_mgr_memory is tuned to
+# Regression test for IMPALA-2265, IMPALA-2559. The buffer_pool_limit is tuned to
 # reproduce the issue when running this query against functional_parquet.
-SET max_block_mgr_memory=16m;
+SET default_spillable_buffer_size=8m;
+SET buffer_pool_limit=16m;
 SELECT lag(-180, 13) over (ORDER BY t1.int_col ASC, t2.int_col ASC) AS int_col
 FROM functional_parquet.alltypes t1 CROSS JOIN functional_parquet.alltypes t2 LIMIT 10;
 ---- CATCH
-Memory limit exceeded
+Failed to get minimum memory reservation
 ====
 ---- QUERY
 # Check that the above query can succeed with the minimum buffers (3 buffers for sort,
-# 1 buffer for analytic).
-SET max_block_mgr_memory=32m;
+# 2 buffers for analytic).
+SET default_spillable_buffer_size=8m;
+SET buffer_pool_limit=40m;
 SELECT lag(-180, 13) over (ORDER BY t1.int_col ASC, t2.int_col ASC) AS int_col
 FROM functional_parquet.alltypes t1 CROSS JOIN functional_parquet.alltypes t2 LIMIT 10;
 ---- TYPES

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
index 64f9b45..122d928 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
@@ -5,7 +5,7 @@ explain
 select *
 from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
 ---- RESULTS: VERIFY_IS_EQUAL
-'Per-Host Resource Reservation: Memory=136.00MB'
+'Per-Host Resource Reservation: Memory=34.00MB'
 'Per-Host Resource Estimates: Memory=476.41MB'
 ''
 'PLAN-ROOT SINK'

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
index f59962c..475758d 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
@@ -5,7 +5,7 @@ explain
 select *
 from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
 ---- RESULTS: VERIFY_IS_EQUAL
-'Per-Host Resource Reservation: Memory=136.00MB'
+'Per-Host Resource Reservation: Memory=34.00MB'
 'Per-Host Resource Estimates: Memory=476.41MB'
 ''
 'PLAN-ROOT SINK'

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
index 2736543..2fa7576 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
@@ -5,7 +5,7 @@ explain
 select *
 from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
 ---- RESULTS: VERIFY_IS_EQUAL
-'Per-Host Resource Reservation: Memory=136.00MB'
+'Per-Host Resource Reservation: Memory=34.00MB'
 'Per-Host Resource Estimates: Memory=476.41MB'
 ''
 'F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
@@ -18,12 +18,12 @@ from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
 '|  tuple-ids=0,1 row-size=454B cardinality=5757710'
 '|'
 'F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3'
-'Per-Host Resources: mem-estimate=388.41MB mem-reservation=136.00MB'
+'Per-Host Resources: mem-estimate=388.41MB mem-reservation=34.00MB'
 '02:HASH JOIN [INNER JOIN, BROADCAST]'
 '|  hash predicates: l_orderkey = o_orderkey'
 '|  fk/pk conjuncts: l_orderkey = o_orderkey'
 '|  runtime filters: RF000 <- o_orderkey'
-'|  mem-estimate=300.41MB mem-reservation=136.00MB'
+'|  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB'
 '|  tuple-ids=0,1 row-size=454B cardinality=5757710'
 '|'
 '|--03:EXCHANGE [BROADCAST]'

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
index 31f4f5b..76d74ce 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
@@ -5,7 +5,7 @@ explain
 select *
 from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
 ---- RESULTS: VERIFY_IS_EQUAL
-'Per-Host Resource Reservation: Memory=136.00MB'
+'Per-Host Resource Reservation: Memory=34.00MB'
 'Per-Host Resource Estimates: Memory=476.41MB'
 ''
 'F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
@@ -18,14 +18,14 @@ from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
 '     tuple-ids=0,1 row-size=454B cardinality=5757710'
 ''
 'F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3'
-'Per-Host Resources: mem-estimate=388.41MB mem-reservation=136.00MB'
+'Per-Host Resources: mem-estimate=388.41MB mem-reservation=34.00MB'
 '  DATASTREAM SINK [FRAGMENT=F02, EXCHANGE=04, UNPARTITIONED]'
 '  |  mem-estimate=0B mem-reservation=0B'
 '  02:HASH JOIN [INNER JOIN, BROADCAST]'
 '  |  hash predicates: l_orderkey = o_orderkey'
 '  |  fk/pk conjuncts: l_orderkey = o_orderkey'
 '  |  runtime filters: RF000 <- o_orderkey'
-'  |  mem-estimate=300.41MB mem-reservation=136.00MB'
+'  |  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB'
 '  |  tuple-ids=0,1 row-size=454B cardinality=5757710'
 '  |'
 '  |--03:EXCHANGE [BROADCAST]'

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test b/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test
index 626b315..c8a80b2 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test
@@ -234,11 +234,10 @@ order by c_custkey
 bigint, bigint
 ====
 ---- QUERY
-# IMPALA-5446: dropped status from Sorter::Reset() when sort cannot get reserved buffer.
-# This query is designed to allow the initial subplan iterations to succeed, but have
-# later iterations fail because the aggregation outside the subplan has accumulated all
-# the memory.
-set max_block_mgr_memory=100m;
+# This was originally a regression test for IMPALA-5446: dropped status from
+# Sorter::Reset() when sort cannot get reserved buffer. However with the
+# IMPALA-3200 changes it now succeeds.
+set buffer_pool_limit=100m;
 select c_custkey, c_name, c_address, c_phone, c_acctbal, c_mktsegment, c_comment,
        o_orderdate, sum(o_totalprice), min(rnum)
 from customer c,
@@ -247,6 +246,17 @@ from customer c,
 group by 1, 2, 3, 4, 5, 6, 7, 8
 order by 9, 10 desc
 limit 10
----- CATCH
-Memory limit exceeded: Query did not have enough memory to get the minimum required buffers in the block manager.
+---- RESULTS
+3115,'Customer#000003115','oB 75yHls7ptt5zCheWJLQ','22-291-864-7521',8889.56,'BUILDING','ts are quickly across the bold deposits. carefully spe','1998-04-23',857.71,3
+53551,'Customer#000053551','e,fT3URuJDH,tE6a6Z3Pjg0DZMFSqWbtYgd','15-429-275-5686',1137.38,'FURNITURE',' detect evenly along the blithely pending asymptotes. furiously even notornis detect carefu','1992-04-18',866.90,25
+64043,'Customer#000064043','Snyi GOB00','22-446-332-2750',4627.24,'FURNITURE','the quickly express asymptotes are around the pe','1992-01-31',870.88,11
+107698,'Customer#000107698','stUoykCwpTBAO3OC3lw','33-686-199-1188',698.89,'AUTOMOBILE',' accounts eat carefully express packages. slyly even id','1993-11-21',875.52,15
+1351,'Customer#000001351','NYMFfkNlCGoTeaDrNO9nn','11-916-210-6616',3106.00,'FURNITURE',' accounts after the final deposits sleep fluffily ironic accoun','1994-01-14',877.30,13
+85468,'Customer#000085468','EuFCX4qk4k0O4bV3UHoNVBTP','23-876-106-3120',8926.31,'AUTOMOBILE','kages. slyly even requests according to the ironic, ironic accounts cajole furiou','1997-04-12',884.52,4
+148522,'Customer#000148522','PIDMm8ulW4oam3VsoZL4f ,dpAf3LEV','16-597-824-4946',-133.27,'BUILDING','ly quickly express deposits. regularly regular requests cajole carefully slyly even noto','1995-03-20',885.75,12
+83222,'Customer#000083222','vI3tUuqtUYGPfrXAYeonVD9','27-599-263-5978',289.66,'BUILDING','ost quietly idle foxes. packages at the slyly pending pa','1993-05-02',891.74,5
+25090,'Customer#000025090','92GyVjZZiCBUmn','23-396-651-8663',8497.56,'BUILDING','osits. slyly final pinto beans sleep carefully fluffily express deposits. packages affix. carefully spe','1995-08-12',895.39,15
+27490,'Customer#000027490','jRzZQ1z7T,nrX5F58P,ZH','26-121-240-6744',7512.30,'AUTOMOBILE','slyly quickly even pinto beans: pend','1995-07-25',896.59,14
+---- TYPES
+bigint,string,string,string,decimal,string,string,string,decimal,bigint
 ====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test b/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test
index 8c8f770..66391a5 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test
@@ -8,7 +8,7 @@
 # consumption / spilling behaviour.
 ####################################################
 
-SET MAX_BLOCK_MGR_MEMORY=275m;
+SET BUFFER_POOL_LIMIT=40m;
 SET RUNTIME_FILTER_MODE=GLOBAL;
 SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
 SET RUNTIME_BLOOM_FILTER_SIZE=16M;
@@ -82,7 +82,8 @@ SET RUNTIME_FILTER_MODE=GLOBAL;
 SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
 SET RUNTIME_FILTER_MIN_SIZE=128MB;
 SET RUNTIME_FILTER_MAX_SIZE=500MB;
-SET MEM_LIMIT=140MB;
+# Allocate enough memory for the join + filter + scan
+SET MEM_LIMIT=170MB;
 select STRAIGHT_JOIN * from alltypes a join [BROADCAST] alltypes b
     on a.month = b.id and b.int_col = -3
 ---- RESULTS

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test b/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test
index d0ac79d..14ad2ed 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test
@@ -16,7 +16,7 @@ row_regex: .*RowsProduced: 10.99..\W10995\W
 # Test to verify that limit_ is correctly enforced when
 # output_batch is at AtCapacity.
 set batch_size=6;
-set max_block_mgr_memory=180m;
+set buffer_pool_limit=180m;
 select * from tpch.lineitem t1 full outer join tpch.lineitem t2 on
 t1.l_orderkey = t2.l_orderkey limit 10;
 ---- RUNTIME_PROFILE

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test b/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test
index 74b7eee..93ed510 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test
@@ -36,7 +36,7 @@ row_regex: .* SpilledRuns: [^0].*
 # Regression test for IMPALA-5554: first string column in sort tuple is null
 # on boundary of spilled block. Test does two sorts with a NULL and non-NULL
 # string column in both potential orders.
-set max_block_mgr_memory=50m;
+set buffer_pool_limit=50m;
 select *
 from (
   select *, first_value(col) over (order by sort_col) fv

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/spilling.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/spilling.test b/testdata/workloads/functional-query/queries/QueryTest/spilling.test
index 0f0e2ca..d8335c6 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/spilling.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/spilling.test
@@ -1,6 +1,6 @@
 ====
 ---- QUERY
-set max_block_mgr_memory=25m;
+set buffer_pool_limit=10m;
 select l_orderkey, count(*)
 from lineitem
 group by 1
@@ -21,15 +21,12 @@ BIGINT, BIGINT
 ---- RUNTIME_PROFILE
 # Verify that spilling and passthrough were activated.
 row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
-row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
 row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
 # Test query with string grouping column and string agg columns
-# Could only get it to spill reliably with num_nodes=1.
-# TODO: revisit with new buffer pool.
+set buffer_pool_limit=10m;
 set num_nodes=1;
-set max_block_mgr_memory=25m;
 select l_returnflag, l_orderkey, avg(l_tax), min(l_shipmode)
 from lineitem
 group by 1,2
@@ -45,7 +42,7 @@ row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
 row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
-set max_block_mgr_memory=25m;
+set buffer_pool_limit=10m;
 select l_orderkey, count(*)
 from lineitem
 group by 1
@@ -65,15 +62,12 @@ order by 1 limit 10;
 BIGINT, BIGINT
 ---- RUNTIME_PROFILE
 row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
-row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
 row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
 # Test query with string grouping column
-# Could only get it to spill reliably with num_nodes=1.
-# TODO: revisit with new buffer pool.
+set buffer_pool_limit=10m;
 set num_nodes=1;
-set max_block_mgr_memory=25m;
 select l_comment, count(*)
 from lineitem
 group by 1
@@ -92,10 +86,8 @@ row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
 # Test query with string grouping column and string agg columns
-# Could only get it to spill reliably with num_nodes=1.
-# TODO: revisit with new buffer pool.
+set buffer_pool_limit=10m;
 set num_nodes=1;
-set max_block_mgr_memory=25m;
 select l_returnflag, l_orderkey, round(avg(l_tax),2), min(l_shipmode)
 from lineitem
 group by 1,2
@@ -113,7 +105,7 @@ row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
 # Test with string intermediate state (avg() uses string intermediate value).
-set max_block_mgr_memory=25m;
+set buffer_pool_limit=10m;
 select l_orderkey, avg(l_orderkey)
 from lineitem
 group by 1
@@ -129,12 +121,10 @@ BIGINT, DOUBLE
 ---- RUNTIME_PROFILE
 # Verify that passthrough and spilling happened in the pre and merge agg.
 row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
-row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
 row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
-set num_nodes=0;
-set max_block_mgr_memory=100m;
+set buffer_pool_limit=15m;
 select count(l1.l_tax)
 from
 lineitem l1,
@@ -156,8 +146,7 @@ BIGINT
 row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
-set num_nodes=0;
-set max_block_mgr_memory=40m;
+set buffer_pool_limit=2m;
 select max(t1.total_count), max(t1.l_shipinstruct), max(t1.l_comment) from
 (select l_shipinstruct, l_comment, count(*) over () total_count from lineitem) t1
 ---- RESULTS
@@ -165,13 +154,12 @@ select max(t1.total_count), max(t1.l_shipinstruct), max(t1.l_comment) from
 ---- TYPES
 BIGINT, STRING, STRING
 ---- RUNTIME_PROFILE
-# Indirectly verify that the analytic spilled: if it spills a block, it must repin it.
-row_regex: .*PinTime: [1-9][0-9]*.*
+# Verify that the analytic spilled
+row_regex: .*PeakUnpinnedBytes: [1-9][0-9]*.*
 ====
 ---- QUERY
-# Run this query with very low memory. Since the tables are small, the PA/PHJ should be
-# using buffers much smaller than the io buffer.
-set max_block_mgr_memory=10m;
+# Run this query with very low memory, but enough not to spill.
+set buffer_pool_limit=20m;
 select a.int_col, count(*)
 from functional.alltypessmall a, functional.alltypessmall b, functional.alltypessmall c
 where a.id = b.id and b.id = c.id group by a.int_col
@@ -192,12 +180,11 @@ INT, BIGINT
 # This query is not meant to spill.
 row_regex: .*SpilledPartitions: 0 .*
 ====
----- QUERY: TPCH-Q21
+---- QUERY: TPCH-Q22
 # Adding TPCH-Q21 in the spilling test to check for IMPALA-1471 (spilling left anti
 # and left outer joins were returning wrong results).
 # Q21 - Suppliers Who Kept Orders Waiting Query
-set num_nodes=0;
-set max_block_mgr_memory=65m;
+set buffer_pool_limit=20m;
 select
   s_name,
   count(*) as numwait
@@ -347,8 +334,7 @@ row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
 # Test aggregation spill with group_concat distinct
-set num_nodes=1;
-set max_block_mgr_memory=100m;
+set buffer_pool_limit=50m;
 select l_orderkey, count(*), group_concat(distinct l_linestatus, '|')
 from lineitem
 group by 1
@@ -376,7 +362,6 @@ row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
 # nodes. CastToChar will do "local" memory allocation. Without the fix of
 # IMPALA-2612, the peak memory consumption will be higher.
 set mem_limit=800m;
-set num_nodes=1;
 set num_scanner_threads=1;
 select count(distinct concat(cast(l_comment as char(120)), cast(l_comment as char(120)),
                              cast(l_comment as char(120)), cast(l_comment as char(120)),
@@ -394,8 +379,7 @@ row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
 # Test sort with small char column materialized by exprs.
 # Set low memory limit to force spilling.
 # IMPALA-3332: comparator makes local allocations that cause runaway memory consumption.
-set num_nodes=0;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
 set mem_limit=200m;
 set disable_outermost_topn=1;
 select cast(l_comment as char(50))
@@ -432,9 +416,8 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
 # Test sort with small input char column materialized before sort.
-set num_nodes=0;
 set mem_limit=200m;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
 set disable_outermost_topn=1;
 select char_col
 from (select cast(l_comment as char(50)) char_col
@@ -472,9 +455,8 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
 ---- QUERY
 # Test sort with large input char column materialized before sort.
 # Set low memory limit to force spilling.
-set num_nodes=0;
 set mem_limit=200m;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
 set disable_outermost_topn=1;
 select char_col
 from (select cast(l_comment as char(200)) char_col
@@ -512,8 +494,7 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
 ---- QUERY
 # Test sort with varchar column materialized by exprs.
 # Set low memory limit to force spilling.
-set num_nodes=0;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
 # IMPALA-3332: comparator makes local allocations that cause runaway memory consumption.
 set mem_limit=200m;
 set disable_outermost_topn=1;
@@ -552,9 +533,8 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
 ---- QUERY
 # Test sort with input varchar column materialized before sort.
 # Set low memory limit to force spilling.
-set num_nodes=0;
 set mem_limit=200m;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
 set disable_outermost_topn=1;
 select char_col
 from (select cast(l_comment as varchar(50)) char_col
@@ -592,9 +572,7 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
 ---- QUERY
 # IMPALA-1346/IMPALA-1546: fix sorter memory management so that it can complete
 # successfully when in same pipeline as a spilling join.
-set num_nodes=0;
-set mem_limit=200m;
-set max_block_mgr_memory=50m;
+set buffer_pool_limit=50m;
 set disable_outermost_topn=1;
 select * from lineitem
   inner join orders on l_orderkey = o_orderkey
@@ -632,7 +610,7 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
 # Tests for the case where a spilled partition has 0 probe rows and so we don't build the
 # hash table in a partitioned hash join.
 # INNER JOIN
-set max_block_mgr_memory=10m;
+set buffer_pool_limit=10m;
 select straight_join count(*)
 from
 lineitem a, lineitem b
@@ -648,7 +626,7 @@ row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
 # spilled partition with 0 probe rows, NULL AWARE LEFT ANTI JOIN
-set max_block_mgr_memory=10m;
+set buffer_pool_limit=10m;
 select straight_join count(*)
 from
 lineitem a
@@ -664,7 +642,7 @@ row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
 # spilled partition with 0 probe rows, RIGHT OUTER JOIN
-set max_block_mgr_memory=10m;
+set buffer_pool_limit=10m;
 select straight_join count(*)
 from
 supplier right outer join lineitem on s_suppkey = l_suppkey
@@ -678,7 +656,7 @@ row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
 ====
 ---- QUERY
 # spilled partition with 0 probe rows, RIGHT ANTI JOIN
-set max_block_mgr_memory=30m;
+set buffer_pool_limit=30m;
 with x as (select * from supplier limit 10)
 select straight_join count(*)
 from
@@ -698,7 +676,7 @@ row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
 #   where l1.l_quantity = 31.0 and l1.l_tax = 0.03 and l1.l_orderkey <= 100000
 # order by l_orderkey, l_partkey, l_suppkey, l_linenumber
 # limit 5
-set max_block_mgr_memory=7m;
+set buffer_pool_limit=7m;
 set num_nodes=1;
 select straight_join l.*
 from
@@ -726,3 +704,16 @@ bigint,bigint,bigint,int,decimal,decimal,decimal,decimal,string,string,string,st
 1382,156162,6163,5,31.00,37762.96,0.07,0.03,'R','F','1993-10-26','1993-10-15','1993-11-09','TAKE BACK RETURN','FOB','hely regular dependencies. f'
 1509,186349,3904,6,31.00,44495.54,0.04,0.03,'A','F','1993-07-14','1993-08-21','1993-08-06','COLLECT COD','SHIP','ic deposits cajole carefully. quickly bold '
 ====
+---- QUERY
+# Test aggregation with minimum required reservation to exercise IMPALA-2708.
+# Merge aggregation requires 17 buffers. The buffer size is 256k for this test.
+set buffer_pool_limit=4352k;
+select count(*)
+from (select distinct * from orders) t
+---- TYPES
+BIGINT
+---- RESULTS
+1500000
+---- RUNTIME_PROFILE
+row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/targeted-stress/queries/agg_stress.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/targeted-stress/queries/agg_stress.test b/testdata/workloads/targeted-stress/queries/agg_stress.test
index b2d45a9..a6657ba 100644
--- a/testdata/workloads/targeted-stress/queries/agg_stress.test
+++ b/testdata/workloads/targeted-stress/queries/agg_stress.test
@@ -1,7 +1,7 @@
 ====
 ---- QUERY
 # This memory limit causes a spill to happen for this query
-set max_block_mgr_memory=250m;
+set buffer_pool_limit=250m;
 # This query forces many joins and aggregations with spilling
 # and can expose race conditions in the spilling code if run in parallel
 select

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/tpch/queries/insert_parquet.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/queries/insert_parquet.test b/testdata/workloads/tpch/queries/insert_parquet.test
index 4707b7b..862548e 100644
--- a/testdata/workloads/tpch/queries/insert_parquet.test
+++ b/testdata/workloads/tpch/queries/insert_parquet.test
@@ -67,6 +67,8 @@ insert overwrite table test_insert_huge_vals
 ---- QUERY
 # Verify the values written to test_insert_huge_vals were as expected by counting
 # the results of an inner join of that table with the same query used in the insert.
+# Increase the spillable buffer size to fit the large values on the right side of the hash join.
+set min_spillable_buffer_size=1m;
 select count(*) from
   (select cast(l_orderkey as string) s from tpch.lineitem union
    select group_concat(concat(s_name, s_address, s_phone)) from tpch.supplier union

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/comparison/discrepancy_searcher.py
----------------------------------------------------------------------
diff --git a/tests/comparison/discrepancy_searcher.py b/tests/comparison/discrepancy_searcher.py
index ccbdd66..e0e1725 100755
--- a/tests/comparison/discrepancy_searcher.py
+++ b/tests/comparison/discrepancy_searcher.py
@@ -315,7 +315,7 @@ class QueryExecutor(object):
         SET DISABLE_STREAMING_PREAGGREGATIONS={disable_streaming_preaggregations};
         SET DISABLE_UNSAFE_SPILLS={disable_unsafe_spills};
         SET EXEC_SINGLE_NODE_ROWS_THRESHOLD={exec_single_node_rows_threshold};
-        SET MAX_BLOCK_MGR_MEMORY={max_block_mgr_memory};
+        SET BUFFER_POOL_LIMIT={buffer_pool_limit};
         SET MAX_IO_BUFFERS={max_io_buffers};
         SET MAX_SCAN_RANGE_LENGTH={max_scan_range_length};
         SET NUM_NODES={num_nodes};
@@ -333,7 +333,7 @@ class QueryExecutor(object):
             disable_streaming_preaggregations=choice((0, 1)),
             disable_unsafe_spills=choice((0, 1)),
             exec_single_node_rows_threshold=randint(1, 100000000),
-            max_block_mgr_memory=randint(1, 100000000),
+            buffer_pool_limit=randint(1, 100000000),
             max_io_buffers=randint(1, 100000000),
             max_scan_range_length=randint(1, 100000000),
             num_nodes=randint(3, 3),

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/custom_cluster/test_scratch_disk.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_scratch_disk.py b/tests/custom_cluster/test_scratch_disk.py
index 7e02de5..579ca1e 100644
--- a/tests/custom_cluster/test_scratch_disk.py
+++ b/tests/custom_cluster/test_scratch_disk.py
@@ -40,7 +40,7 @@ class TestScratchDir(CustomClusterTestSuite):
   # Block manager memory limit that is low enough to force Impala to spill to disk when
   # executing spill_query and high enough that we can execute in_mem_query without
   # spilling.
-  max_block_mgr_memory = "64m"
+  buffer_pool_limit = "64m"
 
   def count_nonempty_dirs(self, dirs):
     count = 0
@@ -87,7 +87,7 @@ class TestScratchDir(CustomClusterTestSuite):
     self.assert_impalad_log_contains("INFO", "Using scratch directory ",
                                     expected_count=1)
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     impalad = self.cluster.get_any_impalad()
     client = impalad.service.create_beeswax_client()
     self.execute_query_expect_success(client, self.spill_query, exec_option)
@@ -100,7 +100,7 @@ class TestScratchDir(CustomClusterTestSuite):
     self.assert_impalad_log_contains("WARNING",
         "Running without spill to disk: no scratch directories provided\.")
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     impalad = self.cluster.get_any_impalad()
     client = impalad.service.create_beeswax_client()
     # Expect spill to disk to fail
@@ -121,7 +121,7 @@ class TestScratchDir(CustomClusterTestSuite):
     self.assert_impalad_log_contains("WARNING", "Could not remove and recreate directory "
             + ".*: cannot use it for scratch\. Error was: .*", expected_count=5)
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     impalad = self.cluster.get_any_impalad()
     client = impalad.service.create_beeswax_client()
     # Expect spill to disk to fail
@@ -144,7 +144,7 @@ class TestScratchDir(CustomClusterTestSuite):
         + "Encountered exception while verifying existence of directory path",
         expected_count=5)
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     impalad = self.cluster.get_any_impalad()
     client = impalad.service.create_beeswax_client()
     # Expect spill to disk to fail
@@ -164,7 +164,7 @@ class TestScratchDir(CustomClusterTestSuite):
     self.assert_impalad_log_contains("INFO", "Using scratch directory ",
                                     expected_count=len(dirs))
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     # Trigger errors when writing the first two directories.
     shutil.rmtree(dirs[0]) # Remove the first directory.
     # Make all subdirectories in the second directory non-writable.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/custom_cluster/test_spilling.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_spilling.py b/tests/custom_cluster/test_spilling.py
deleted file mode 100644
index 774e83f..0000000
--- a/tests/custom_cluster/test_spilling.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-from copy import deepcopy
-
-from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
-from tests.common.test_dimensions import (
-    create_single_exec_option_dimension,
-    create_parquet_dimension)
-
-class TestSpilling(CustomClusterTestSuite):
-  @classmethod
-  def get_workload(self):
-    return 'functional-query'
-
-  @classmethod
-  def add_test_dimensions(cls):
-    super(TestSpilling, cls).add_test_dimensions()
-    cls.ImpalaTestMatrix.clear_constraints()
-    cls.ImpalaTestMatrix.add_dimension(create_parquet_dimension('tpch'))
-    cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
-
-  # Reduce the IO read size. This reduces the memory required to trigger spilling.
-  @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(
-      impalad_args="--read_size=200000",
-      catalogd_args="--load_catalog_in_background=false")
-  def test_spilling(self, vector):
-    new_vector = deepcopy(vector)
-    # remove this. the test cases set this explicitly.
-    del new_vector.get_value('exec_option')['num_nodes']
-    self.run_test_case('QueryTest/spilling', new_vector)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_cancellation.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_cancellation.py b/tests/query_test/test_cancellation.py
index bb1fc0d..91be5d4 100644
--- a/tests/query_test/test_cancellation.py
+++ b/tests/query_test/test_cancellation.py
@@ -52,7 +52,7 @@ DEBUG_ACTIONS = [None, 'WAIT']
 # Extra dimensions to test order by without limit
 SORT_QUERY = 'select * from lineitem order by l_orderkey'
 SORT_CANCEL_DELAY = range(6, 10)
-SORT_BLOCK_MGR_LIMIT = ['0', '300m'] # Test spilling and non-spilling sorts.
+SORT_BUFFER_POOL_LIMIT = ['0', '300m'] # Test spilling and non-spilling sorts.
 
 class TestCancellation(ImpalaTestSuite):
   @classmethod
@@ -71,7 +71,7 @@ class TestCancellation(ImpalaTestSuite):
     cls.ImpalaTestMatrix.add_dimension(
         ImpalaTestDimension('action', *DEBUG_ACTIONS))
     cls.ImpalaTestMatrix.add_dimension(
-        ImpalaTestDimension('max_block_mgr_memory', 0))
+        ImpalaTestDimension('buffer_pool_limit', 0))
 
     cls.ImpalaTestMatrix.add_constraint(
         lambda v: v.get_value('query_type') != 'CTAS' or (\
@@ -121,8 +121,8 @@ class TestCancellation(ImpalaTestSuite):
     debug_action = '0:GETNEXT:' + action if action != None else ''
     vector.get_value('exec_option')['debug_action'] = debug_action
 
-    vector.get_value('exec_option')['max_block_mgr_memory'] =\
-        vector.get_value('max_block_mgr_memory')
+    vector.get_value('exec_option')['buffer_pool_limit'] =\
+        vector.get_value('buffer_pool_limit')
 
     # Execute the query multiple times, cancelling it each time.
     for i in xrange(NUM_CANCELATION_ITERATIONS):
@@ -216,7 +216,7 @@ class TestCancellationFullSort(TestCancellation):
     cls.ImpalaTestMatrix.add_dimension(
         ImpalaTestDimension('cancel_delay', *SORT_CANCEL_DELAY))
     cls.ImpalaTestMatrix.add_dimension(
-        ImpalaTestDimension('max_block_mgr_memory', *SORT_BLOCK_MGR_LIMIT))
+        ImpalaTestDimension('buffer_pool_limit', *SORT_BUFFER_POOL_LIMIT))
     cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('action', None))
     cls.ImpalaTestMatrix.add_constraint(lambda v:\
        v.get_value('table_format').file_format =='parquet' and\

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_mem_usage_scaling.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_mem_usage_scaling.py b/tests/query_test/test_mem_usage_scaling.py
index e6eccf9..bbdc771 100644
--- a/tests/query_test/test_mem_usage_scaling.py
+++ b/tests/query_test/test_mem_usage_scaling.py
@@ -82,7 +82,8 @@ class TestExprMemUsage(ImpalaTestSuite):
 
 class TestLowMemoryLimits(ImpalaTestSuite):
   '''Super class for the memory limit tests with the TPC-H and TPC-DS queries'''
-  EXPECTED_ERROR_MSG = "Memory limit exceeded"
+  EXPECTED_ERROR_MSGS = ["Memory limit exceeded",
+      "Failed to get minimum memory reservation"]
 
   def low_memory_limit_test(self, vector, tpch_query, limit, xfail_mem_limit=None):
     mem = vector.get_value('mem_limit')
@@ -93,28 +94,36 @@ class TestLowMemoryLimits(ImpalaTestSuite):
     # If memory limit larger than the minimum threshold, then it is not expected to fail.
     expects_error = mem < limit
     new_vector = copy(vector)
-    new_vector.get_value('exec_option')['mem_limit'] = str(mem) + "m"
+    exec_options = new_vector.get_value('exec_option')
+    exec_options['mem_limit'] = str(mem) + "m"
+
+    # Reduce the page size to better exercise page boundary logic.
+    exec_options['default_spillable_buffer_size'] = "256k"
     try:
       self.run_test_case(tpch_query, new_vector)
     except ImpalaBeeswaxException as e:
       if not expects_error and not xfail_mem_limit: raise
-      assert TestLowMemoryLimits.EXPECTED_ERROR_MSG in str(e)
+      found_expected_error = False
+      for error_msg in TestLowMemoryLimits.EXPECTED_ERROR_MSGS:
+        if error_msg in str(e): found_expected_error = True
+      assert found_expected_error, str(e)
       if not expects_error and xfail_mem_limit:
         pytest.xfail(xfail_mem_limit)
 
 
 class TestTpchMemLimitError(TestLowMemoryLimits):
-  # TODO: After we stabilize the mem usage test, we should move this test to exhaustive.
+  # TODO: consider moving this test to exhaustive.
   # The mem limits that will be used.
-  MEM_IN_MB = [20, 140, 180, 275, 450, 700, 980]
+  MEM_IN_MB = [20, 140, 180, 220, 275, 450, 700]
 
   # Different values of mem limits and minimum mem limit (in MBs) each query is expected
-  # to run without problem. Those values were determined by manual testing.
-  MIN_MEM_FOR_TPCH = { 'Q1' : 140, 'Q2' : 120, 'Q3' : 240, 'Q4' : 125, 'Q5' : 235,\
-                       'Q6' : 25, 'Q7' : 265, 'Q8' : 250, 'Q9' : 400, 'Q10' : 240,\
-                       'Q11' : 110, 'Q12' : 125, 'Q13' : 110, 'Q14' : 229, 'Q15' : 125,\
-                       'Q16' : 125, 'Q17' : 130, 'Q18' : 475, 'Q19' : 240, 'Q20' : 250,\
-                       'Q21' : 620, 'Q22' : 125}
+  # to run without problem. These were determined using the query_runtime_info.json file
+  # produced by the stress test (i.e. concurrent_select.py).
+  MIN_MEM_FOR_TPCH = { 'Q1' : 125, 'Q2' : 125, 'Q3' : 112, 'Q4' : 137, 'Q5' : 137,\
+                       'Q6' : 25, 'Q7' : 200, 'Q8' : 125, 'Q9' : 200, 'Q10' : 162,\
+                       'Q11' : 112, 'Q12' : 150, 'Q13' : 125, 'Q14' : 125, 'Q15' : 125,\
+                       'Q16' : 137, 'Q17' : 137, 'Q18' : 196, 'Q19' : 112, 'Q20' : 162,\
+                       'Q21' : 187, 'Q22' : 125}
 
   @classmethod
   def get_workload(self):

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_nested_types.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_nested_types.py b/tests/query_test/test_nested_types.py
index 96a170b..bb74faa 100644
--- a/tests/query_test/test_nested_types.py
+++ b/tests/query_test/test_nested_types.py
@@ -27,7 +27,6 @@ from tests.common.skip import (
     SkipIfS3,
     SkipIfADLS,
     SkipIfLocal)
-
 from tests.util.filesystem_utils import WAREHOUSE, get_fs_path
 
 class TestNestedTypes(ImpalaTestSuite):

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_scratch_limit.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scratch_limit.py b/tests/query_test/test_scratch_limit.py
index 6a13318..6e19bb5 100644
--- a/tests/query_test/test_scratch_limit.py
+++ b/tests/query_test/test_scratch_limit.py
@@ -28,7 +28,7 @@ class TestScratchLimit(ImpalaTestSuite):
 
   # Block manager memory limit that is low enough to
   # force Impala to spill to disk when executing 'spill_query'
-  max_block_mgr_memory = "64m"
+  buffer_pool_limit = "64m"
 
   @classmethod
   def get_workload(self):
@@ -48,7 +48,7 @@ class TestScratchLimit(ImpalaTestSuite):
     its required scratch space which in this case is 128m.
     """
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     exec_option['scratch_limit'] = '500m'
     self.execute_query_expect_success(self.client, self.spill_query, exec_option)
 
@@ -58,7 +58,7 @@ class TestScratchLimit(ImpalaTestSuite):
     its required scratch space which in this case is 128m.
     """
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     exec_option['scratch_limit'] = '24m'
     expected_error = 'Scratch space limit of %s bytes exceeded'
     scratch_limit_in_bytes = 24 * 1024 * 1024
@@ -74,7 +74,7 @@ class TestScratchLimit(ImpalaTestSuite):
     zero which means no scratch space can be allocated.
     """
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     exec_option['scratch_limit'] = '0'
     self.execute_query_expect_failure(self.spill_query, exec_option)
 
@@ -83,7 +83,7 @@ class TestScratchLimit(ImpalaTestSuite):
     Query runs to completion with a scratch Limit of -1 means default/no limit.
     """
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     exec_option['scratch_limit'] = '-1'
     self.execute_query_expect_success(self.client, self.spill_query, exec_option)
 
@@ -92,7 +92,7 @@ class TestScratchLimit(ImpalaTestSuite):
     Query runs to completion with the default setting of no scratch limit.
     """
     exec_option = vector.get_value('exec_option')
-    exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+    exec_option['buffer_pool_limit'] = self.buffer_pool_limit
     self.execute_query_expect_success(self.client, self.spill_query, exec_option)
 
   def test_with_zero_scratch_limit_no_memory_limit(self, vector):

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_sort.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_sort.py b/tests/query_test/test_sort.py
index b048c9f..df95ddd 100644
--- a/tests/query_test/test_sort.py
+++ b/tests/query_test/test_sort.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from copy import copy
+
 from tests.common.impala_test_suite import ImpalaTestSuite
 
 def transpose_results(result, map_fn=lambda x: x):
@@ -46,7 +48,7 @@ class TestQueryFullSort(ImpalaTestSuite):
        takes about a minute"""
     query = """select l_comment, l_partkey, l_orderkey, l_suppkey, l_commitdate
             from lineitem order by l_comment limit 100000"""
-    exec_option = vector.get_value('exec_option')
+    exec_option = copy(vector.get_value('exec_option'))
     exec_option['disable_outermost_topn'] = 1
     table_format = vector.get_value('table_format')
 
@@ -63,16 +65,18 @@ class TestQueryFullSort(ImpalaTestSuite):
     query = """select o_orderdate, o_custkey, o_comment
       from orders
       order by o_orderdate"""
-    exec_option = vector.get_value('exec_option')
+    exec_option = copy(vector.get_value('exec_option'))
     table_format = vector.get_value('table_format')
 
-    max_block_mgr_memory_values = ['-1', '48M'] # Unlimited and minimum memory.
+    # The memory values below assume 8M pages.
+    exec_option['default_spillable_buffer_size'] = '8M'
+    buffer_pool_limit_values = ['-1', '48M'] # Unlimited and minimum memory.
     if self.exploration_strategy() == 'exhaustive' and \
         table_format.file_format == 'parquet':
       # Test some intermediate values for parquet on exhaustive.
-      max_block_mgr_memory_values += ['64M', '128M', '256M']
-    for max_block_mgr_memory in max_block_mgr_memory_values:
-      exec_option['max_block_mgr_memory'] = max_block_mgr_memory
+      buffer_pool_limit_values += ['64M', '128M', '256M']
+    for buffer_pool_limit in buffer_pool_limit_values:
+      exec_option['buffer_pool_limit'] = buffer_pool_limit
       result = transpose_results(self.execute_query(
         query, exec_option, table_format=table_format).data)
       assert(result[0] == sorted(result[0]))
@@ -83,7 +87,7 @@ class TestQueryFullSort(ImpalaTestSuite):
     query = """select o1.o_orderdate, o2.o_custkey, o1.o_comment from orders o1 join
     orders o2 on (o1.o_orderkey = o2.o_orderkey) order by o1.o_orderdate limit 100000"""
 
-    exec_option = vector.get_value('exec_option')
+    exec_option = copy(vector.get_value('exec_option'))
     exec_option['disable_outermost_topn'] = 1
     exec_option['mem_limit'] = "1200m"
     table_format = vector.get_value('table_format')
@@ -97,7 +101,7 @@ class TestQueryFullSort(ImpalaTestSuite):
     select * from orders union all select * from orders) as i
     order by o_orderdate limit 100000"""
 
-    exec_option = vector.get_value('exec_option')
+    exec_option = copy(vector.get_value('exec_option'))
     exec_option['disable_outermost_topn'] = 1
     exec_option['mem_limit'] = "3000m"
     table_format = vector.get_value('table_format')
@@ -120,7 +124,7 @@ class TestQueryFullSort(ImpalaTestSuite):
       select * from lineitem limit 300000) t
     order by l_orderkey"""
 
-    exec_option = vector.get_value('exec_option')
+    exec_option = copy(vector.get_value('exec_option'))
     exec_option['disable_outermost_topn'] = 1
     # Run with a single scanner thread so that the input doesn't get reordered.
     exec_option['num_nodes'] = "1"
@@ -145,9 +149,9 @@ class TestQueryFullSort(ImpalaTestSuite):
     limit 100000
     """
 
-    exec_option = vector.get_value('exec_option')
+    exec_option = copy(vector.get_value('exec_option'))
     exec_option['disable_outermost_topn'] = 1
-    exec_option['max_block_mgr_memory'] = "256m"
+    exec_option['buffer_pool_limit'] = "256m"
     exec_option['num_nodes'] = "1"
     table_format = vector.get_value('table_format')
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_spilling.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_spilling.py b/tests/query_test/test_spilling.py
new file mode 100644
index 0000000..e2d5141
--- /dev/null
+++ b/tests/query_test/test_spilling.py
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+from tests.common.impala_test_suite import ImpalaTestSuite
+from tests.common.test_dimensions import (create_exec_option_dimension_from_dict,
+    create_parquet_dimension)
+
+class TestSpilling(ImpalaTestSuite):
+  @classmethod
+  def get_workload(self):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestSpilling, cls).add_test_dimensions()
+    cls.ImpalaTestMatrix.clear_constraints()
+    cls.ImpalaTestMatrix.add_dimension(create_parquet_dimension('tpch'))
+    # Tests are calibrated so that they can execute and spill with this page size.
+    cls.ImpalaTestMatrix.add_dimension(
+        create_exec_option_dimension_from_dict({'default_spillable_buffer_size' : ['256k']}))
+
+  def test_spilling(self, vector):
+    self.run_test_case('QueryTest/spilling', vector)


[05/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-tuple-stream.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream.h b/be/src/runtime/buffered-tuple-stream.h
deleted file mode 100644
index 41d63bf..0000000
--- a/be/src/runtime/buffered-tuple-stream.h
+++ /dev/null
@@ -1,561 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_H
-#define IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_H
-
-#include <vector>
-#include <set>
-
-#include "common/status.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/row-batch.h"
-
-namespace impala {
-
-class BufferedBlockMgr;
-class RuntimeProfile;
-class RuntimeState;
-class RowDescriptor;
-class SlotDescriptor;
-class TupleRow;
-
-/// Class that provides an abstraction for a stream of tuple rows. Rows can be
-/// added to the stream and returned. Rows are returned in the order they are added.
-///
-/// The underlying memory management is done by the BufferedBlockMgr.
-///
-/// The tuple stream consists of a number of small (less than IO-sized blocks) before
-/// an arbitrary number of IO-sized blocks. The smaller blocks do not spill and are
-/// there to lower the minimum buffering requirements. For example, an operator that
-/// needs to maintain 64 streams (1 buffer per partition) would need, by default,
-/// 64 * 8MB = 512MB of buffering. A query with 5 of these operators would require
-/// 2.56GB just to run, regardless of how much of that is used. This is
-/// problematic for small queries. Instead we will start with a fixed number of small
-/// buffers (currently 2 small buffers: one 64KB and one 512KB) and only start using IO
-/// sized buffers when those fill up. The small buffers never spill.
-/// The stream will *not* automatically switch from using small buffers to IO-sized
-/// buffers when all the small buffers for this stream have been used.
-///
-/// The BufferedTupleStream is *not* thread safe from the caller's point of view. It is
-/// expected that all the APIs are called from a single thread. Internally, the
-/// object is thread safe wrt the underlying block mgr.
-///
-/// Buffer management:
-/// The stream is either pinned or unpinned, set via PinStream() and UnpinStream().
-/// Blocks are optionally deleted as they are read, set with the delete_on_read argument
-/// to PrepareForRead().
-///
-/// Block layout:
-/// If the stream's tuples are nullable (i.e. has_nullable_tuple_ is true), there is a
-/// bitstring at the start of each block with null indicators for all tuples in each row
-/// in the block. The length of the bitstring is a function of the block size. Row data
-/// is stored after the null indicators if present, or at the start of the block
-/// otherwise. Rows are stored back to back in the stream, with no interleaving of data
-/// from different rows. There is no padding or alignment between rows.
-///
-/// Null tuples:
-/// The order of bits in the null indicators bitstring corresponds to the order of
-/// tuples in the block. The NULL tuples are not stored in the row itself, only as set
-/// bits in the null indicators bitstring.
-///
-/// Tuple row layout:
-/// The fixed length parts of the row's tuples are stored first, followed by var len data
-/// for inlined_string_slots_ and inlined_coll_slots_. Other "external" var len slots can
-/// point to var len data outside the stream. When reading the stream, the length of each
-/// row's var len data in the stream must be computed to find the next row's start.
-///
-/// The tuple stream supports reading from the stream into RowBatches without copying
-/// out any data: the RowBatches' Tuple pointers will point directly into the stream's
-/// blocks. The fixed length parts follow Impala's internal tuple format, so for the
-/// tuple to be valid, we only need to update pointers to point to the var len data
-/// in the stream. These pointers need to be updated by the stream because a spilled
-/// block may be relocated to a different location in memory. The pointers are updated
-/// lazily upon reading the stream via GetNext() or GetRows().
-///
-/// Example layout for a row with two tuples ((1, "hello"), (2, "world")) with all var
-/// len data stored in the stream:
-///  <---- tuple 1 -----> <------ tuple 2 ------> <- var len -> <- next row ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// | IntVal | StringVal | BigIntVal | StringVal |             | ...
-/// +--------+-----------+-----------+-----------++------------+
-/// | val: 1 | len: 5    | val: 2    | len: 5    | helloworld  | ...
-/// |        | ptr: 0x.. |           | ptr: 0x.. |             | ...
-/// +--------+-----------+-----------+-----------+-------------+
-///  <--4b--> <---12b---> <----8b---> <---12b---> <----10b---->
-//
-/// Example layout for a row with a single tuple (("hello", "world")) with the second
-/// string slot stored externally to the stream:
-///  <------ tuple 1 ------> <- var len ->  <- next row ...
-/// +-----------+-----------+-------------+
-/// | StringVal | StringVal |             | ...
-/// +-----------+-----------+-------------+
-/// | len: 5    | len: 5    |  hello      | ...
-/// | ptr: 0x.. | ptr: 0x.. |             | ...
-/// +-----------+-----------+-------------+
-///  <---12b---> <---12b---> <-----5b---->
-///
-/// The behavior of reads and writes is as follows:
-/// Read:
-///   1. Delete on read (delete_on_read_): Blocks are deleted as we go through the stream.
-///   The data returned by the tuple stream is valid until the next read call so the
-///   caller does not need to copy if it is streaming.
-///   2. Unpinned: Blocks remain in blocks_ and are unpinned after reading.
-///   3. Pinned: Blocks remain in blocks_ and are left pinned after reading. If the next
-///   block in the stream cannot be pinned, the read call will fail and the caller needs
-///   to free memory from the underlying block mgr.
-/// Write:
-///   1. Unpinned: Unpin blocks as they fill up. This means only a single (i.e. the
-///   current) block needs to be in memory regardless of the input size (if read_write is
-///   true, then two blocks need to be in memory).
-///   2. Pinned: Blocks are left pinned. If we run out of blocks, the write will fail and
-///   the caller needs to free memory from the underlying block mgr.
-///
-/// Memory lifetime of rows read from stream:
-/// If the stream is pinned, it is valid to access any tuples returned via
-/// GetNext() or GetRows() until the stream is unpinned. If the stream is unpinned, and
-/// the batch returned from GetNext() has the needs_deep_copy flag set, any tuple memory
-/// returned so far from the stream may be freed on the next call to GetNext().
-///
-/// Manual construction of rows with AllocateRow():
-/// The BufferedTupleStream supports allocation of uninitialized rows with AllocateRow().
-/// The caller of AllocateRow() is responsible for writing the row with exactly the
-/// layout described above.
-///
-/// If a caller constructs a tuple in this way, the caller can set the pointers and they
-/// will not be modified until the stream is read via GetNext() or GetRows().
-///
-/// TODO: we need to be able to do read ahead in the BufferedBlockMgr. It currently
-/// only has PinAllBlocks() which is blocking. We need a non-blocking version of this or
-/// some way to indicate a block will need to be pinned soon.
-/// TODO: see if this can be merged with Sorter::Run. The key difference is that this
-/// does not need to return rows in the order they were added, which allows it to be
-/// simpler.
-/// TODO: we could compact the small buffers when we need to spill but they use very
-/// little memory so this might not be very useful.
-/// TODO: improvements:
-///   - It would be good to allocate the null indicators at the end of each block and grow
-///     this array as new rows are inserted in the block. If we do so, then there will be
-///     fewer gaps in case of many rows with NULL tuples.
-///   - We will want to multithread this. Add a AddBlock() call so the synchronization
-///     happens at the block level. This is a natural extension.
-///   - Instead of allocating all blocks from the block_mgr, allocate some blocks that
-///     are much smaller (e.g. 16K and doubling up to the block size). This way, very
-///     small streams (a common case) will use very little memory. These small blocks
-///     are always in memory since spilling them frees up negligible memory.
-///   - Return row batches in GetNext() instead of filling one in
-class BufferedTupleStream {
- public:
-  /// Ordinal index into the stream to retrieve a row in O(1) time. This index can
-  /// only be used if the stream is pinned.
-  /// To read a row from a stream we need three pieces of information that we squeeze in
-  /// 64 bits:
-  ///  - The index of the block. The block id is stored in 16 bits. We can have up to
-  ///    64K blocks per tuple stream. With 8MB blocks that is 512GB per stream.
-  ///  - The offset of the start of the row (data) within the block. Since blocks are 8MB
-  ///    we use 24 bits for the offsets. (In theory we could use 23 bits.)
-  ///  - The idx of the row in the block. We need this for retrieving the null indicators.
-  ///    We use 24 bits for this index as well.
-  struct RowIdx {
-    static const uint64_t BLOCK_MASK  = 0xFFFF;
-    static const uint64_t BLOCK_SHIFT = 0;
-    static const uint64_t OFFSET_MASK  = 0xFFFFFF0000;
-    static const uint64_t OFFSET_SHIFT = 16;
-    static const uint64_t IDX_MASK  = 0xFFFFFF0000000000;
-    static const uint64_t IDX_SHIFT = 40;
-
-    uint64_t block() const {
-      return (data & BLOCK_MASK);
-    }
-
-    uint64_t offset() const {
-      return (data & OFFSET_MASK) >> OFFSET_SHIFT;
-    }
-
-    uint64_t idx() const {
-      return (data & IDX_MASK) >> IDX_SHIFT;
-    }
-
-    uint64_t set(uint64_t block, uint64_t offset, uint64_t idx) {
-      DCHECK_LE(block, BLOCK_MASK)
-          << "Cannot have more than 2^16 = 64K blocks in a tuple stream.";
-      DCHECK_LE(offset, OFFSET_MASK >> OFFSET_SHIFT)
-          << "Cannot have blocks larger than 2^24 = 16MB";
-      DCHECK_LE(idx, IDX_MASK >> IDX_SHIFT)
-          << "Cannot have more than 2^24 = 16M rows in a block.";
-      data = block | (offset << OFFSET_SHIFT) | (idx << IDX_SHIFT);
-      return data;
-    }
-
-    std::string DebugString() const;
-
-    uint64_t data;
-  };
-
-  /// row_desc: description of rows stored in the stream. This is the desc for rows
-  /// that are added and the rows being returned.
-  /// block_mgr: Underlying block mgr that owns the data blocks.
-  /// use_initial_small_buffers: If true, the initial N buffers allocated for the
-  /// tuple stream use smaller than IO-sized buffers.
-  /// read_write: Stream allows interchanging read and write operations. Requires at
-  /// least two blocks may be pinned.
-  /// ext_varlen_slots: set of varlen slots with data stored externally to the stream
-  BufferedTupleStream(RuntimeState* state, const RowDescriptor* row_desc,
-      BufferedBlockMgr* block_mgr, BufferedBlockMgr::Client* client,
-      bool use_initial_small_buffers, bool read_write,
-      const std::set<SlotId>& ext_varlen_slots = std::set<SlotId>());
-
-  ~BufferedTupleStream();
-
-  /// Initializes the tuple stream object on behalf of node 'node_id'. Must be called
-  /// once before any of the other APIs.
-  /// If 'pinned' is true, the tuple stream starts off pinned, otherwise it is unpinned.
-  /// If 'profile' is non-NULL, counters are created.
-  /// 'node_id' is only used for error reporting.
-  Status Init(int node_id, RuntimeProfile* profile, bool pinned);
-
-  /// Prepares the stream for writing by attempting to allocate a write block.
-  /// Called after Init() and before the first AddRow() call.
-  /// 'got_buffer': set to true if the first write block was successfully pinned, or
-  ///     false if the block could not be pinned and no error was encountered. Undefined
-  ///     if an error status is returned.
-  Status PrepareForWrite(bool* got_buffer);
-
-  /// Must be called for streams using small buffers to switch to IO-sized buffers.
-  /// If it fails to get a buffer (i.e. the switch fails) it resets the use_small_buffers_
-  /// back to false.
-  /// TODO: IMPALA-3200: remove this when small buffers are removed.
-  Status SwitchToIoBuffers(bool* got_buffer);
-
-  /// Adds a single row to the stream. Returns true if the append succeeded, returns false
-  /// and sets 'status' to OK if appending failed but can be retried or returns false and
-  /// sets 'status' to an error if an error occurred.
-  /// BufferedTupleStream will do a deep copy of the memory in the row. After AddRow()
-  /// returns an error, it should not be called again. If appending failed without an
-  /// error and the stream is using small buffers, it is valid to call
-  /// SwitchToIoBuffers() then AddRow() again.
-  bool AddRow(TupleRow* row, Status* status) noexcept;
-
-  /// Allocates space to store a row of with fixed length 'fixed_size' and variable
-  /// length data 'varlen_size'. If successful, returns the pointer where fixed length
-  /// data should be stored and assigns 'varlen_data' to where var-len data should
-  /// be stored. Returns NULL if there is not enough memory or an error occurred.
-  /// Sets *status if an error occurred. The returned memory is guaranteed to all
-  /// be allocated in the same block. AllocateRow does not currently support nullable
-  /// tuples.
-  uint8_t* AllocateRow(int fixed_size, int varlen_size, uint8_t** varlen_data,
-      Status* status);
-
-  /// Populates 'row' with the row at 'idx'. The stream must be pinned. The row must have
-  /// been allocated with the stream's row desc.
-  void GetTupleRow(const RowIdx& idx, TupleRow* row) const;
-
-  /// Prepares the stream for reading. If read_write_, this can be called at any time to
-  /// begin reading. Otherwise this must be called after the last AddRow() and
-  /// before GetNext().
-  /// delete_on_read: Blocks are deleted after they are read.
-  /// got_buffer: set to true if the first read block was successfully pinned, or
-  ///     false if the block could not be pinned and no error was encountered.
-  Status PrepareForRead(bool delete_on_read, bool* got_buffer);
-
-  /// Pins all blocks in this stream and switches to pinned mode.
-  /// If there is not enough memory, *pinned is set to false and the stream is unmodified.
-  /// If already_reserved is true, the caller has already made a reservation on
-  /// block_mgr_client_ to pin the stream.
-  Status PinStream(bool already_reserved, bool* pinned);
-
-  /// Modes for UnpinStream().
-  enum UnpinMode {
-    /// All blocks in the stream are unpinned and the read/write positions in the stream
-    /// are reset. No more rows can be written to the stream after this. The stream can
-    /// be re-read from the beginning by calling PrepareForRead().
-    UNPIN_ALL,
-    /// All blocks are unpinned aside from the current read and write blocks (if any),
-    /// which is left in the same state. The unpinned stream can continue being read
-    /// or written from the current read or write positions.
-    UNPIN_ALL_EXCEPT_CURRENT,
-  };
-
-  /// Unpins stream with the given 'mode' as described above.
-  Status UnpinStream(UnpinMode mode);
-
-  /// Get the next batch of output rows. Memory is still owned by the BufferedTupleStream
-  /// and must be copied out by the caller.
-  Status GetNext(RowBatch* batch, bool* eos);
-
-  /// Same as above, but also populate 'indices' with the index of each returned row.
-  Status GetNext(RowBatch* batch, bool* eos, std::vector<RowIdx>* indices);
-
-  /// Returns all the rows in the stream in batch. This pins the entire stream in the
-  /// process.
-  /// *got_rows is false if the stream could not be pinned.
-  Status GetRows(boost::scoped_ptr<RowBatch>* batch, bool* got_rows);
-
-  /// Must be called once at the end to cleanup all resources. If 'batch' is non-NULL,
-  /// attaches any pinned blocks to the batch and deletes unpinned blocks. Otherwise
-  /// deletes all blocks. Does nothing if the stream was already closed. The 'flush'
-  /// mode is forwarded to RowBatch::AddBlock() when attaching blocks.
-  void Close(RowBatch* batch, RowBatch::FlushMode flush);
-
-  /// Number of rows in the stream.
-  int64_t num_rows() const { return num_rows_; }
-
-  /// Number of rows returned via GetNext().
-  int64_t rows_returned() const { return rows_returned_; }
-
-  /// Returns the byte size necessary to store the entire stream in memory.
-  int64_t byte_size() const { return total_byte_size_; }
-
-  /// Returns the byte size of the stream that is currently pinned in memory.
-  /// If ignore_current is true, the write_block_ memory is not included.
-  int64_t bytes_in_mem(bool ignore_current) const;
-
-  bool is_closed() const { return closed_; }
-  bool is_pinned() const { return pinned_; }
-  int blocks_pinned() const { return num_pinned_; }
-  int blocks_unpinned() const { return blocks_.size() - num_pinned_ - num_small_blocks_; }
-  bool has_read_block() const { return read_block_ != blocks_.end(); }
-  bool has_write_block() const { return write_block_ != NULL; }
-  bool using_small_buffers() const { return use_small_buffers_; }
-
-  /// Returns true if the row consumes any memory. If false, the stream only needs to
-  /// store the count of rows.
-  bool RowConsumesMemory() const {
-    return fixed_tuple_row_size_ > 0 || has_nullable_tuple_;
-  }
-
-  std::string DebugString() const;
-
- private:
-  friend class ArrayTupleStreamTest_TestArrayDeepCopy_Test;
-  friend class ArrayTupleStreamTest_TestComputeRowSize_Test;
-  friend class MultiNullableTupleStreamTest_TestComputeRowSize_Test;
-  friend class SimpleTupleStreamTest_TestGetRowsOverflow_Test;
-
-  /// Runtime state instance used to check for cancellation. Not owned.
-  RuntimeState* const state_;
-
-  /// Description of rows stored in the stream.
-  const RowDescriptor* desc_;
-
-  /// Sum of the fixed length portion of all the tuples in desc_.
-  int fixed_tuple_row_size_;
-
-  /// The size of the fixed length portion for each tuple in the row.
-  std::vector<int> fixed_tuple_sizes_;
-
-  /// Max size (in bytes) of null indicators bitmap in the current read and write
-  /// blocks. If 0, it means that there is no need to store null indicators for this
-  /// RowDesc. We calculate this value based on the block's size and the
-  /// fixed_tuple_row_size_. When not 0, this value is also an upper bound for the number
-  /// of (rows * tuples_per_row) in this block.
-  int read_block_null_indicators_size_;
-  int write_block_null_indicators_size_;
-
-  /// Size (in bytes) of the null indicators bitmap reserved in a block of maximum
-  /// size (i.e. IO block size). 0 if no tuple is nullable.
-  int max_null_indicators_size_;
-
-  /// Vectors of all the strings slots that have their varlen data stored in stream
-  /// grouped by tuple_idx.
-  std::vector<std::pair<int, std::vector<SlotDescriptor*>>> inlined_string_slots_;
-
-  /// Vectors of all the collection slots that have their varlen data stored in the
-  /// stream, grouped by tuple_idx.
-  std::vector<std::pair<int, std::vector<SlotDescriptor*>>> inlined_coll_slots_;
-
-  /// Block manager and client used to allocate, pin and release blocks. Not owned.
-  BufferedBlockMgr* block_mgr_;
-  BufferedBlockMgr::Client* block_mgr_client_;
-
-  /// List of blocks in the stream.
-  std::list<BufferedBlockMgr::Block*> blocks_;
-
-  /// Total size of blocks_, including small blocks.
-  int64_t total_byte_size_;
-
-  /// Iterator pointing to the current block for read. Equal to list.end() until
-  /// PrepareForRead() is called.
-  std::list<BufferedBlockMgr::Block*>::iterator read_block_;
-
-  /// For each block in the stream, the buffer of the start of the block. This is only
-  /// valid when the stream is pinned, giving random access to data in the stream.
-  /// This is not maintained for delete_on_read_.
-  std::vector<uint8_t*> block_start_idx_;
-
-  /// Current idx of the tuple read from the read_block_ buffer.
-  uint32_t read_tuple_idx_;
-
-  /// Current offset in read_block_ of the end of the last data read.
-  uint8_t* read_ptr_;
-
-  /// Pointer to one byte past the end of read_block_.
-  uint8_t* read_end_ptr_;
-
-  /// Current idx of the tuple written at the write_block_ buffer.
-  uint32_t write_tuple_idx_;
-
-  /// Pointer into write_block_ of the end of the last data written.
-  uint8_t*  write_ptr_;
-
-  /// Pointer to one byte past the end of write_block_.
-  uint8_t* write_end_ptr_;
-
-  /// Number of rows returned to the caller from GetNext().
-  int64_t rows_returned_;
-
-  /// The block index of the current read block in blocks_.
-  int read_block_idx_;
-
-  /// The current block for writing. NULL if there is no available block to write to.
-  /// The entire write_block_ buffer is marked as allocated, so any data written into
-  /// the buffer will be spilled without having to allocate additional space.
-  BufferedBlockMgr::Block* write_block_;
-
-  /// Number of pinned blocks in blocks_, stored to avoid iterating over the list
-  /// to compute bytes_in_mem and bytes_unpinned.
-  /// This does not include small blocks.
-  int num_pinned_;
-
-  /// The total number of small blocks in blocks_;
-  int num_small_blocks_;
-
-  /// Number of rows stored in the stream.
-  int64_t num_rows_;
-
-  /// Counters added by this object to the parent runtime profile.
-  RuntimeProfile::Counter* pin_timer_;
-  RuntimeProfile::Counter* unpin_timer_;
-  RuntimeProfile::Counter* get_new_block_timer_;
-
-  /// If true, read and write operations may be interleaved. Otherwise all calls
-  /// to AddRow() must occur before calling PrepareForRead() and subsequent calls to
-  /// GetNext().
-  const bool read_write_;
-
-  /// Whether any tuple in the rows is nullable.
-  const bool has_nullable_tuple_;
-
-  /// If true, this stream is still using small buffers.
-  bool use_small_buffers_;
-
-  /// If true, blocks are deleted after they are read.
-  bool delete_on_read_;
-
-  bool closed_; // Used for debugging.
-
-  /// If true, this stream has been explicitly pinned by the caller. This changes the
-  /// memory management of the stream. The blocks are not unpinned until the caller calls
-  /// UnpinAllBlocks(). If false, only the write_block_ and/or read_block_ are pinned
-  /// (both are if read_write_ is true).
-  bool pinned_;
-
-  /// The slow path for AddRow() that is called if there is not sufficient space in
-  /// the current block.
-  bool AddRowSlow(TupleRow* row, Status* status) noexcept;
-
-  /// Copies 'row' into write_block_. Returns false if there is not enough space in
-  /// 'write_block_'. After returning false, write_ptr_ may be left pointing to the
-  /// partially-written row, and no more data can be written to write_block_.
-  template <bool HAS_NULLABLE_TUPLE>
-  bool DeepCopyInternal(TupleRow* row) noexcept;
-
-  /// Helper function to copy strings in string_slots from tuple into write_block_.
-  /// Updates write_ptr_ to the end of the string data added. Returns false if the data
-  /// does not fit in the current write block. After returning false, write_ptr_ is left
-  /// pointing to the partially-written row, and no more data can be written to
-  /// write_block_.
-  bool CopyStrings(const Tuple* tuple, const std::vector<SlotDescriptor*>& string_slots);
-
-  /// Helper function to deep copy collections in collection_slots from tuple into
-  /// write_block_. Updates write_ptr_ to the end of the collection data added. Returns
-  /// false if the data does not fit in the current write block.. After returning false,
-  /// write_ptr_ is left pointing to the partially-written row, and no more data can be
-  /// written to write_block_.
-  bool CopyCollections(const Tuple* tuple,
-      const std::vector<SlotDescriptor*>& collection_slots);
-
-  /// Wrapper of the templated DeepCopyInternal() function.
-  bool DeepCopy(TupleRow* row) noexcept;
-
-  /// Gets a new block of 'block_len' bytes from the block_mgr_, updating write_block_,
-  /// write_tuple_idx_, write_ptr_ and write_end_ptr_. 'null_indicators_size' is the
-  /// number of bytes that will be reserved in the block for the null indicators bitmap.
-  /// *got_block is set to true if a block was successfully acquired. Null indicators
-  /// (if any) will also be reserved and initialized. If there are no blocks available,
-  /// *got_block is set to false and write_block_ is unchanged.
-  Status NewWriteBlock(
-      int64_t block_len, int64_t null_indicators_size, bool* got_block) noexcept;
-
-  /// A wrapper around NewWriteBlock(). 'row_size' is the size of the tuple row to be
-  /// appended to this block. This function determines the block size required in order
-  /// to fit the row and null indicators.
-  Status NewWriteBlockForRow(int64_t row_size, bool* got_block) noexcept;
-
-  /// Reads the next block from the block_mgr_. This blocks if necessary.
-  /// Updates read_block_, read_ptr_, read_tuple_idx_ and read_end_ptr_.
-  Status NextReadBlock();
-
-  /// Returns the total additional bytes that this row will consume in write_block_ if
-  /// appended to the block. This includes the fixed length part of the row and the
-  /// data for inlined_string_slots_ and inlined_coll_slots_.
-  int64_t ComputeRowSize(TupleRow* row) const noexcept;
-
-  /// Unpins block if it is an IO-sized block and updates tracking stats.
-  Status UnpinBlock(BufferedBlockMgr::Block* block);
-
-  /// Templated GetNext implementations.
-  template <bool FILL_INDICES>
-  Status GetNextInternal(RowBatch* batch, bool* eos, std::vector<RowIdx>* indices);
-  template <bool FILL_INDICES, bool HAS_NULLABLE_TUPLE>
-  Status GetNextInternal(RowBatch* batch, bool* eos, std::vector<RowIdx>* indices);
-
-  /// Helper function for GetNextInternal(). For each string slot in string_slots,
-  /// update StringValue's ptr field to point to the corresponding string data stored
-  /// inline in the stream (at the current value of read_ptr_) advance read_ptr_ by the
-  /// StringValue's length field.
-  void FixUpStringsForRead(const vector<SlotDescriptor*>& string_slots, Tuple* tuple);
-
-  /// Helper function for GetNextInternal(). For each collection slot in collection_slots,
-  /// recursively update any pointers in the CollectionValue to point to the corresponding
-  /// var len data stored inline in the stream, advancing read_ptr_ as data is read.
-  /// Assumes that the collection was serialized to the stream in DeepCopy()'s format.
-  void FixUpCollectionsForRead(const vector<SlotDescriptor*>& collection_slots,
-      Tuple* tuple);
-
-  /// Computes the number of bytes needed for null indicators for a block of 'block_size'.
-  /// Return 0 if no tuple is nullable. Return -1 if a single row of fixed-size tuples
-  /// plus its null indicator (if any) cannot fit in the block.
-  int ComputeNumNullIndicatorBytes(int block_size) const;
-
-  uint32_t read_block_bytes_remaining() const {
-    DCHECK_GE(read_end_ptr_, read_ptr_);
-    DCHECK_LE(read_end_ptr_ - read_ptr_, (*read_block_)->buffer_len());
-    return read_end_ptr_ - read_ptr_;
-  }
-
-  uint32_t write_block_bytes_remaining() const {
-    DCHECK_GE(write_end_ptr_, write_ptr_);
-    DCHECK_LE(write_end_ptr_ - write_ptr_, write_block_->buffer_len());
-    return write_end_ptr_ - write_ptr_;
-  }
-
-};
-
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-tuple-stream.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream.inline.h b/be/src/runtime/buffered-tuple-stream.inline.h
deleted file mode 100644
index ba6bb8c..0000000
--- a/be/src/runtime/buffered-tuple-stream.inline.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_TUPLE_BUFFERED_STREAM_INLINE_H
-#define IMPALA_RUNTIME_TUPLE_BUFFERED_STREAM_INLINE_H
-
-#include "runtime/buffered-tuple-stream.h"
-
-#include "runtime/descriptors.h"
-#include "runtime/tuple-row.h"
-
-namespace impala {
-
-inline bool BufferedTupleStream::AddRow(TupleRow* row, Status* status) noexcept {
-  DCHECK(!closed_);
-  if (LIKELY(DeepCopy(row))) return true;
-  return AddRowSlow(row, status);
-}
-
-inline uint8_t* BufferedTupleStream::AllocateRow(int fixed_size, int varlen_size,
-    uint8_t** varlen_data, Status* status) {
-  DCHECK(!closed_);
-  DCHECK(!has_nullable_tuple_) << "AllocateRow does not support nullable tuples";
-  const int total_size = fixed_size + varlen_size;
-  if (UNLIKELY(write_block_ == NULL || write_block_bytes_remaining() < total_size)) {
-    bool got_block;
-    *status = NewWriteBlockForRow(total_size, &got_block);
-    if (!status->ok() || !got_block) return NULL;
-  }
-  DCHECK(write_block_ != NULL);
-  DCHECK(write_block_->is_pinned());
-  DCHECK_GE(write_block_bytes_remaining(), total_size);
-  ++num_rows_;
-  write_block_->AddRow();
-
-  uint8_t* fixed_data = write_ptr_;
-  write_ptr_ += fixed_size;
-  *varlen_data = write_ptr_;
-  write_ptr_ += varlen_size;
-  return fixed_data;
-}
-
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/bufferpool/buffer-pool.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-pool.cc b/be/src/runtime/bufferpool/buffer-pool.cc
index 9b16112..83f2e6a 100644
--- a/be/src/runtime/bufferpool/buffer-pool.cc
+++ b/be/src/runtime/bufferpool/buffer-pool.cc
@@ -308,6 +308,16 @@ int64_t BufferPool::ClientHandle::GetUnusedReservation() const {
   return impl_->reservation()->GetUnusedReservation();
 }
 
+bool BufferPool::ClientHandle::TransferReservationFrom(
+    ReservationTracker* src, int64_t bytes) {
+  return src->TransferReservationTo(impl_->reservation(), bytes);
+}
+
+bool BufferPool::ClientHandle::TransferReservationTo(
+    ReservationTracker* dst, int64_t bytes) {
+  return impl_->reservation()->TransferReservationTo(dst, bytes);
+}
+
 void BufferPool::ClientHandle::SaveReservation(SubReservation* dst, int64_t bytes) {
   DCHECK_EQ(dst->tracker_->parent(), impl_->reservation());
   bool success = impl_->reservation()->TransferReservationTo(dst->tracker_.get(), bytes);
@@ -355,7 +365,7 @@ BufferPool::Client::Client(BufferPool* pool, TmpFileMgr::FileGroup* file_group,
   RuntimeProfile* child_profile = profile->CreateChild("Buffer pool", true, true);
   reservation_.InitChildTracker(
       child_profile, parent_reservation, mem_tracker, reservation_limit);
-  counters_.alloc_time = ADD_TIMER(profile, "AllocTime");
+  counters_.alloc_time = ADD_TIMER(child_profile, "AllocTime");
   counters_.cumulative_allocations =
       ADD_COUNTER(child_profile, "CumulativeAllocations", TUnit::UNIT);
   counters_.cumulative_bytes_alloced =

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/bufferpool/buffer-pool.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-pool.h b/be/src/runtime/bufferpool/buffer-pool.h
index f2ff99b..e3df8df 100644
--- a/be/src/runtime/bufferpool/buffer-pool.h
+++ b/be/src/runtime/bufferpool/buffer-pool.h
@@ -338,6 +338,14 @@ class BufferPool::ClientHandle {
   int64_t GetUsedReservation() const;
   int64_t GetUnusedReservation() const;
 
+  /// Try to transfer 'bytes' of reservation from 'src' to this client using
+  /// ReservationTracker::TransferReservationTo().
+  bool TransferReservationFrom(ReservationTracker* src, int64_t bytes);
+
+  /// Transfer 'bytes' of reservation from this client to 'dst' using
+  /// ReservationTracker::TransferReservationTo().
+  bool TransferReservationTo(ReservationTracker* dst, int64_t bytes);
+
   bool is_registered() const { return impl_ != NULL; }
 
   std::string DebugString() const;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/bufferpool/reservation-tracker.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/reservation-tracker.h b/be/src/runtime/bufferpool/reservation-tracker.h
index 4d525c0..80084bc 100644
--- a/be/src/runtime/bufferpool/reservation-tracker.h
+++ b/be/src/runtime/bufferpool/reservation-tracker.h
@@ -127,6 +127,10 @@ class ReservationTracker {
   /// Returns true if the reservation increase was successful or not necessary.
   bool IncreaseReservationToFit(int64_t bytes) WARN_UNUSED_RESULT;
 
+  /// Decrease reservation by 'bytes' on this tracker and all ancestors. This tracker's
+  /// reservation must be at least 'bytes' before calling this method.
+  void DecreaseReservation(int64_t bytes) { DecreaseReservation(bytes, false); }
+
   /// Transfer reservation from this tracker to 'other'. Both trackers must be in the
   /// same query subtree of the hierarchy. One tracker can be the ancestor of the other,
   /// or they can share a common ancestor. The subtree root must be at the query level

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr.cc b/be/src/runtime/disk-io-mgr.cc
index 3393ab3..55042d8 100644
--- a/be/src/runtime/disk-io-mgr.cc
+++ b/be/src/runtime/disk-io-mgr.cc
@@ -83,7 +83,7 @@ DEFINE_int32(num_adls_io_threads, 16, "Number of ADLS I/O threads");
 // not introduce seeks.  The literature seems to agree that with 8 MB reads, random
 // io and sequential io perform similarly.
 DEFINE_int32(read_size, 8 * 1024 * 1024, "Read Size (in bytes)");
-DEFINE_int32(min_buffer_size, 1024, "The minimum read buffer size (in bytes)");
+DECLARE_int64(min_buffer_size);
 
 // With 1024B through 8MB buffers, this is up to ~2GB of buffers.
 DEFINE_int32(max_free_io_buffers, 128,
@@ -937,9 +937,8 @@ void DiskIoMgr::HandleWriteFinished(
   int disk_id = write_range->disk_id_;
 
   // Execute the callback before decrementing the thread count. Otherwise CancelContext()
-  // that waits for the disk ref count to be 0 will return, creating a race, e.g.
-  // between BufferedBlockMgr::WriteComplete() and BufferedBlockMgr::~BufferedBlockMgr().
-  // See IMPALA-1890.
+  // that waits for the disk ref count to be 0 will return, creating a race, e.g. see
+  // IMPALA-1890.
   // The status of the write does not affect the status of the writer context.
   write_range->callback_(write_status);
   {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/exec-env.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/exec-env.cc b/be/src/runtime/exec-env.cc
index 960e3c9..f2ee6f0 100644
--- a/be/src/runtime/exec-env.cc
+++ b/be/src/runtime/exec-env.cc
@@ -75,6 +75,8 @@ DEFINE_int32(state_store_subscriber_port, 23000,
 DEFINE_int32(num_hdfs_worker_threads, 16,
     "(Advanced) The number of threads in the global HDFS operation pool");
 DEFINE_bool(disable_admission_control, false, "Disables admission control.");
+DEFINE_int64(min_buffer_size, 64 * 1024,
+    "(Advanced) The minimum buffer size to use in the buffer pool");
 
 DECLARE_int32(state_store_port);
 DECLARE_int32(num_threads_per_core);
@@ -204,13 +206,14 @@ Status ExecEnv::StartServices() {
   // memory limit either based on the available physical memory, or if overcommitting
   // is turned off, we use the memory commit limit from /proc/meminfo (see
   // IMPALA-1690).
-  // --mem_limit="" means no memory limit
+  // --mem_limit="" means no memory limit. TODO: IMPALA-5652: deprecate this mode
   int64_t bytes_limit = 0;
   bool is_percent;
+  int64_t system_mem;
   if (MemInfo::vm_overcommit() == 2 &&
       MemInfo::commit_limit() < MemInfo::physical_mem()) {
-    bytes_limit = ParseUtil::ParseMemSpec(FLAGS_mem_limit, &is_percent,
-        MemInfo::commit_limit());
+    system_mem = MemInfo::commit_limit();
+    bytes_limit = ParseUtil::ParseMemSpec(FLAGS_mem_limit, &is_percent, system_mem);
     // There might be the case of misconfiguration, when on a system swap is disabled
     // and overcommitting is turned off the actual usable memory is less than the
     // available physical memory.
@@ -225,14 +228,23 @@ Status ExecEnv::StartServices() {
                  << "/proc/sys/vm/overcommit_memory and "
                  << "/proc/sys/vm/overcommit_ratio.";
   } else {
-    bytes_limit = ParseUtil::ParseMemSpec(FLAGS_mem_limit, &is_percent,
-        MemInfo::physical_mem());
+    system_mem = MemInfo::physical_mem();
+    bytes_limit = ParseUtil::ParseMemSpec(FLAGS_mem_limit, &is_percent, system_mem);
   }
-
+  // ParseMemSpec returns 0 to mean unlimited. TODO: IMPALA-5652: deprecate this mode.
+  bool no_process_mem_limit = bytes_limit == 0;
   if (bytes_limit < 0) {
     return Status("Failed to parse mem limit from '" + FLAGS_mem_limit + "'.");
   }
 
+  if (!BitUtil::IsPowerOf2(FLAGS_min_buffer_size)) {
+    return Status(Substitute(
+        "--min_buffer_size must be a power-of-two: $0", FLAGS_min_buffer_size));
+  }
+  int64_t buffer_pool_capacity = BitUtil::RoundDown(
+      no_process_mem_limit ? system_mem : bytes_limit * 4 / 5, FLAGS_min_buffer_size);
+  InitBufferPool(FLAGS_min_buffer_size, buffer_pool_capacity);
+
   metrics_->Init(enable_webserver_ ? webserver_.get() : nullptr);
   impalad_client_cache_->InitMetrics(metrics_.get(), "impala-server.backends");
   catalogd_client_cache_->InitMetrics(metrics_.get(), "catalog.server");
@@ -240,8 +252,8 @@ Status ExecEnv::StartServices() {
       metrics_.get(), true, buffer_reservation_.get(), buffer_pool_.get()));
 
   // Limit of -1 means no memory limit.
-  mem_tracker_.reset(new MemTracker(
-      AggregateMemoryMetrics::TOTAL_USED, bytes_limit > 0 ? bytes_limit : -1, "Process"));
+  mem_tracker_.reset(new MemTracker(AggregateMemoryMetrics::TOTAL_USED,
+      no_process_mem_limit ? -1 : bytes_limit, "Process"));
   if (buffer_pool_ != nullptr) {
     // Add BufferPool MemTrackers for cached memory that is not tracked against queries
     // but is included in process memory consumption.
@@ -270,6 +282,8 @@ Status ExecEnv::StartServices() {
   }
   LOG(INFO) << "Using global memory limit: "
             << PrettyPrinter::Print(bytes_limit, TUnit::BYTES);
+  LOG(INFO) << "Buffer pool capacity: "
+            << PrettyPrinter::Print(buffer_pool_capacity, TUnit::BYTES);
 
   RETURN_IF_ERROR(disk_io_mgr_->Init(mem_tracker_.get()));
 
@@ -310,9 +324,8 @@ Status ExecEnv::StartServices() {
   return Status::OK();
 }
 
-void ExecEnv::InitBufferPool(int64_t min_page_size, int64_t capacity) {
-  DCHECK(buffer_pool_ == nullptr);
-  buffer_pool_.reset(new BufferPool(min_page_size, capacity));
+void ExecEnv::InitBufferPool(int64_t min_buffer_size, int64_t capacity) {
+  buffer_pool_.reset(new BufferPool(min_buffer_size, capacity));
   buffer_reservation_.reset(new ReservationTracker());
   buffer_reservation_->InitRootTracker(nullptr, capacity);
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/exec-env.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/exec-env.h b/be/src/runtime/exec-env.h
index 4674072..63d2e0b 100644
--- a/be/src/runtime/exec-env.h
+++ b/be/src/runtime/exec-env.h
@@ -159,8 +159,8 @@ class ExecEnv {
   boost::scoped_ptr<QueryExecMgr> query_exec_mgr_;
 
   /// Query-wide buffer pool and the root reservation tracker for the pool. The
-  /// reservation limit is equal to the maximum capacity of the pool.
-  /// For now this is only used by backend tests that create them via InitBufferPool();
+  /// reservation limit is equal to the maximum capacity of the pool. Created in
+  /// InitBufferPool().
   boost::scoped_ptr<ReservationTracker> buffer_reservation_;
   boost::scoped_ptr<BufferPool> buffer_pool_;
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/fragment-instance-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/fragment-instance-state.cc b/be/src/runtime/fragment-instance-state.cc
index 2385eab..07b3f1c 100644
--- a/be/src/runtime/fragment-instance-state.cc
+++ b/be/src/runtime/fragment-instance-state.cc
@@ -126,8 +126,6 @@ Status FragmentInstanceState::Prepare() {
   profile()->AddChild(timings_profile_);
   SCOPED_TIMER(ADD_TIMER(timings_profile_, PREPARE_TIMER_NAME));
 
-  // TODO: move this into a RuntimeState::Init()
-  RETURN_IF_ERROR(runtime_state_->CreateBlockMgr());
   runtime_state_->InitFilterBank();
 
   // Reserve one main thread from the pool

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/initial-reservations.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/initial-reservations.cc b/be/src/runtime/initial-reservations.cc
new file mode 100644
index 0000000..4987ec3
--- /dev/null
+++ b/be/src/runtime/initial-reservations.cc
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/initial-reservations.h"
+
+#include <limits>
+
+#include <boost/thread/mutex.hpp>
+#include <gflags/gflags.h>
+
+#include "common/logging.h"
+#include "common/object-pool.h"
+#include "runtime/exec-env.h"
+#include "runtime/mem-tracker.h"
+#include "util/debug-util.h"
+
+#include "common/names.h"
+
+using std::numeric_limits;
+
+DECLARE_int32(be_port);
+DECLARE_string(hostname);
+
+namespace impala {
+
+InitialReservations::InitialReservations(ObjectPool* obj_pool,
+    ReservationTracker* query_reservation, MemTracker* query_mem_tracker,
+    int64_t initial_reservation_total_claims)
+  : remaining_initial_reservation_claims_(initial_reservation_total_claims) {
+  MemTracker* initial_reservation_tracker = obj_pool->Add(
+      new MemTracker(-1, "Unclaimed reservations", query_mem_tracker, false));
+  initial_reservations_.InitChildTracker(nullptr, query_reservation,
+      initial_reservation_tracker, numeric_limits<int64_t>::max());
+}
+
+Status InitialReservations::Init(
+    const TUniqueId& query_id, int64_t query_min_reservation) {
+  DCHECK_EQ(0, initial_reservations_.GetReservation()) << "Already inited";
+  if (!initial_reservations_.IncreaseReservation(query_min_reservation)) {
+    return Status(TErrorCode::MINIMUM_RESERVATION_UNAVAILABLE,
+        PrettyPrinter::Print(query_min_reservation, TUnit::BYTES), FLAGS_hostname,
+        FLAGS_be_port, PrintId(query_id),
+        ExecEnv::GetInstance()->process_mem_tracker()->LogUsage());
+  }
+  VLOG_QUERY << "Successfully claimed initial reservations ("
+            << PrettyPrinter::Print(query_min_reservation, TUnit::BYTES) << ") for"
+            << " query " << PrintId(query_id);
+  return Status::OK();
+}
+
+void InitialReservations::Claim(BufferPool::ClientHandle* dst, int64_t bytes) {
+  DCHECK_GE(bytes, 0);
+  lock_guard<SpinLock> l(lock_);
+  DCHECK_LE(bytes, remaining_initial_reservation_claims_);
+  bool success = dst->TransferReservationFrom(&initial_reservations_, bytes);
+  DCHECK(success) << "Planner computation should ensure enough initial reservations";
+  remaining_initial_reservation_claims_ -= bytes;
+}
+
+void InitialReservations::Return(BufferPool::ClientHandle* src, int64_t bytes) {
+  lock_guard<SpinLock> l(lock_);
+  bool success = src->TransferReservationTo(&initial_reservations_, bytes);
+  // No limits on our tracker - no way this should fail.
+  DCHECK(success);
+  // Check to see if we can release any reservation.
+  int64_t excess_reservation =
+    initial_reservations_.GetReservation() - remaining_initial_reservation_claims_;
+  if (excess_reservation > 0) {
+    initial_reservations_.DecreaseReservation(excess_reservation);
+  }
+}
+
+void InitialReservations::ReleaseResources() {
+  initial_reservations_.Close();
+}
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/initial-reservations.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/initial-reservations.h b/be/src/runtime/initial-reservations.h
new file mode 100644
index 0000000..dfcb114
--- /dev/null
+++ b/be/src/runtime/initial-reservations.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_INITIAL_RESERVATIONS_H
+#define IMPALA_RUNTIME_INITIAL_RESERVATIONS_H
+
+#include "common/status.h"
+#include "gen-cpp/Types_types.h" // for TUniqueId
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/reservation-tracker.h"
+#include "util/spinlock.h"
+
+namespace impala {
+
+class ObjectPool;
+
+/**
+ * Manages the pool of initial reservations for different nodes in the plan tree.
+ * Each plan node and sink claims its initial reservation from here, then returns it when
+ * it is done executing. The frontend is responsible for making sure that enough initial
+ * reservation is in this pool for all of the concurrent claims.
+ */
+class InitialReservations {
+ public:
+  /// 'query_reservation' and 'query_mem_tracker' are the top-level trackers for the
+  /// query. This creates trackers for initial reservations under those.
+  /// 'initial_reservation_total_claims' is the total of initial reservations that will be
+  /// claimed over the lifetime of the query. The total bytes claimed via Claim()
+  /// cannot exceed this. Allocated objects are stored in 'obj_pool'.
+  InitialReservations(ObjectPool* obj_pool, ReservationTracker* query_reservation,
+      MemTracker* query_mem_tracker, int64_t initial_reservation_total_claims);
+
+  /// Initialize the query's pool of initial reservations by acquiring the minimum
+  /// reservation required for the query on this host. Fails if the reservation could
+  /// not be acquired, e.g. because it would exceed a pool or process limit.
+  Status Init(
+      const TUniqueId& query_id, int64_t query_min_reservation) WARN_UNUSED_RESULT;
+
+  /// Claim the initial reservation of 'bytes' for 'dst'. Assumes that the transfer will
+  /// not violate any reservation limits on 'dst'.
+  void Claim(BufferPool::ClientHandle* dst, int64_t bytes);
+
+  /// Return the initial reservation of 'bytes' from 'src'. The reservation is returned
+  /// to the pool of reservations if it may be needed to satisfy a subsequent claim or
+  /// otherwise is released.
+  void Return(BufferPool::ClientHandle* src, int64_t bytes);
+
+  /// Release any reservations held onto by this object.
+  void ReleaseResources();
+
+ private:
+  /// Protects all below members to ensure that the internal state is consistent.
+  SpinLock lock_;
+
+  /// The pool of initial reservations that Claim() returns reservations from and
+  /// Return() returns reservations to.
+  ReservationTracker initial_reservations_;
+
+  /// The total bytes of additional reservations that we expect to be claimed.
+  /// initial_reservations_.GetReservation() <= remaining_initial_reservation_claims_.
+  int64_t remaining_initial_reservation_claims_;
+};
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/query-exec-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-exec-mgr.cc b/be/src/runtime/query-exec-mgr.cc
index 6057b52..22c2826 100644
--- a/be/src/runtime/query-exec-mgr.cc
+++ b/be/src/runtime/query-exec-mgr.cc
@@ -124,6 +124,8 @@ void QueryExecMgr::StartQueryHelper(QueryState* qs) {
   }
 #endif
 
+  // decrement refcount taken in QueryState::Init();
+  qs->ReleaseInitialReservationRefcount();
   // decrement refcount taken in StartQuery()
   ReleaseQueryState(qs);
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/query-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-state.cc b/be/src/runtime/query-state.cc
index 21f35fb..64a8c5a 100644
--- a/be/src/runtime/query-state.cc
+++ b/be/src/runtime/query-state.cc
@@ -21,11 +21,12 @@
 #include <boost/thread/locks.hpp>
 
 #include "exprs/expr.h"
+#include "runtime/backend-client.h"
 #include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/bufferpool/reservation-tracker.h"
-#include "runtime/backend-client.h"
 #include "runtime/exec-env.h"
 #include "runtime/fragment-instance-state.h"
+#include "runtime/initial-reservations.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/query-exec-mgr.h"
 #include "runtime/runtime-state.h"
@@ -37,6 +38,20 @@
 
 using namespace impala;
 
+// The fraction of the query mem limit that is used for buffer reservations. Most
+// operators that accumulate memory use reservations, so the majority of memory should
+// be allocated to buffer reservations, as a heuristic.
+// TODO: this will go away once all operators use buffer reservations.
+static const double RESERVATION_MEM_FRACTION = 0.8;
+
+// The minimum amount of memory that should be left after buffer reservations.
+// The limit on reservations is computed as:
+// min(query_limit * RESERVATION_MEM_FRACTION,
+//     query_limit - RESERVATION_MEM_MIN_REMAINING)
+// TODO: this will go away once all operators use buffer reservations and we have accurate
+// minimum requirements.
+static const int64_t RESERVATION_MEM_MIN_REMAINING = 100 * 1024 * 1024;
+
 QueryState::ScopedRef::ScopedRef(const TUniqueId& query_id) {
   DCHECK(ExecEnv::GetInstance()->query_exec_mgr() != nullptr);
   query_state_ = ExecEnv::GetInstance()->query_exec_mgr()->GetQueryState(query_id);
@@ -49,8 +64,10 @@ QueryState::ScopedRef::~ScopedRef() {
 
 QueryState::QueryState(const TQueryCtx& query_ctx, const string& request_pool)
   : query_ctx_(query_ctx),
+    initial_reservation_refcnt_(0),
     refcnt_(0),
-    is_cancelled_(0) {
+    is_cancelled_(0),
+    query_spilled_(0) {
   if (query_ctx_.request_pool.empty()) {
     // fix up pool name for tests
     DCHECK(!request_pool.empty());
@@ -75,6 +92,7 @@ void QueryState::ReleaseResources() {
   // Clean up temporary files.
   if (file_group_ != nullptr) file_group_->Close();
   // Release any remaining reservation.
+  if (initial_reservations_ != nullptr) initial_reservations_->ReleaseResources();
   if (buffer_reservation_ != nullptr) buffer_reservation_->Close();
   // Avoid dangling reference from the parent of 'query_mem_tracker_'.
   if (query_mem_tracker_ != nullptr) query_mem_tracker_->UnregisterFromParent();
@@ -85,6 +103,7 @@ void QueryState::ReleaseResources() {
 QueryState::~QueryState() {
   DCHECK(released_resources_);
   DCHECK_EQ(refcnt_.Load(), 0);
+  DCHECK_EQ(initial_reservation_refcnt_.Load(), 0);
 }
 
 Status QueryState::Init(const TExecQueryFInstancesParams& rpc_params) {
@@ -99,9 +118,8 @@ Status QueryState::Init(const TExecQueryFInstancesParams& rpc_params) {
         "is over its memory limit", PrintId(query_id()));
     RETURN_IF_ERROR(process_mem_tracker->MemLimitExceeded(NULL, msg, 0));
   }
-  // Do buffer-pool-related setup if running in a backend test that explicitly created
-  // the pool.
-  if (exec_env->buffer_pool() != nullptr) RETURN_IF_ERROR(InitBufferPoolState());
+
+  RETURN_IF_ERROR(InitBufferPoolState());
 
   // don't copy query_ctx, it's large and we already did that in the c'tor
   rpc_params_.__set_coord_state_idx(rpc_params.coord_state_idx);
@@ -112,6 +130,15 @@ Status QueryState::Init(const TExecQueryFInstancesParams& rpc_params) {
   rpc_params_.fragment_instance_ctxs.swap(non_const_params.fragment_instance_ctxs);
   rpc_params_.__isset.fragment_instance_ctxs = true;
 
+  // Claim the query-wide minimum reservation. Do this last so that we don't need
+  // to handle releasing it if a later step fails.
+  initial_reservations_ = obj_pool_.Add(new InitialReservations(&obj_pool_,
+      buffer_reservation_, query_mem_tracker_,
+      query_ctx_.per_host_initial_reservation_total_claims));
+  RETURN_IF_ERROR(
+      initial_reservations_->Init(query_id(), query_ctx_.per_host_min_reservation));
+  DCHECK_EQ(0, initial_reservation_refcnt_.Load());
+  initial_reservation_refcnt_.Add(1); // Decremented in QueryExecMgr::StartQueryHelper().
   return Status::OK();
 }
 
@@ -129,19 +156,23 @@ void QueryState::InitMemTrackers() {
 
 Status QueryState::InitBufferPoolState() {
   ExecEnv* exec_env = ExecEnv::GetInstance();
-  int64_t query_mem_limit = query_mem_tracker_->limit();
-  if (query_mem_limit == -1) query_mem_limit = numeric_limits<int64_t>::max();
-
-  // TODO: IMPALA-3200: add a default upper bound to buffer pool memory derived from
-  // query_mem_limit.
-  int64_t max_reservation = numeric_limits<int64_t>::max();
-  if (query_options().__isset.max_block_mgr_memory
-      && query_options().max_block_mgr_memory > 0) {
-    max_reservation = query_options().max_block_mgr_memory;
+  int64_t mem_limit = query_mem_tracker_->lowest_limit();
+  int64_t max_reservation;
+  if (query_options().__isset.buffer_pool_limit
+      && query_options().buffer_pool_limit > 0) {
+    max_reservation = query_options().buffer_pool_limit;
+  } else if (mem_limit == -1) {
+    // No query mem limit. The process-wide reservation limit is the only limit on
+    // reservations.
+    max_reservation = numeric_limits<int64_t>::max();
+  } else {
+    DCHECK_GE(mem_limit, 0);
+    max_reservation = min<int64_t>(
+        mem_limit * RESERVATION_MEM_FRACTION, mem_limit - RESERVATION_MEM_MIN_REMAINING);
+    max_reservation = max<int64_t>(0, max_reservation);
   }
+  VLOG_QUERY << "Buffer pool limit for " << PrintId(query_id()) << ": " << max_reservation;
 
-  // TODO: IMPALA-3748: claim the query-wide minimum reservation.
-  // For now, rely on exec nodes to grab their minimum reservation during Prepare().
   buffer_reservation_ = obj_pool_.Add(new ReservationTracker);
   buffer_reservation_->InitChildTracker(
       NULL, exec_env->buffer_reservation(), query_mem_tracker_, max_reservation);
@@ -256,6 +287,7 @@ void QueryState::StartFInstances() {
   VLOG_QUERY << "StartFInstances(): query_id=" << PrintId(query_id())
       << " #instances=" << rpc_params_.fragment_instance_ctxs.size();
   DCHECK_GT(refcnt_.Load(), 0);
+  DCHECK_GT(initial_reservation_refcnt_.Load(), 0) << "Should have been taken in Init()";
 
   // set up desc tbl
   DCHECK(query_ctx().__isset.desc_tbl);
@@ -290,6 +322,7 @@ void QueryState::StartFInstances() {
 
     // start new thread to execute instance
     refcnt_.Add(1);  // decremented in ExecFInstance()
+    initial_reservation_refcnt_.Add(1);  // decremented in ExecFInstance()
     string thread_name = Substitute(
         "exec-finstance (finst:$0)", PrintId(instance_ctx.fragment_instance_id));
     Thread t(FragmentInstanceState::FINST_THREAD_GROUP_NAME, thread_name,
@@ -311,6 +344,12 @@ void QueryState::StartFInstances() {
   instances_prepared_promise_.Set(prepare_status);
 }
 
+void QueryState::ReleaseInitialReservationRefcount() {
+  int32_t new_val = initial_reservation_refcnt_.Add(-1);
+  DCHECK_GE(new_val, 0);
+  if (new_val == 0) initial_reservations_->ReleaseResources();
+}
+
 void QueryState::ExecFInstance(FragmentInstanceState* fis) {
   ImpaladMetrics::IMPALA_SERVER_NUM_FRAGMENTS_IN_FLIGHT->Increment(1L);
   ImpaladMetrics::IMPALA_SERVER_NUM_FRAGMENTS->Increment(1L);
@@ -327,6 +366,8 @@ void QueryState::ExecFInstance(FragmentInstanceState* fis) {
   // initiate cancellation if nobody has done so yet
   if (!status.ok()) Cancel();
   // decrement refcount taken in StartFInstances()
+  ReleaseInitialReservationRefcount();
+  // decrement refcount taken in StartFInstances()
   ExecEnv::GetInstance()->query_exec_mgr()->ReleaseQueryState(this);
 }
 
@@ -345,3 +386,21 @@ void QueryState::PublishFilter(int32_t filter_id, int fragment_idx,
     fis->PublishFilter(filter_id, thrift_bloom_filter);
   }
 }
+
+Status QueryState::StartSpilling(RuntimeState* runtime_state, MemTracker* mem_tracker) {
+  // Return an error message with the root cause of why spilling is disabled.
+  if (query_options().scratch_limit == 0) {
+    return mem_tracker->MemLimitExceeded(
+        runtime_state, "Could not free memory by spilling to disk: scratch_limit is 0");
+  } else if (query_ctx_.disable_spilling) {
+    return mem_tracker->MemLimitExceeded(runtime_state,
+        "Could not free memory by spilling to disk: spilling was disabled by planner. "
+        "Re-enable spilling by setting the query option DISABLE_UNSAFE_SPILLS=false");
+  }
+  // 'file_group_' must be non-NULL for spilling to be enabled.
+  DCHECK(file_group_ != nullptr);
+  if (query_spilled_.CompareAndSwap(0, 1)) {
+    ImpaladMetrics::NUM_QUERIES_SPILLED->Increment(1);
+  }
+  return Status::OK();
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/query-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-state.h b/be/src/runtime/query-state.h
index 9ce4316..fc71772 100644
--- a/be/src/runtime/query-state.h
+++ b/be/src/runtime/query-state.h
@@ -34,8 +34,10 @@
 namespace impala {
 
 class FragmentInstanceState;
+class InitialReservations;
 class MemTracker;
 class ReservationTracker;
+class RuntimeState;
 
 /// Central class for all backend execution state (example: the FragmentInstanceStates
 /// of the individual fragment instances) created for a particular query.
@@ -110,6 +112,7 @@ class QueryState {
 
   // the following getters are only valid after Prepare()
   ReservationTracker* buffer_reservation() const { return buffer_reservation_; }
+  InitialReservations* initial_reservations() const { return initial_reservations_; }
   TmpFileMgr::FileGroup* file_group() const { return file_group_; }
   const TExecQueryFInstancesParams& rpc_params() const { return rpc_params_; }
 
@@ -117,8 +120,10 @@ class QueryState {
   const DescriptorTbl& desc_tbl() const { return *desc_tbl_; }
 
   /// Sets up state required for fragment execution: memory reservations, etc. Fails
-  /// if resources could not be acquired. Uses few cycles and never blocks.
-  /// Not idempotent, not thread-safe.
+  /// if resources could not be acquired. On success, acquires an initial reservation
+  /// refcount for the caller, which the caller must release by calling
+  /// ReleaseInitialReservationRefcount().
+  /// Uses few cycles and never blocks. Not idempotent, not thread-safe.
   /// The remaining public functions must be called only after Init().
   Status Init(const TExecQueryFInstancesParams& rpc_params) WARN_UNUSED_RESULT;
 
@@ -155,6 +160,12 @@ class QueryState {
   /// If there is an error during the rpc, initiates cancellation.
   void ReportExecStatus(bool done, const Status& status, FragmentInstanceState* fis);
 
+  /// Checks whether spilling is enabled for this query. Must be called before the first
+  /// call to BufferPool::Unpin() for the query. Returns OK if spilling is enabled. If
+  /// spilling is not enabled, logs a MEM_LIMIT_EXCEEDED error from
+  /// tracker->MemLimitExceeded() to 'runtime_state'.
+  Status StartSpilling(RuntimeState* runtime_state, MemTracker* mem_tracker);
+
   ~QueryState();
 
  private:
@@ -162,6 +173,7 @@ class QueryState {
 
   /// test execution
   friend class RuntimeState;
+  friend class TestEnv;
 
   static const int DEFAULT_BATCH_SIZE = 1024;
 
@@ -176,16 +188,21 @@ class QueryState {
   /// TODO: find a way not to have to copy this
   TExecQueryFInstancesParams rpc_params_;
 
-  /// Buffer reservation for this query (owned by obj_pool_)
-  /// Only non-null in backend tests that explicitly enabled the new buffer pool
-  /// Set in Prepare().
-  /// TODO: this will always be non-null once IMPALA-3200 is done
+  /// Buffer reservation for this query (owned by obj_pool_). Set in Init().
   ReservationTracker* buffer_reservation_ = nullptr;
 
-  /// Temporary files for this query (owned by obj_pool_)
-  /// Only non-null in backend tests the explicitly enabled the new buffer pool
-  /// Set in Prepare().
-  /// TODO: this will always be non-null once IMPALA-3200 is done
+  /// Pool of buffer reservations used to distribute initial reservations to operators
+  /// in the query. Contains a ReservationTracker that is a child of
+  /// 'buffer_reservation_'. Owned by 'obj_pool_'. Set in Init().
+  InitialReservations* initial_reservations_ = nullptr;
+
+  /// Number of fragment instances executing, which may need to claim
+  /// from 'initial_reservations_'.
+  /// TODO: not needed if we call ReleaseResources() in a timely manner (IMPALA-1575).
+  AtomicInt32 initial_reservation_refcnt_;
+
+  /// Temporary files for this query (owned by obj_pool_). Non-null if spilling is
+  /// enabled. Set in Init().
   TmpFileMgr::FileGroup* file_group_ = nullptr;
 
   /// created in StartFInstances(), owned by obj_pool_
@@ -214,6 +231,11 @@ class QueryState {
   /// True if and only if ReleaseResources() has been called.
   bool released_resources_ = false;
 
+  /// Whether the query has spilled. 0 if the query has not spilled. Atomically set to 1
+  /// when the query first starts to spill. Required to correctly maintain the
+  /// "num-queries-spilled" metric.
+  AtomicInt32 query_spilled_;
+
   /// Create QueryState w/ refcnt of 0.
   /// The query is associated with the resource pool query_ctx.request_pool or
   /// 'request_pool', if the former is not set (needed for tests).
@@ -222,13 +244,16 @@ class QueryState {
   /// Execute the fragment instance and decrement the refcnt when done.
   void ExecFInstance(FragmentInstanceState* fis);
 
-  /// Called from Prepare() to initialize MemTrackers.
+  /// Called from constructor to initialize MemTrackers.
   void InitMemTrackers();
 
-  /// Called from Prepare() to setup buffer reservations and the
-  /// file group. Fails if required resources are not available.
+  /// Called from Init() to set up buffer reservations and the file group.
   Status InitBufferPoolState() WARN_UNUSED_RESULT;
 
+  /// Decrement 'initial_reservation_refcnt_' and release the initial reservation if it
+  /// goes to zero.
+  void ReleaseInitialReservationRefcount();
+
   /// Same behavior as ReportExecStatus().
   /// Cancel on error only if instances_started is true.
   void ReportExecStatusAux(bool done, const Status& status, FragmentInstanceState* fis,

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/row-batch.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.cc b/be/src/runtime/row-batch.cc
index 11cf363..942ac05 100644
--- a/be/src/runtime/row-batch.cc
+++ b/be/src/runtime/row-batch.cc
@@ -147,9 +147,6 @@ RowBatch::~RowBatch() {
   for (int i = 0; i < io_buffers_.size(); ++i) {
     ExecEnv::GetInstance()->disk_io_mgr()->ReturnBuffer(move(io_buffers_[i]));
   }
-  for (int i = 0; i < blocks_.size(); ++i) {
-    blocks_[i]->Delete();
-  }
   for (BufferInfo& buffer_info : buffers_) {
     ExecEnv::GetInstance()->buffer_pool()->FreeBuffer(
         buffer_info.client, &buffer_info.buffer);
@@ -295,14 +292,6 @@ void RowBatch::AddIoBuffer(unique_ptr<DiskIoMgr::BufferDescriptor> buffer) {
   io_buffers_.emplace_back(move(buffer));
 }
 
-void RowBatch::AddBlock(BufferedBlockMgr::Block* block, FlushMode flush) {
-  DCHECK(block != NULL);
-  DCHECK(block->is_pinned());
-  blocks_.push_back(block);
-  auxiliary_mem_usage_ += block->buffer_len();
-  if (flush == FlushMode::FLUSH_RESOURCES) MarkFlushResources();
-}
-
 void RowBatch::AddBuffer(BufferPool::ClientHandle* client,
     BufferPool::BufferHandle&& buffer, FlushMode flush) {
   auxiliary_mem_usage_ += buffer.len();
@@ -322,10 +311,6 @@ void RowBatch::Reset() {
     ExecEnv::GetInstance()->disk_io_mgr()->ReturnBuffer(move(io_buffers_[i]));
   }
   io_buffers_.clear();
-  for (int i = 0; i < blocks_.size(); ++i) {
-    blocks_[i]->Delete();
-  }
-  blocks_.clear();
   for (BufferInfo& buffer_info : buffers_) {
     ExecEnv::GetInstance()->buffer_pool()->FreeBuffer(
         buffer_info.client, &buffer_info.buffer);
@@ -342,10 +327,6 @@ void RowBatch::TransferResourceOwnership(RowBatch* dest) {
     dest->AddIoBuffer(move(io_buffers_[i]));
   }
   io_buffers_.clear();
-  for (int i = 0; i < blocks_.size(); ++i) {
-    dest->AddBlock(blocks_[i], FlushMode::NO_FLUSH_RESOURCES);
-  }
-  blocks_.clear();
   for (BufferInfo& buffer_info : buffers_) {
     dest->AddBuffer(
         buffer_info.client, std::move(buffer_info.buffer), FlushMode::NO_FLUSH_RESOURCES);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/row-batch.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.h b/be/src/runtime/row-batch.h
index 1b75ebb..35a8f14 100644
--- a/be/src/runtime/row-batch.h
+++ b/be/src/runtime/row-batch.h
@@ -25,7 +25,6 @@
 #include "codegen/impala-ir.h"
 #include "common/compiler-util.h"
 #include "common/logging.h"
-#include "runtime/buffered-block-mgr.h"
 #include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/descriptors.h"
 #include "runtime/disk-io-mgr.h"
@@ -207,7 +206,6 @@ class RowBatch {
   int row_byte_size() { return num_tuples_per_row_ * sizeof(Tuple*); }
   MemPool* tuple_data_pool() { return &tuple_data_pool_; }
   int num_io_buffers() const { return io_buffers_.size(); }
-  int num_blocks() const { return blocks_.size(); }
   int num_buffers() const { return buffers_.size(); }
 
   /// Resets the row batch, returning all resources it has accumulated.
@@ -216,13 +214,6 @@ class RowBatch {
   /// Add io buffer to this row batch.
   void AddIoBuffer(std::unique_ptr<DiskIoMgr::BufferDescriptor> buffer);
 
-  /// Adds a block to this row batch. The block must be pinned. The blocks must be
-  /// deleted when freeing resources. The block's memory remains accounted against
-  /// the original owner, even when the ownership of batches is transferred. If the
-  /// original owner wants the memory to be released, it should call this with 'mode'
-  /// FLUSH_RESOURCES (see MarkFlushResources() for further explanation).
-  void AddBlock(BufferedBlockMgr::Block* block, FlushMode flush);
-
   /// Adds a buffer to this row batch. The buffer is deleted when freeing resources.
   /// The buffer's memory remains accounted against the original owner, even when the
   /// ownership of batches is transferred. If the original owner wants the memory to be
@@ -426,10 +417,6 @@ class RowBatch {
   /// (i.e. they are not ref counted) so most row batches don't own any.
   std::vector<std::unique_ptr<DiskIoMgr::BufferDescriptor>> io_buffers_;
 
-  /// Blocks attached to this row batch. The underlying memory and block manager client
-  /// are owned by the BufferedBlockMgr.
-  std::vector<BufferedBlockMgr::Block*> blocks_;
-
   struct BufferInfo {
     BufferPool::ClientHandle* client;
     BufferPool::BufferHandle buffer;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/runtime-filter.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter.h b/be/src/runtime/runtime-filter.h
index ab70d4a..7b6066a 100644
--- a/be/src/runtime/runtime-filter.h
+++ b/be/src/runtime/runtime-filter.h
@@ -23,6 +23,7 @@
 #include "runtime/runtime-filter-bank.h"
 #include "util/bloom-filter.h"
 #include "util/spinlock.h"
+#include "util/time.h"
 
 namespace impala {
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/runtime-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-state.cc b/be/src/runtime/runtime-state.cc
index 89eec29..ba8e75d 100644
--- a/be/src/runtime/runtime-state.cc
+++ b/be/src/runtime/runtime-state.cc
@@ -17,21 +17,21 @@
 
 #include "runtime/runtime-state.h"
 
-#include <iostream>
 #include <jni.h>
+#include <iostream>
 #include <sstream>
 #include <string>
 
-#include "common/logging.h"
 #include <boost/algorithm/string/join.hpp>
 #include <gutil/strings/substitute.h>
+#include "common/logging.h"
 
 #include "codegen/llvm-codegen.h"
 #include "common/object-pool.h"
 #include "common/status.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-fn-call.h"
-#include "runtime/buffered-block-mgr.h"
+#include "runtime/bufferpool/buffer-pool.h"
 #include "runtime/bufferpool/reservation-tracker.h"
 #include "runtime/data-stream-mgr.h"
 #include "runtime/data-stream-recvr.h"
@@ -54,22 +54,10 @@
 #include "common/names.h"
 
 using namespace llvm;
+using strings::Substitute;
 
 DECLARE_int32(max_errors);
 
-// The fraction of the query mem limit that is used for the block mgr. Operators
-// that accumulate memory all use the block mgr so the majority of the memory should
-// be allocated to the block mgr. The remaining memory is used by the non-spilling
-// operators and should be independent of data size.
-static const float BLOCK_MGR_MEM_FRACTION = 0.8f;
-
-// The minimum amount of memory that must be left after the block mgr reserves the
-// BLOCK_MGR_MEM_FRACTION. The block limit is:
-// min(query_limit * BLOCK_MGR_MEM_FRACTION, query_limit - BLOCK_MGR_MEM_MIN_REMAINING)
-// TODO: this value was picked arbitrarily and the tests are written to rely on this
-// for the minimum memory required to run the query. Revisit.
-static const int64_t BLOCK_MGR_MEM_MIN_REMAINING = 100 * 1024 * 1024;
-
 namespace impala {
 
 RuntimeState::RuntimeState(QueryState* query_state, const TPlanFragmentCtx& fragment_ctx,
@@ -82,7 +70,7 @@ RuntimeState::RuntimeState(QueryState* query_state, const TPlanFragmentCtx& frag
         query_state->query_ctx().utc_timestamp_string))),
     exec_env_(exec_env),
     profile_(obj_pool(), "Fragment " + PrintId(instance_ctx.fragment_instance_id)),
-    instance_buffer_reservation_(nullptr),
+    instance_buffer_reservation_(new ReservationTracker),
     is_cancelled_(false),
     root_node_id_(-1) {
   Init();
@@ -127,8 +115,7 @@ void RuntimeState::Init() {
   instance_mem_tracker_.reset(new MemTracker(
       runtime_profile(), -1, runtime_profile()->name(), query_mem_tracker()));
 
-  if (query_state_ != nullptr && exec_env_->buffer_pool() != nullptr) {
-    instance_buffer_reservation_ = obj_pool()->Add(new ReservationTracker);
+  if (instance_buffer_reservation_ != nullptr) {
     instance_buffer_reservation_->InitChildTracker(&profile_,
         query_state_->buffer_reservation(), instance_mem_tracker_.get(),
         numeric_limits<int64_t>::max());
@@ -139,28 +126,6 @@ void RuntimeState::InitFilterBank() {
   filter_bank_.reset(new RuntimeFilterBank(query_ctx(), this));
 }
 
-Status RuntimeState::CreateBlockMgr() {
-  DCHECK(block_mgr_.get() == NULL);
-
-  // Compute the max memory the block mgr will use.
-  int64_t block_mgr_limit = query_mem_tracker()->lowest_limit();
-  if (block_mgr_limit < 0) block_mgr_limit = numeric_limits<int64_t>::max();
-  block_mgr_limit = min(static_cast<int64_t>(block_mgr_limit * BLOCK_MGR_MEM_FRACTION),
-      block_mgr_limit - BLOCK_MGR_MEM_MIN_REMAINING);
-  if (block_mgr_limit < 0) block_mgr_limit = 0;
-  if (query_options().__isset.max_block_mgr_memory &&
-      query_options().max_block_mgr_memory > 0) {
-    block_mgr_limit = query_options().max_block_mgr_memory;
-    LOG(WARNING) << "Block mgr mem limit: "
-                 << PrettyPrinter::Print(block_mgr_limit, TUnit::BYTES);
-  }
-
-  RETURN_IF_ERROR(BufferedBlockMgr::Create(this, query_mem_tracker(),
-      runtime_profile(), exec_env()->tmp_file_mgr(), block_mgr_limit,
-      io_mgr()->max_read_buffer_size(), &block_mgr_));
-  return Status::OK();
-}
-
 Status RuntimeState::CreateCodegen() {
   if (codegen_.get() != NULL) return Status::OK();
   // TODO: add the fragment ID to the codegen ID as well
@@ -179,6 +144,10 @@ Status RuntimeState::CodegenScalarFns() {
   return Status::OK();
 }
 
+Status RuntimeState::StartSpilling(MemTracker* mem_tracker) {
+  return query_state_->StartSpilling(this, mem_tracker);
+}
+
 string RuntimeState::ErrorLog() {
   lock_guard<SpinLock> l(error_log_lock_);
   return PrintErrorMapToString(error_log_);
@@ -270,7 +239,6 @@ void RuntimeState::ReleaseResources() {
   if (resource_pool_ != nullptr) {
     exec_env_->thread_mgr()->UnregisterPool(resource_pool_);
   }
-  block_mgr_.reset(); // Release any block mgr memory, if this is the last reference.
   codegen_.reset(); // Release any memory associated with codegen.
 
   // Release the reservation, which should be unused at the point.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/runtime-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-state.h b/be/src/runtime/runtime-state.h
index 9a1d0b2..12e7d8c 100644
--- a/be/src/runtime/runtime-state.h
+++ b/be/src/runtime/runtime-state.h
@@ -32,7 +32,7 @@
 
 namespace impala {
 
-class BufferedBlockMgr;
+class BufferPool;
 class DataStreamRecvr;
 class DescriptorTbl;
 class DiskIoMgr;
@@ -92,9 +92,6 @@ class RuntimeState {
   /// Initializes the runtime filter bank.
   void InitFilterBank();
 
-  /// Gets/Creates the query wide block mgr.
-  Status CreateBlockMgr();
-
   QueryState* query_state() const { return query_state_; }
   /// Return the query's ObjectPool
   ObjectPool* obj_pool() const;
@@ -132,7 +129,7 @@ class RuntimeState {
   MemTracker* instance_mem_tracker() { return instance_mem_tracker_.get(); }
   MemTracker* query_mem_tracker();  // reference to the query_state_'s memtracker
   ReservationTracker* instance_buffer_reservation() {
-    return instance_buffer_reservation_;
+    return instance_buffer_reservation_.get();
   }
   ThreadResourceMgr::ResourcePool* resource_pool() { return resource_pool_; }
 
@@ -206,11 +203,6 @@ class RuntimeState {
   /// Unregisters all reader contexts acquired through AcquireReaderContext().
   void UnregisterReaderContexts();
 
-  BufferedBlockMgr* block_mgr() {
-    DCHECK(block_mgr_.get() != NULL);
-    return block_mgr_.get();
-  }
-
   inline Status GetQueryStatus() {
     // Do a racy check for query_status_ to avoid unnecessary spinlock acquisition.
     if (UNLIKELY(!query_status_.ok())) {
@@ -307,21 +299,19 @@ class RuntimeState {
   /// TODO: Fix IMPALA-4233
   Status CodegenScalarFns();
 
+  /// Helper to call QueryState::StartSpilling().
+  Status StartSpilling(MemTracker* mem_tracker);
+
   /// Release resources and prepare this object for destruction.
   void ReleaseResources();
 
  private:
-  /// Allow TestEnv to set block_mgr manually for testing.
+  /// Allow TestEnv to use private methods for testing.
   friend class TestEnv;
 
   /// Set per-fragment state.
   void Init();
 
-  /// Use a custom block manager for the query for testing purposes.
-  void set_block_mgr(const std::shared_ptr<BufferedBlockMgr>& block_mgr) {
-    block_mgr_ = block_mgr;
-  }
-
   /// Lock protecting error_log_
   SpinLock error_log_lock_;
 
@@ -382,9 +372,8 @@ class RuntimeState {
   boost::scoped_ptr<MemTracker> instance_mem_tracker_;
 
   /// Buffer reservation for this fragment instance - a child of the query buffer
-  /// reservation. Non-NULL if 'query_state_' is not NULL and ExecEnv::buffer_pool_
-  /// was created by a backend test. Owned by obj_pool().
-  ReservationTracker* instance_buffer_reservation_;
+  /// reservation. Non-NULL if 'query_state_' is not NULL.
+  boost::scoped_ptr<ReservationTracker> instance_buffer_reservation_;
 
   /// if true, execution should stop with a CANCELLED status
   bool is_cancelled_;
@@ -401,11 +390,6 @@ class RuntimeState {
   SpinLock reader_contexts_lock_;
   std::vector<DiskIoRequestContext*> reader_contexts_;
 
-  /// BufferedBlockMgr object used to allocate and manage blocks of input data in memory
-  /// with a fixed memory budget.
-  /// The block mgr is shared by all fragments for this query.
-  std::shared_ptr<BufferedBlockMgr> block_mgr_;
-
   /// This is the node id of the root node for this plan fragment. This is used as the
   /// hash seed and has two useful properties:
   /// 1) It is the same for all exec nodes in a fragment, so the resulting hash values


[04/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/sorter.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/sorter.cc b/be/src/runtime/sorter.cc
index b4ef279..ee0e4be 100644
--- a/be/src/runtime/sorter.cc
+++ b/be/src/runtime/sorter.cc
@@ -17,15 +17,20 @@
 
 #include "runtime/sorter.h"
 
+#include <limits>
+
 #include <boost/random/mersenne_twister.hpp>
 #include <boost/random/uniform_int.hpp>
 #include <gutil/strings/substitute.h>
 
-#include "runtime/buffered-block-mgr.h"
+#include "runtime/bufferpool/reservation-tracker.h"
+#include "runtime/exec-env.h"
 #include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
 #include "runtime/sorted-run-merger.h"
+#include "util/pretty-printer.h"
 #include "util/runtime-profile-counters.h"
 
 #include "common/names.h"
@@ -36,7 +41,7 @@ using namespace strings;
 
 namespace impala {
 
-// Number of pinned blocks required for a merge with fixed-length data only.
+// Number of pinned pages required for a merge with fixed-length data only.
 const int MIN_BUFFERS_PER_MERGE = 3;
 
 // Maximum number of buffers to use in each merge to prevent sorter trying to grab
@@ -46,35 +51,140 @@ const int MIN_BUFFERS_PER_MERGE = 3;
 // we should base this on the number of reservations.
 const int MAX_BUFFERS_PER_MERGE = 128;
 
-const string MEM_ALLOC_FAILED_ERROR_MSG = "Failed to allocate block for $0-length "
-    "data needed for sorting. Reducing query concurrency or increasing the "
-    "memory limit may help this query to complete successfully.";
-
-const string MERGE_FAILED_ERROR_MSG = "Failed to allocate block to merge spilled runs "
+const string MERGE_FAILED_ERROR_MSG = "Failed to allocate page to merge spilled runs "
     "during sorting. Only $0 runs could be merged, but must be able to merge at least 2 "
     "to make progress. Reducing query concurrency or increasing the memory limit may "
     "help this query to complete successfully.";
 
-/// Delete all non-null blocks in blocks and clear vector.
-static void DeleteAndClearBlocks(vector<BufferedBlockMgr::Block*>* blocks) {
-  for (BufferedBlockMgr::Block* block: *blocks) {
-    if (block != NULL) block->Delete();
+/// Wrapper around BufferPool::PageHandle that tracks additional info about the page.
+/// The Page can be in four states:
+/// * Closed: The page starts in this state before Init() is called. Calling
+///   ExtractBuffer() or Close() puts the page back in this state. No other operations
+///   are valid on a closed page.
+/// * In memory: the page is pinned and the buffer is in memory. data() is valid. The
+///   page is in this state after Init(). If the page is pinned but not in memory, it
+///   can be brought into this state by calling WaitForBuffer().
+/// * Unpinned: the page was unpinned by calling Unpin(). It is invalid to access the
+///   page's buffer.
+/// * Pinned but not in memory: Pin() was called on the unpinned page, but
+///   WaitForBuffer() has not been called. It is invalid to access the page's buffer.
+class Sorter::Page {
+ public:
+  Page() { Reset(); }
+
+  /// Create a new page of length 'sorter->page_len_' bytes using
+  /// 'sorter->buffer_pool_client_'. Caller must ensure the client has enough
+  /// reservation for the page.
+  Status Init(Sorter* sorter) WARN_UNUSED_RESULT {
+    const BufferPool::BufferHandle* page_buffer;
+    RETURN_IF_ERROR(pool()->CreatePage(sorter->buffer_pool_client_, sorter->page_len_,
+        &handle_, &page_buffer));
+    data_ = page_buffer->data();
+    return Status::OK();
   }
-  blocks->clear();
-}
 
-static int NumNonNullBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
-  int count = 0;
-  for (BufferedBlockMgr::Block* block: blocks) {
-    if (block != NULL) ++count;
+  /// Extract the buffer from the page. The page must be in memory. When this function
+  /// returns the page is closed.
+  BufferPool::BufferHandle ExtractBuffer(BufferPool::ClientHandle* client) {
+    DCHECK(data_ != nullptr) << "Page must be in memory";
+    BufferPool::BufferHandle buffer;
+    Status status = pool()->ExtractBuffer(client, &handle_, &buffer);
+    DCHECK(status.ok()) << "Page was in memory, ExtractBuffer() shouldn't fail";
+    Reset();
+    return buffer;
+  }
+
+  /// Allocate 'len' bytes in the current page. The page must be in memory, and the
+  /// amount to allocate cannot exceed BytesRemaining().
+  uint8_t* AllocateBytes(int64_t len) {
+    DCHECK_GE(len, 0);
+    DCHECK_LE(len, BytesRemaining());
+    DCHECK(data_ != nullptr);
+    uint8_t* result = data_ + valid_data_len_;
+    valid_data_len_ += len;
+    return result;
+  }
+
+  /// Free the last 'len' bytes allocated from AllocateBytes(). The page must be in
+  /// memory.
+  void FreeBytes(int64_t len) {
+    DCHECK_GE(len, 0);
+    DCHECK_LE(len, valid_data_len_);
+    DCHECK(data_ != nullptr);
+    valid_data_len_ -= len;
+  }
+
+  /// Return number of bytes remaining in page.
+  int64_t BytesRemaining() { return len() - valid_data_len_; }
+
+  /// Brings a pinned page into memory, if not already in memory, and sets 'data_' to
+  /// point to the page's buffer.
+  Status WaitForBuffer() WARN_UNUSED_RESULT {
+    DCHECK(handle_.is_pinned());
+    if (data_ != nullptr) return Status::OK();
+    const BufferPool::BufferHandle* page_buffer;
+    RETURN_IF_ERROR(handle_.GetBuffer(&page_buffer));
+    data_ = page_buffer->data();
+    return Status::OK();
+  }
+
+  /// Helper to pin the page. Caller must ensure the client has enough reservation
+  /// remaining to pin the page. Only valid to call on an unpinned page.
+  Status Pin(BufferPool::ClientHandle* client) WARN_UNUSED_RESULT {
+    DCHECK(!handle_.is_pinned());
+    return pool()->Pin(client, &handle_);
+  }
+
+  /// Helper to unpin the page.
+  void Unpin(BufferPool::ClientHandle* client) {
+    pool()->Unpin(client, &handle_);
+    data_ = nullptr;
+  }
+
+  /// Destroy the page with 'client'.
+  void Close(BufferPool::ClientHandle* client) {
+    pool()->DestroyPage(client, &handle_);
+    Reset();
   }
-  return count;
-}
+
+  int64_t valid_data_len() const { return valid_data_len_; }
+  /// Returns a pointer to the start of the page's buffer. Only valid to call if the
+  /// page is in memory.
+  uint8_t* data() const {
+    DCHECK(data_ != nullptr);
+    return data_;
+  }
+  int64_t len() const { return handle_.len(); }
+  bool is_open() const { return handle_.is_open(); }
+  bool is_pinned() const { return handle_.is_pinned(); }
+  std::string DebugString() const { return handle_.DebugString(); }
+
+ private:
+  /// Reset the page to an uninitialized state. 'handle_' must already be closed.
+  void Reset() {
+    DCHECK(!handle_.is_open());
+    valid_data_len_ = 0;
+    data_ = nullptr;
+  }
+
+  /// Helper to get the singleton buffer pool.
+  static BufferPool* pool() { return ExecEnv::GetInstance()->buffer_pool(); }
+
+  BufferPool::PageHandle handle_;
+
+  /// Length of valid data written to the page.
+  int64_t valid_data_len_;
+
+  /// Cached pointer to the buffer in 'handle_'. NULL if the page is unpinned. May be NULL
+  /// or not NULL if the page is pinned. Can be populated by calling WaitForBuffer() on a
+  /// pinned page.
+  uint8_t* data_;
+};
 
 /// A run is a sequence of tuples. The run can be sorted or unsorted (in which case the
-/// Sorter will sort it). A run comprises a sequence of fixed-length blocks containing the
+/// Sorter will sort it). A run comprises a sequence of fixed-length pages containing the
 /// tuples themselves (i.e. fixed-len slots that may contain ptrs to var-length data), and
-/// an optional sequence of var-length blocks containing the var-length data.
+/// an optional sequence of var-length pages containing the var-length data.
 ///
 /// Runs are either "initial runs" constructed from the sorter's input by evaluating
 /// the expressions in 'sort_tuple_exprs_' or "intermediate runs" constructed
@@ -84,7 +194,7 @@ static int NumNonNullBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
 /// sorted run.
 ///
 /// The expected calling sequence of functions is as follows:
-/// * Init() to initialize the run and allocate initial blocks.
+/// * Init() to initialize the run and allocate initial pages.
 /// * Add*Batch() to add batches of tuples to the run.
 /// * FinalizeInput() to signal that no more batches will be added.
 /// * If the run is unsorted, it must be sorted. After that set_sorted() must be called.
@@ -92,29 +202,30 @@ static int NumNonNullBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
 /// * PrepareRead() to allocate resources for reading the run.
 /// * GetNext() (if there was a single run) or GetNextBatch() (when merging multiple runs)
 ///   to read from the run.
-/// * Once reading is done, DeleteAllBlocks() should be called to free resources.
+/// * Once reading is done, CloseAllPages() should be called to free resources.
 class Sorter::Run {
  public:
   Run(Sorter* parent, TupleDescriptor* sort_tuple_desc, bool initial_run);
 
   ~Run() {
-    DCHECK(fixed_len_blocks_.empty());
-    DCHECK(var_len_blocks_.empty());
-    DCHECK(var_len_copy_block_ == NULL);
+    DCHECK(fixed_len_pages_.empty());
+    DCHECK(var_len_pages_.empty());
+    DCHECK(!var_len_copy_page_.is_open());
   }
 
   /// Initialize the run for input rows by allocating the minimum number of required
-  /// blocks - one block for fixed-len data added to fixed_len_blocks_, one for the
-  /// initially unsorted var-len data added to var_len_blocks_, and one to copy sorted
-  /// var-len data into var_len_copy_block_.
-  Status Init();
+  /// pages - one page for fixed-len data added to fixed_len_pages_, one for the
+  /// initially unsorted var-len data added to var_len_pages_, and one to copy sorted
+  /// var-len data into var_len_copy_page_.
+  Status Init() WARN_UNUSED_RESULT;
 
   /// Add the rows from 'batch' starting at 'start_index' to the current run. Returns the
-  /// number of rows actually added in 'num_processed'. If the run is full (no more blocks
+  /// number of rows actually added in 'num_processed'. If the run is full (no more pages
   /// can be allocated), 'num_processed' may be less than the number of remaining rows in
   /// the batch. AddInputBatch() materializes the input rows using the expressions in
   /// sorter_->sort_tuple_expr_evals_, while AddIntermediateBatch() just copies rows.
-  Status AddInputBatch(RowBatch* batch, int start_index, int* num_processed) {
+  Status AddInputBatch(
+      RowBatch* batch, int start_index, int* num_processed) WARN_UNUSED_RESULT {
     DCHECK(initial_run_);
     if (has_var_len_slots_) {
       return AddBatchInternal<true, true>(batch, start_index, num_processed);
@@ -122,7 +233,9 @@ class Sorter::Run {
       return AddBatchInternal<false, true>(batch, start_index, num_processed);
     }
   }
-  Status AddIntermediateBatch(RowBatch* batch, int start_index, int* num_processed) {
+
+  Status AddIntermediateBatch(
+      RowBatch* batch, int start_index, int* num_processed) WARN_UNUSED_RESULT {
     DCHECK(!initial_run_);
     if (has_var_len_slots_) {
       return AddBatchInternal<true, false>(batch, start_index, num_processed);
@@ -133,53 +246,53 @@ class Sorter::Run {
 
   /// Called after the final call to Add*Batch() to do any bookkeeping necessary to
   /// finalize the run. Must be called before sorting or merging the run.
-  Status FinalizeInput();
+  Status FinalizeInput() WARN_UNUSED_RESULT;
 
-  /// Unpins all the blocks in a sorted run. Var-length column data is copied into new
-  /// blocks in sorted order. Pointers in the original tuples are converted to offsets
-  /// from the beginning of the sequence of var-len data blocks. Returns an error and
-  /// may leave some blocks pinned if an error is encountered in the block mgr.
-  Status UnpinAllBlocks();
+  /// Unpins all the pages in a sorted run. Var-length column data is copied into new
+  /// pages in sorted order. Pointers in the original tuples are converted to offsets
+  /// from the beginning of the sequence of var-len data pages. Returns an error and
+  /// may leave some pages pinned if an error is encountered.
+  Status UnpinAllPages() WARN_UNUSED_RESULT;
 
-  /// Deletes all blocks.
-  void DeleteAllBlocks();
+  /// Closes all pages and clears vectors of pages.
+  void CloseAllPages();
 
-  /// Prepare to read a sorted run. Pins the first block(s) in the run if the run was
+  /// Prepare to read a sorted run. Pins the first page(s) in the run if the run was
   /// previously unpinned. If the run was unpinned, try to pin the initial fixed and
-  /// var len blocks in the run. If it couldn't pin them, set pinned_all_blocks to false.
-  /// In that case, none or one of the initial blocks may be pinned and it is valid to
-  /// call PrepareRead() again to retry pinning the remainder. pinned_all_blocks is
-  /// always set to true if the run is pinned.
-  Status PrepareRead(bool* pinned_all_blocks);
+  /// var len pages in the run. If it couldn't pin them, set pinned to false.
+  /// In that case, none of the initial pages will be pinned and it is valid to
+  /// call PrepareRead() again to retry pinning. pinned is always set to
+  /// true if the run was pinned.
+  Status PrepareRead(bool* pinned) WARN_UNUSED_RESULT;
 
   /// Interface for merger - get the next batch of rows from this run. This run still
   /// owns the returned batch. Calls GetNext(RowBatch*, bool*).
-  Status GetNextBatch(RowBatch** sorted_batch);
+  Status GetNextBatch(RowBatch** sorted_batch) WARN_UNUSED_RESULT;
 
   /// Fill output_batch with rows from this run. If CONVERT_OFFSET_TO_PTR is true, offsets
   /// in var-length slots are converted back to pointers. Only row pointers are copied
   /// into output_batch. eos is set to true after all rows from the run are returned.
-  /// If eos is true, the returned output_batch has zero rows and has no attached blocks.
-  /// If this run was unpinned, one block (two if there are var-len slots) is pinned while
-  /// rows are filled into output_batch. The block is unpinned before the next block is
-  /// pinned, so at most one (two if there are var-len slots) block(s) will be pinned at
-  /// once. If the run was pinned, the blocks are not unpinned and each block is attached
-  /// to 'output_batch' once all rows referencing data in the block have been returned,
+  /// If eos is true, the returned output_batch has zero rows and has no attached pages.
+  /// If this run was unpinned, one page (two if there are var-len slots) is pinned while
+  /// rows are filled into output_batch. The page is unpinned before the next page is
+  /// pinned, so at most one (two if there are var-len slots) page(s) will be pinned at
+  /// once. If the run was pinned, the pages are not unpinned and each page is attached
+  /// to 'output_batch' once all rows referencing data in the page have been returned,
   /// either in the current batch or previous batches. In both pinned and unpinned cases,
-  /// all rows in output_batch will reference at most one fixed-len and one var-len block.
+  /// all rows in output_batch will reference at most one fixed-len and one var-len page.
   template <bool CONVERT_OFFSET_TO_PTR>
-  Status GetNext(RowBatch* output_batch, bool* eos);
+  Status GetNext(RowBatch* output_batch, bool* eos) WARN_UNUSED_RESULT;
 
-  /// Delete all blocks in 'runs' and clear 'runs'.
+  /// Delete all pages in 'runs' and clear 'runs'.
   static void CleanupRuns(deque<Run*>* runs) {
-    for (Run* run: *runs) {
-      run->DeleteAllBlocks();
+    for (Run* run : *runs) {
+      run->CloseAllPages();
     }
     runs->clear();
   }
 
-  /// Return total amount of fixed and var len data in run, not including blocks that
-  /// were already transferred.
+  /// Return total amount of fixed and var len data in run, not including pages that
+  /// were already transferred or closed.
   int64_t TotalBytes() const;
 
   inline bool is_pinned() const { return is_pinned_; }
@@ -196,34 +309,42 @@ class Sorter::Run {
   /// INITIAL_RUN and HAS_VAR_LEN_SLOTS are template arguments for performance and must
   /// match 'initial_run_' and 'has_var_len_slots_'.
   template <bool HAS_VAR_LEN_SLOTS, bool INITIAL_RUN>
-  Status AddBatchInternal(RowBatch* batch, int start_index, int* num_processed);
+  Status AddBatchInternal(
+      RowBatch* batch, int start_index, int* num_processed) WARN_UNUSED_RESULT;
 
-  /// Finalize the list of blocks: delete empty final blocks and unpin the previous block
+  /// Finalize the list of pages: delete empty final pages and unpin the previous page
   /// if the run is unpinned.
-  Status FinalizeBlocks(vector<BufferedBlockMgr::Block*>* blocks);
+  Status FinalizePages(vector<Page>* pages) WARN_UNUSED_RESULT;
 
   /// Collect the non-null var-len (e.g. STRING) slots from 'src' in 'var_len_values' and
   /// return the total length of all var-len values in 'total_var_len'.
-  void CollectNonNullVarSlots(Tuple* src, vector<StringValue*>* var_len_values,
-      int* total_var_len);
+  void CollectNonNullVarSlots(
+      Tuple* src, vector<StringValue*>* var_len_values, int* total_var_len);
 
-  enum AddBlockMode { KEEP_PREV_PINNED, UNPIN_PREV };
+  enum AddPageMode { KEEP_PREV_PINNED, UNPIN_PREV };
 
-  /// Try to extend the current run by a block. If 'mode' is KEEP_PREV_PINNED, try to
-  /// allocate a new block, which may fail to extend the run due to lack of memory. If
-  /// mode is 'UNPIN_PREV', unpin the previous block in block_sequence before allocating
-  /// and adding a new block - this never fails due to lack of memory.
+  /// Try to extend the current run by a page. If 'mode' is KEEP_PREV_PINNED, try to
+  /// allocate a new page, which may fail to extend the run due to lack of memory. If
+  /// mode is 'UNPIN_PREV', unpin the previous page in page_sequence before allocating
+  /// and adding a new page - this never fails due to lack of memory.
   ///
-  /// Returns an error status only if the block manager returns an error. If no error is
+  /// Returns an error status only if the buffer pool returns an error. If no error is
   /// encountered, sets 'added' to indicate whether the run was extended and returns
-  /// Status::OK(). The new block is appended to 'block_sequence'.
-  Status TryAddBlock(AddBlockMode mode, vector<BufferedBlockMgr::Block*>* block_sequence,
-      bool* added);
+  /// Status::OK(). The new page is appended to 'page_sequence'.
+  Status TryAddPage(
+      AddPageMode mode, vector<Page>* page_sequence, bool* added) WARN_UNUSED_RESULT;
+
+  /// Adds a new page to 'page_sequence'. The caller must ensure enough
+  /// reservation is available to create the page.
+  ///
+  /// Returns an error status only if the buffer pool returns an error. If an error
+  /// is returned 'page_sequence' is left unmodified.
+  Status AddPage(vector<Page>* page_sequence) WARN_UNUSED_RESULT;
 
-  /// Advance to the next read block. If the run is pinned, has no effect. If the run
-  /// is unpinned, atomically pin the block at 'block_index' + 1 in 'blocks' and delete
-  /// the block at 'block_index'.
-  Status PinNextReadBlock(vector<BufferedBlockMgr::Block*>* blocks, int block_index);
+  /// Advance to the next read page. If the run is pinned, has no effect. If the run
+  /// is unpinned, atomically pin the page at 'page_index' + 1 in 'pages' and delete
+  /// the page at 'page_index'.
+  Status PinNextReadPage(vector<Page>* pages, int page_index) WARN_UNUSED_RESULT;
 
   /// Copy the StringValues in 'var_values' to 'dest' in order and update the StringValue
   /// ptrs in 'dest' to point to the copied data.
@@ -231,25 +352,41 @@ class Sorter::Run {
 
   /// Copy the StringValues in 'var_values' to 'dest' in order. Update the StringValue
   /// ptrs in 'dest' to contain a packed offset for the copied data comprising
-  /// block_index and the offset relative to block_start.
-  void CopyVarLenDataConvertOffset(const vector<StringValue*>& var_values,
-      int block_index, const uint8_t* block_start, uint8_t* dest);
+  /// page_index and the offset relative to page_start.
+  void CopyVarLenDataConvertOffset(const vector<StringValue*>& var_values, int page_index,
+      const uint8_t* page_start, uint8_t* dest);
 
   /// Convert encoded offsets to valid pointers in tuple with layout 'sort_tuple_desc_'.
-  /// 'tuple' is modified in-place. Returns true if the pointers refer to the block at
-  /// 'var_len_blocks_index_' and were successfully converted or false if the var len
-  /// data is in the next block, in which case 'tuple' is unmodified.
+  /// 'tuple' is modified in-place. Returns true if the pointers refer to the page at
+  /// 'var_len_pages_index_' and were successfully converted or false if the var len
+  /// data is in the next page, in which case 'tuple' is unmodified.
   bool ConvertOffsetsToPtrs(Tuple* tuple);
 
-  /// Returns true if we have var-len blocks in the run.
-  inline bool HasVarLenBlocks() const {
-    // Shouldn't have any blocks unless there are slots.
-    DCHECK(var_len_blocks_.empty() || has_var_len_slots_);
-    return !var_len_blocks_.empty();
+  /// Returns true if we have var-len pages in the run.
+  inline bool HasVarLenPages() const {
+    // Shouldn't have any pages unless there are slots.
+    DCHECK(var_len_pages_.empty() || has_var_len_slots_);
+    return !var_len_pages_.empty();
+  }
+
+  static int NumOpenPages(const vector<Page>& pages) {
+    int count = 0;
+    for (const Page& page : pages) {
+      if (page.is_open()) ++count;
+    }
+    return count;
+  }
+
+  /// Close all open pages and clear vector.
+  void DeleteAndClearPages(vector<Page>* pages) {
+    for (Page& page : *pages) {
+      if (page.is_open()) page.Close(sorter_->buffer_pool_client_);
+    }
+    pages->clear();
   }
 
   /// Parent sorter object.
-  const Sorter* sorter_;
+  Sorter* const sorter_;
 
   /// Materialized sort tuple. Input rows are materialized into 1 tuple (with descriptor
   /// sort_tuple_desc_) before sorting.
@@ -258,10 +395,10 @@ class Sorter::Run {
   /// The size in bytes of the sort tuple.
   const int sort_tuple_size_;
 
-  /// Number of tuples per block in a run. This gets multiplied with
-  /// TupleIterator::block_index_ in various places and to make sure we don't overflow the
+  /// Number of tuples per page in a run. This gets multiplied with
+  /// TupleIterator::page_index_ in various places and to make sure we don't overflow the
   /// result of that operation we make this int64_t here.
-  const int64_t block_capacity_;
+  const int64_t page_capacity_;
 
   const bool has_var_len_slots_;
 
@@ -269,7 +406,7 @@ class Sorter::Run {
   /// resulting from merging other runs.
   const bool initial_run_;
 
-  /// True if all blocks in the run are pinned. Initial runs start off pinned and
+  /// True if all pages in the run are pinned. Initial runs start off pinned and
   /// can be unpinned. Intermediate runs are always unpinned.
   bool is_pinned_;
 
@@ -281,27 +418,27 @@ class Sorter::Run {
   /// Always true for intermediate runs.
   bool is_sorted_;
 
-  /// Sequence of blocks in this run containing the fixed-length portion of the sort
+  /// Sequence of pages in this run containing the fixed-length portion of the sort
   /// tuples comprising this run. The data pointed to by the var-len slots are in
-  /// var_len_blocks_. A run can have zero blocks if no rows are appended.
-  /// If the run is sorted, the tuples in fixed_len_blocks_ will be in sorted order.
-  /// fixed_len_blocks_[i] is NULL iff it has been transferred or deleted.
-  vector<BufferedBlockMgr::Block*> fixed_len_blocks_;
+  /// var_len_pages_. A run can have zero pages if no rows are appended.
+  /// If the run is sorted, the tuples in fixed_len_pages_ will be in sorted order.
+  /// fixed_len_pages_[i] is closed iff it has been transferred or deleted.
+  vector<Page> fixed_len_pages_;
 
-  /// Sequence of blocks in this run containing the var-length data corresponding to the
-  /// var-length columns from fixed_len_blocks_. In intermediate runs, the var-len data is
+  /// Sequence of pages in this run containing the var-length data corresponding to the
+  /// var-length columns from fixed_len_pages_. In intermediate runs, the var-len data is
   /// always stored in the same order as the fixed-length tuples. In initial runs, the
   /// var-len data is initially in unsorted order, but is reshuffled into sorted order in
-  /// UnpinAllBlocks(). A run can have no var len blocks if there are no var len slots or
+  /// UnpinAllPages(). A run can have no var len pages if there are no var len slots or
   /// if all the var len data is empty or NULL.
-  /// var_len_blocks_[i] is NULL iff it has been transferred or deleted.
-  vector<BufferedBlockMgr::Block*> var_len_blocks_;
+  /// var_len_pages_[i] is closed iff it has been transferred or deleted.
+  vector<Page> var_len_pages_;
 
-  /// For initial unsorted runs, an extra pinned block is needed to reorder var-len data
-  /// into fixed order in UnpinAllBlocks(). 'var_len_copy_block_' stores this extra
-  /// block. Deleted in UnpinAllBlocks().
+  /// For initial unsorted runs, an extra pinned page is needed to reorder var-len data
+  /// into fixed order in UnpinAllPages(). 'var_len_copy_page_' stores this extra
+  /// page. Deleted in UnpinAllPages().
   /// TODO: in case of in-memory runs, this could be deleted earlier to free up memory.
-  BufferedBlockMgr::Block* var_len_copy_block_;
+  Page var_len_copy_page_;
 
   /// Number of tuples added so far to this run.
   int64_t num_tuples_;
@@ -313,18 +450,18 @@ class Sorter::Run {
   scoped_ptr<RowBatch> buffered_batch_;
 
   /// Members used when a run is read in GetNext().
-  /// The index into 'fixed_' and 'var_len_blocks_' of the blocks being read in GetNext().
-  int fixed_len_blocks_index_;
-  int var_len_blocks_index_;
+  /// The index into 'fixed_' and 'var_len_pages_' of the pages being read in GetNext().
+  int fixed_len_pages_index_;
+  int var_len_pages_index_;
 
   /// If true, the last call to GetNext() reached the end of the previous fixed or
-  /// var-len block. The next call to GetNext() must increment 'fixed_len_blocks_index_'
-  /// or 'var_len_blocks_index_'. It must also pin the next block if the run is unpinned.
-  bool end_of_fixed_len_block_;
-  bool end_of_var_len_block_;
+  /// var-len page. The next call to GetNext() must increment 'fixed_len_pages_index_'
+  /// or 'var_len_pages_index_'. It must also pin the next page if the run is unpinned.
+  bool end_of_fixed_len_page_;
+  bool end_of_var_len_page_;
 
-  /// Offset into the current fixed length data block being processed.
-  int fixed_len_block_offset_;
+  /// Offset into the current fixed length data page being processed.
+  int fixed_len_page_offset_;
 };
 
 /// Helper class used to iterate over tuples in a run during sorting.
@@ -340,7 +477,7 @@ class Sorter::TupleIterator {
   /// Default constructor used for local variable. Produces invalid iterator that must
   /// be assigned before use.
   TupleIterator() : index_(-1), tuple_(NULL), buffer_start_index_(-1),
-      buffer_end_index_(-1), block_index_(-1) { }
+      buffer_end_index_(-1), page_index_(-1) { }
 
   /// Create an iterator pointing to the first tuple in the run.
   static inline TupleIterator Begin(Sorter::Run* run) { return TupleIterator(run, 0); }
@@ -351,8 +488,8 @@ class Sorter::TupleIterator {
   }
 
   /// Increments 'index_' and sets 'tuple_' to point to the next tuple in the run.
-  /// Increments 'block_index_' and advances to the next block if the next tuple is in
-  /// the next block. Can be advanced one past the last tuple in the run, but is not
+  /// Increments 'page_index_' and advances to the next page if the next tuple is in
+  /// the next page. Can be advanced one past the last tuple in the run, but is not
   /// valid to dereference 'tuple_' in that case. 'run' and 'tuple_size' are passed as
   /// arguments to avoid redundantly storing the same values in multiple iterators in
   /// perf-critical algorithms.
@@ -370,13 +507,13 @@ class Sorter::TupleIterator {
   }
 
  private:
-  // Move to the next block in the run (or do nothing if at end of run).
+  // Move to the next page in the run (or do nothing if at end of run).
   // This is the slow path for Next();
-  void NextBlock(Sorter::Run* run, int tuple_size);
+  void NextPage(Sorter::Run* run, int tuple_size);
 
-  // Move to the previous block in the run (or do nothing if at beginning of run).
+  // Move to the previous page in the run (or do nothing if at beginning of run).
   // This is the slow path for Prev();
-  void PrevBlock(Sorter::Run* run, int tuple_size);
+  void PrevPage(Sorter::Run* run, int tuple_size);
 
   /// Index of the current tuple in the run.
   /// Can be -1 or run->num_rows() if Next() or Prev() moves iterator outside of run.
@@ -387,15 +524,15 @@ class Sorter::TupleIterator {
   /// iterator outside of run.
   uint8_t* tuple_;
 
-  /// Indices of start and end tuples of block at block_index_. I.e. the current block
+  /// Indices of start and end tuples of page at page_index_. I.e. the current page
   /// has tuples with indices in range [buffer_start_index_, buffer_end_index).
   int64_t buffer_start_index_;
   int64_t buffer_end_index_;
 
-  /// Index into fixed_len_blocks_ of the block containing the current tuple.
-  /// If index_ is negative or past end of run, will point to the first or last block
+  /// Index into fixed_len_pages_ of the page containing the current tuple.
+  /// If index_ is negative or past end of run, will point to the first or last page
   /// in run respectively.
-  int block_index_;
+  int page_index_;
 };
 
 /// Sorts a sequence of tuples from a run in place using a provided tuple comparator.
@@ -404,16 +541,16 @@ class Sorter::TupleIterator {
 /// instance to check for cancellation during an in-memory sort.
 class Sorter::TupleSorter {
  public:
-  TupleSorter(const TupleRowComparator& comparator, int64_t block_size,
-      int tuple_size, RuntimeState* state);
+  TupleSorter(const TupleRowComparator& comparator, int64_t page_size, int tuple_size,
+      RuntimeState* state);
 
   ~TupleSorter();
 
   /// Performs a quicksort for tuples in 'run' followed by an insertion sort to
-  /// finish smaller blocks. Only valid to call if this is an initial run that has not
+  /// finish smaller ranges. Only valid to call if this is an initial run that has not
   /// yet been sorted. Returns an error status if any error is encountered or if the
   /// query is cancelled.
-  Status Sort(Run* run);
+  Status Sort(Run* run) WARN_UNUSED_RESULT;
 
  private:
   static const int INSERTION_THRESHOLD = 16;
@@ -451,7 +588,8 @@ class Sorter::TupleSorter {
 
   /// Perform an insertion sort for rows in the range [begin, end) in a run.
   /// Only valid to call for ranges of size at least 1.
-  Status InsertionSort(const TupleIterator& begin, const TupleIterator& end);
+  Status InsertionSort(
+      const TupleIterator& begin, const TupleIterator& end) WARN_UNUSED_RESULT;
 
   /// Partitions the sequence of tuples in the range [begin, end) in a run into two
   /// groups around the pivot tuple - i.e. tuples in first group are <= the pivot, and
@@ -459,12 +597,12 @@ class Sorter::TupleSorter {
   /// groups and the index to the first element in the second group is returned in 'cut'.
   /// Return an error status if any error is encountered or if the query is cancelled.
   Status Partition(TupleIterator begin, TupleIterator end, const Tuple* pivot,
-      TupleIterator* cut);
+      TupleIterator* cut) WARN_UNUSED_RESULT;
 
   /// Performs a quicksort of rows in the range [begin, end) followed by insertion sort
   /// for smaller groups of elements. Return an error status for any errors or if the
   /// query is cancelled.
-  Status SortHelper(TupleIterator begin, TupleIterator end);
+  Status SortHelper(TupleIterator begin, TupleIterator end) WARN_UNUSED_RESULT;
 
   /// Select a pivot to partition [begin, end).
   Tuple* SelectPivot(TupleIterator begin, TupleIterator end);
@@ -477,45 +615,33 @@ class Sorter::TupleSorter {
 };
 
 // Sorter::Run methods
-Sorter::Run::Run(Sorter* parent, TupleDescriptor* sort_tuple_desc,
-    bool initial_run)
+Sorter::Run::Run(Sorter* parent, TupleDescriptor* sort_tuple_desc, bool initial_run)
   : sorter_(parent),
     sort_tuple_desc_(sort_tuple_desc),
     sort_tuple_size_(sort_tuple_desc->byte_size()),
-    block_capacity_(parent->block_mgr_->max_block_size() / sort_tuple_size_),
+    page_capacity_(parent->page_len_ / sort_tuple_size_),
     has_var_len_slots_(sort_tuple_desc->HasVarlenSlots()),
     initial_run_(initial_run),
     is_pinned_(initial_run),
     is_finalized_(false),
     is_sorted_(!initial_run),
-    var_len_copy_block_(NULL),
-    num_tuples_(0) { }
+    num_tuples_(0) {}
 
 Status Sorter::Run::Init() {
-  BufferedBlockMgr::Block* block = NULL;
-  RETURN_IF_ERROR(
-      sorter_->block_mgr_->GetNewBlock(sorter_->block_mgr_client_, NULL, &block));
-  if (block == NULL) {
-    return sorter_->mem_tracker_->MemLimitExceeded(
-        sorter_->state_, Substitute(MEM_ALLOC_FAILED_ERROR_MSG, "fixed"));
-  }
-  fixed_len_blocks_.push_back(block);
+  int num_to_create = 1 + has_var_len_slots_ + (has_var_len_slots_ && initial_run_);
+  int64_t required_mem = num_to_create * sorter_->page_len_;
+  if (!sorter_->buffer_pool_client_->IncreaseReservationToFit(required_mem)) {
+    return Status(Substitute(
+        "Unexpected error trying to reserve $0 bytes for a sorted run: $1",
+        required_mem, sorter_->buffer_pool_client_->DebugString()));
+  }
+
+  RETURN_IF_ERROR(AddPage(&fixed_len_pages_));
   if (has_var_len_slots_) {
-    RETURN_IF_ERROR(
-        sorter_->block_mgr_->GetNewBlock(sorter_->block_mgr_client_, NULL, &block));
-    if (block == NULL) {
-      return sorter_->mem_tracker_->MemLimitExceeded(
-          sorter_->state_, Substitute(MEM_ALLOC_FAILED_ERROR_MSG, "variable"));
-    }
-    var_len_blocks_.push_back(block);
+    RETURN_IF_ERROR(AddPage(&var_len_pages_));
     if (initial_run_) {
-      // Need additional var len block to reorder var len data in UnpinAllBlocks().
-      RETURN_IF_ERROR(sorter_->block_mgr_->GetNewBlock(
-          sorter_->block_mgr_client_, NULL, &var_len_copy_block_));
-      if (var_len_copy_block_ == NULL) {
-        return sorter_->mem_tracker_->MemLimitExceeded(
-            sorter_->state_, Substitute(MEM_ALLOC_FAILED_ERROR_MSG, "variable"));
-      }
+      // Need additional var len page to reorder var len data in UnpinAllPages().
+      RETURN_IF_ERROR(var_len_copy_page_.Init(sorter_));
     }
   }
   if (initial_run_) {
@@ -527,14 +653,15 @@ Status Sorter::Run::Init() {
 }
 
 template <bool HAS_VAR_LEN_SLOTS, bool INITIAL_RUN>
-Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_processed) {
+Status Sorter::Run::AddBatchInternal(
+    RowBatch* batch, int start_index, int* num_processed) {
   DCHECK(!is_finalized_);
-  DCHECK(!fixed_len_blocks_.empty());
+  DCHECK(!fixed_len_pages_.empty());
   DCHECK_EQ(HAS_VAR_LEN_SLOTS, has_var_len_slots_);
   DCHECK_EQ(INITIAL_RUN, initial_run_);
 
   *num_processed = 0;
-  BufferedBlockMgr::Block* cur_fixed_len_block = fixed_len_blocks_.back();
+  Page* cur_fixed_len_page = &fixed_len_pages_.back();
 
   if (!INITIAL_RUN) {
     // For intermediate merges, the input row is the sort tuple.
@@ -543,13 +670,13 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
   }
 
   /// Keep initial unsorted runs pinned in memory so we can sort them.
-  const AddBlockMode add_mode = INITIAL_RUN ? KEEP_PREV_PINNED : UNPIN_PREV;
+  const AddPageMode add_mode = INITIAL_RUN ? KEEP_PREV_PINNED : UNPIN_PREV;
 
-  // Input rows are copied/materialized into tuples allocated in fixed_len_blocks_.
-  // The variable length column data are copied into blocks stored in var_len_blocks_.
+  // Input rows are copied/materialized into tuples allocated in fixed_len_pages_.
+  // The variable length column data are copied into pages stored in var_len_pages_.
   // Input row processing is split into two loops.
-  // The inner loop processes as many input rows as will fit in cur_fixed_len_block.
-  // The outer loop allocates a new block for fixed-len data if the input batch is
+  // The inner loop processes as many input rows as will fit in cur_fixed_len_page.
+  // The outer loop allocates a new page for fixed-len data if the input batch is
   // not exhausted.
 
   // cur_input_index is the index into the input 'batch' of the current input row being
@@ -559,22 +686,23 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
   string_values.reserve(sort_tuple_desc_->string_slots().size());
   while (cur_input_index < batch->num_rows()) {
     // tuples_remaining is the number of tuples to copy/materialize into
-    // cur_fixed_len_block.
-    int tuples_remaining = cur_fixed_len_block->BytesRemaining() / sort_tuple_size_;
+    // cur_fixed_len_page.
+    int tuples_remaining = cur_fixed_len_page->BytesRemaining() / sort_tuple_size_;
     tuples_remaining = min(batch->num_rows() - cur_input_index, tuples_remaining);
 
     for (int i = 0; i < tuples_remaining; ++i) {
       int total_var_len = 0;
       TupleRow* input_row = batch->GetRow(cur_input_index);
-      Tuple* new_tuple = cur_fixed_len_block->Allocate<Tuple>(sort_tuple_size_);
+      Tuple* new_tuple =
+          reinterpret_cast<Tuple*>(cur_fixed_len_page->AllocateBytes(sort_tuple_size_));
       if (INITIAL_RUN) {
         new_tuple->MaterializeExprs<HAS_VAR_LEN_SLOTS, true>(input_row,
             *sort_tuple_desc_, sorter_->sort_tuple_expr_evals_, NULL,
             &string_values, &total_var_len);
-        if (total_var_len > sorter_->block_mgr_->max_block_size()) {
-          return Status(ErrorMsg(TErrorCode::INTERNAL_ERROR, Substitute(
-              "Variable length data in a single tuple larger than block size $0 > $1",
-              total_var_len, sorter_->block_mgr_->max_block_size())));
+        if (total_var_len > sorter_->page_len_) {
+          return Status(TErrorCode::MAX_ROW_SIZE,
+              PrettyPrinter::Print(total_var_len, TUnit::BYTES), sorter_->node_id_,
+              PrettyPrinter::Print(0, TUnit::BYTES));
         }
       } else {
         memcpy(new_tuple, input_row->GetTuple(0), sort_tuple_size_);
@@ -584,17 +712,17 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
       }
 
       if (HAS_VAR_LEN_SLOTS) {
-        DCHECK_GT(var_len_blocks_.size(), 0);
-        BufferedBlockMgr::Block* cur_var_len_block = var_len_blocks_.back();
-        if (cur_var_len_block->BytesRemaining() < total_var_len) {
+        DCHECK_GT(var_len_pages_.size(), 0);
+        Page* cur_var_len_page = &var_len_pages_.back();
+        if (cur_var_len_page->BytesRemaining() < total_var_len) {
           bool added;
-          RETURN_IF_ERROR(TryAddBlock(add_mode, &var_len_blocks_, &added));
+          RETURN_IF_ERROR(TryAddPage(add_mode, &var_len_pages_, &added));
           if (added) {
-            cur_var_len_block = var_len_blocks_.back();
+            cur_var_len_page = &var_len_pages_.back();
           } else {
-            // There was not enough space in the last var-len block for this tuple, and
+            // There was not enough space in the last var-len page for this tuple, and
             // the run could not be extended. Return the fixed-len allocation and exit.
-            cur_fixed_len_block->ReturnAllocation(sort_tuple_size_);
+            cur_fixed_len_page->FreeBytes(sort_tuple_size_);
             return Status::OK();
           }
         }
@@ -605,13 +733,13 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
           DCHECK(new_tuple->IsNull(coll_slot->null_indicator_offset()));
         }
 
-        uint8_t* var_data_ptr = cur_var_len_block->Allocate<uint8_t>(total_var_len);
+        uint8_t* var_data_ptr = cur_var_len_page->AllocateBytes(total_var_len);
         if (INITIAL_RUN) {
           CopyVarLenData(string_values, var_data_ptr);
         } else {
-          DCHECK_EQ(var_len_blocks_.back(), cur_var_len_block);
-          CopyVarLenDataConvertOffset(string_values, var_len_blocks_.size() - 1,
-              reinterpret_cast<uint8_t*>(cur_var_len_block->buffer()), var_data_ptr);
+          DCHECK_EQ(&var_len_pages_.back(), cur_var_len_page);
+          CopyVarLenDataConvertOffset(string_values, var_len_pages_.size() - 1,
+              cur_var_len_page->data(), var_data_ptr);
         }
       }
       ++num_tuples_;
@@ -619,13 +747,13 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
       ++cur_input_index;
     }
 
-    // If there are still rows left to process, get a new block for the fixed-length
+    // If there are still rows left to process, get a new page for the fixed-length
     // tuples. If the run is already too long, return.
     if (cur_input_index < batch->num_rows()) {
       bool added;
-      RETURN_IF_ERROR(TryAddBlock(add_mode, &fixed_len_blocks_, &added));
+      RETURN_IF_ERROR(TryAddPage(add_mode, &fixed_len_pages_, &added));
       if (!added) return Status::OK();
-      cur_fixed_len_block = fixed_len_blocks_.back();
+      cur_fixed_len_page = &fixed_len_pages_.back();
     }
   }
   return Status::OK();
@@ -634,158 +762,146 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
 Status Sorter::Run::FinalizeInput() {
   DCHECK(!is_finalized_);
 
-  RETURN_IF_ERROR(FinalizeBlocks(&fixed_len_blocks_));
+  RETURN_IF_ERROR(FinalizePages(&fixed_len_pages_));
   if (has_var_len_slots_) {
-    RETURN_IF_ERROR(FinalizeBlocks(&var_len_blocks_));
+    RETURN_IF_ERROR(FinalizePages(&var_len_pages_));
   }
   is_finalized_ = true;
   return Status::OK();
 }
 
-Status Sorter::Run::FinalizeBlocks(vector<BufferedBlockMgr::Block*>* blocks) {
-  DCHECK_GT(blocks->size(), 0);
-  BufferedBlockMgr::Block* last_block = blocks->back();
-  if (last_block->valid_data_len() > 0) {
+Status Sorter::Run::FinalizePages(vector<Page>* pages) {
+  DCHECK_GT(pages->size(), 0);
+  Page* last_page = &pages->back();
+  if (last_page->valid_data_len() > 0) {
     DCHECK_EQ(initial_run_, is_pinned_);
     if (!is_pinned_) {
-      // Unpin the last block of this unpinned run. We've finished writing the run so
-      // all blocks in the run can now be unpinned.
-      RETURN_IF_ERROR(last_block->Unpin());
+      // Unpin the last page of this unpinned run. We've finished writing the run so
+      // all pages in the run can now be unpinned.
+      last_page->Unpin(sorter_->buffer_pool_client_);
     }
   } else {
-    last_block->Delete();
-    blocks->pop_back();
+    last_page->Close(sorter_->buffer_pool_client_);
+    pages->pop_back();
   }
   return Status::OK();
 }
 
-void Sorter::Run::DeleteAllBlocks() {
-  DeleteAndClearBlocks(&fixed_len_blocks_);
-  DeleteAndClearBlocks(&var_len_blocks_);
-  if (var_len_copy_block_ != NULL) var_len_copy_block_->Delete();
-  var_len_copy_block_ = NULL;
+void Sorter::Run::CloseAllPages() {
+  DeleteAndClearPages(&fixed_len_pages_);
+  DeleteAndClearPages(&var_len_pages_);
+  if (var_len_copy_page_.is_open()) {
+    var_len_copy_page_.Close(sorter_->buffer_pool_client_);
+  }
 }
 
-Status Sorter::Run::UnpinAllBlocks() {
+Status Sorter::Run::UnpinAllPages() {
   DCHECK(is_sorted_);
   DCHECK(initial_run_);
   DCHECK(is_pinned_);
   DCHECK(is_finalized_);
-  // A list of var len blocks to replace 'var_len_blocks_'. Note that after we are done
-  // we may have a different number of blocks, because internal fragmentation may leave
-  // slightly different amounts of wasted space at the end of each block.
-  // We need to be careful to clean up these blocks if we run into an error in this method.
-  vector<BufferedBlockMgr::Block*> sorted_var_len_blocks;
-  sorted_var_len_blocks.reserve(var_len_blocks_.size());
+  // A list of var len pages to replace 'var_len_pages_'. Note that after we are done
+  // we may have a different number of pages, because internal fragmentation may leave
+  // slightly different amounts of wasted space at the end of each page.
+  // We need to be careful to clean up these pages if we run into an error in this method.
+  vector<Page> sorted_var_len_pages;
+  sorted_var_len_pages.reserve(var_len_pages_.size());
 
   vector<StringValue*> string_values;
   int total_var_len;
   string_values.reserve(sort_tuple_desc_->string_slots().size());
-  BufferedBlockMgr::Block* cur_sorted_var_len_block = NULL;
-  if (HasVarLenBlocks()) {
-    DCHECK(var_len_copy_block_ != NULL);
-    sorted_var_len_blocks.push_back(var_len_copy_block_);
-    cur_sorted_var_len_block = var_len_copy_block_;
-    // Set var_len_copy_block_ to NULL since it was moved to var_len_blocks_.
-    var_len_copy_block_ = NULL;
+  Page* cur_sorted_var_len_page = NULL;
+  if (HasVarLenPages()) {
+    DCHECK(var_len_copy_page_.is_open());
+    sorted_var_len_pages.push_back(move(var_len_copy_page_));
+    cur_sorted_var_len_page = &sorted_var_len_pages.back();
   } else if (has_var_len_slots_) {
-    // If we don't have any var-len blocks, clean up the copy block.
-    DCHECK(var_len_copy_block_ != NULL);
-    var_len_copy_block_->Delete();
-    var_len_copy_block_ = NULL;
+    // If we don't have any var-len pages, clean up the copy page.
+    DCHECK(var_len_copy_page_.is_open());
+    var_len_copy_page_.Close(sorter_->buffer_pool_client_);
   } else {
-    DCHECK(var_len_copy_block_ == NULL);
+    DCHECK(!var_len_copy_page_.is_open());
   }
 
   Status status;
-  for (int i = 0; i < fixed_len_blocks_.size(); ++i) {
-    BufferedBlockMgr::Block* cur_fixed_block = fixed_len_blocks_[i];
+  for (int i = 0; i < fixed_len_pages_.size(); ++i) {
+    Page* cur_fixed_page = &fixed_len_pages_[i];
     // Skip converting the pointers if no var-len slots, or if all the values are null
     // or zero-length. This will possibly leave zero-length pointers pointing to
     // arbitrary memory, but zero-length data cannot be dereferenced anyway.
-    if (HasVarLenBlocks()) {
-      for (int block_offset = 0; block_offset < cur_fixed_block->valid_data_len();
-          block_offset += sort_tuple_size_) {
-        Tuple* cur_tuple =
-            reinterpret_cast<Tuple*>(cur_fixed_block->buffer() + block_offset);
+    if (HasVarLenPages()) {
+      for (int page_offset = 0; page_offset < cur_fixed_page->valid_data_len();
+           page_offset += sort_tuple_size_) {
+        Tuple* cur_tuple = reinterpret_cast<Tuple*>(cur_fixed_page->data() + page_offset);
         CollectNonNullVarSlots(cur_tuple, &string_values, &total_var_len);
-        DCHECK(cur_sorted_var_len_block != NULL);
-        if (cur_sorted_var_len_block->BytesRemaining() < total_var_len) {
+        DCHECK(cur_sorted_var_len_page->is_open());
+        if (cur_sorted_var_len_page->BytesRemaining() < total_var_len) {
           bool added;
-          status = TryAddBlock(UNPIN_PREV, &sorted_var_len_blocks, &added);
-          if (!status.ok()) goto cleanup_blocks;
-          DCHECK(added) << "TryAddBlock() with UNPIN_PREV should not fail to add";
-          cur_sorted_var_len_block = sorted_var_len_blocks.back();
+          status = TryAddPage(UNPIN_PREV, &sorted_var_len_pages, &added);
+          if (!status.ok()) goto cleanup_pages;
+          DCHECK(added) << "TryAddPage() with UNPIN_PREV should not fail to add";
+          cur_sorted_var_len_page = &sorted_var_len_pages.back();
         }
-        uint8_t* var_data_ptr =
-            cur_sorted_var_len_block->Allocate<uint8_t>(total_var_len);
-        DCHECK_EQ(sorted_var_len_blocks.back(), cur_sorted_var_len_block);
-        CopyVarLenDataConvertOffset(string_values, sorted_var_len_blocks.size() - 1,
-            reinterpret_cast<uint8_t*>(cur_sorted_var_len_block->buffer()), var_data_ptr);
+        uint8_t* var_data_ptr = cur_sorted_var_len_page->AllocateBytes(total_var_len);
+        DCHECK_EQ(&sorted_var_len_pages.back(), cur_sorted_var_len_page);
+        CopyVarLenDataConvertOffset(string_values, sorted_var_len_pages.size() - 1,
+            cur_sorted_var_len_page->data(), var_data_ptr);
       }
     }
-    status = cur_fixed_block->Unpin();
-    if (!status.ok()) goto cleanup_blocks;
+    cur_fixed_page->Unpin(sorter_->buffer_pool_client_);
   }
 
-  if (HasVarLenBlocks()) {
-    DCHECK_GT(sorted_var_len_blocks.back()->valid_data_len(), 0);
-    status = sorted_var_len_blocks.back()->Unpin();
-    if (!status.ok()) goto cleanup_blocks;
+  if (HasVarLenPages()) {
+    DCHECK_GT(sorted_var_len_pages.back().valid_data_len(), 0);
+    sorted_var_len_pages.back().Unpin(sorter_->buffer_pool_client_);
   }
 
-  // Clear var_len_blocks_ and replace with it with the contents of sorted_var_len_blocks
-  DeleteAndClearBlocks(&var_len_blocks_);
-  sorted_var_len_blocks.swap(var_len_blocks_);
+  // Clear var_len_pages_ and replace it with the contents of sorted_var_len_pages
+  DeleteAndClearPages(&var_len_pages_);
+  sorted_var_len_pages.swap(var_len_pages_);
   is_pinned_ = false;
   sorter_->spilled_runs_counter_->Add(1);
   return Status::OK();
 
-cleanup_blocks:
-  DeleteAndClearBlocks(&sorted_var_len_blocks);
+cleanup_pages:
+  DeleteAndClearPages(&sorted_var_len_pages);
   return status;
 }
 
-Status Sorter::Run::PrepareRead(bool* pinned_all_blocks) {
+Status Sorter::Run::PrepareRead(bool* pinned) {
   DCHECK(is_finalized_);
   DCHECK(is_sorted_);
 
-  fixed_len_blocks_index_ = 0;
-  fixed_len_block_offset_ = 0;
-  var_len_blocks_index_ = 0;
-  end_of_fixed_len_block_ = end_of_var_len_block_ = fixed_len_blocks_.empty();
+  fixed_len_pages_index_ = 0;
+  fixed_len_page_offset_ = 0;
+  var_len_pages_index_ = 0;
+  end_of_fixed_len_page_ = end_of_var_len_page_ = fixed_len_pages_.empty();
   num_tuples_returned_ = 0;
 
   buffered_batch_.reset(new RowBatch(
       sorter_->output_row_desc_, sorter_->state_->batch_size(), sorter_->mem_tracker_));
 
-  // If the run is pinned, all blocks are already pinned, so we're ready to read.
+  // If the run is pinned, all pages are already pinned, so we're ready to read.
   if (is_pinned_) {
-    *pinned_all_blocks = true;
+    *pinned = true;
     return Status::OK();
   }
 
-  // Attempt to pin the first fixed and var-length blocks. In either case, pinning may
-  // fail if the number of reserved blocks is oversubscribed, see IMPALA-1590.
-  if (fixed_len_blocks_.size() > 0) {
-    bool pinned;
-    RETURN_IF_ERROR(fixed_len_blocks_[0]->Pin(&pinned));
-    if (!pinned) {
-      *pinned_all_blocks = false;
-      return Status::OK();
-    }
+  int num_to_pin = (fixed_len_pages_.size() > 0 ? 1 : 0) + (HasVarLenPages() ? 1 : 0);
+  int64_t required_mem = num_to_pin * sorter_->page_len_;
+  if (!sorter_->buffer_pool_client_->IncreaseReservationToFit(required_mem)) {
+    *pinned = false;
+    return Status::OK();
   }
 
-  if (HasVarLenBlocks()) {
-    bool pinned;
-    RETURN_IF_ERROR(var_len_blocks_[0]->Pin(&pinned));
-    if (!pinned) {
-      *pinned_all_blocks = false;
-      return Status::OK();
-    }
+  // Attempt to pin the first fixed and var-length pages.
+  if (fixed_len_pages_.size() > 0) {
+    RETURN_IF_ERROR(fixed_len_pages_[0].Pin(sorter_->buffer_pool_client_));
   }
-
-  *pinned_all_blocks = true;
+  if (HasVarLenPages()) {
+    RETURN_IF_ERROR(var_len_pages_[0].Pin(sorter_->buffer_pool_client_));
+  }
+  *pinned = true;
   return Status::OK();
 }
 
@@ -794,7 +910,7 @@ Status Sorter::Run::GetNextBatch(RowBatch** output_batch) {
   buffered_batch_->Reset();
   // Fill more rows into buffered_batch_.
   bool eos;
-  if (HasVarLenBlocks() && !is_pinned_) {
+  if (HasVarLenPages() && !is_pinned_) {
     RETURN_IF_ERROR(GetNext<true>(buffered_batch_.get(), &eos));
   } else {
     RETURN_IF_ERROR(GetNext<false>(buffered_batch_.get(), &eos));
@@ -804,7 +920,7 @@ Status Sorter::Run::GetNextBatch(RowBatch** output_batch) {
     // Setting output_batch to NULL signals eos to the caller, so GetNext() is not
     // allowed to attach resources to the batch on eos.
     DCHECK_EQ(buffered_batch_->num_rows(), 0);
-    DCHECK_EQ(buffered_batch_->num_blocks(), 0);
+    DCHECK_EQ(buffered_batch_->num_buffers(), 0);
     *output_batch = NULL;
     return Status::OK();
   }
@@ -815,122 +931,130 @@ Status Sorter::Run::GetNextBatch(RowBatch** output_batch) {
 template <bool CONVERT_OFFSET_TO_PTR>
 Status Sorter::Run::GetNext(RowBatch* output_batch, bool* eos) {
   // Var-len offsets are converted only when reading var-len data from unpinned runs.
-  // We shouldn't convert var len offsets if there are no blocks, since in that case
-  // they must all be null or zero-length strings, which don't point into a valid block.
-  DCHECK_EQ(CONVERT_OFFSET_TO_PTR, HasVarLenBlocks() && !is_pinned_);
+  // We shouldn't convert var len offsets if there are no pages, since in that case
+  // they must all be null or zero-length strings, which don't point into a valid page.
+  DCHECK_EQ(CONVERT_OFFSET_TO_PTR, HasVarLenPages() && !is_pinned_);
 
-  if (end_of_fixed_len_block_ &&
-      fixed_len_blocks_index_ >= static_cast<int>(fixed_len_blocks_.size()) - 1) {
+  if (end_of_fixed_len_page_
+      && fixed_len_pages_index_ >= static_cast<int>(fixed_len_pages_.size()) - 1) {
     if (is_pinned_) {
-      // All blocks were previously attached to output batches. GetNextBatch() assumes
+      // All pages were previously attached to output batches. GetNextBatch() assumes
       // that we don't attach resources to the batch on eos.
-      DCHECK_EQ(NumNonNullBlocks(fixed_len_blocks_), 0);
-      DCHECK_EQ(NumNonNullBlocks(var_len_blocks_), 0);
+      DCHECK_EQ(NumOpenPages(fixed_len_pages_), 0);
+      DCHECK_EQ(NumOpenPages(var_len_pages_), 0);
 
-      // Flush resources in case we are in a subplan and need to allocate more blocks
+      // Flush resources in case we are in a subplan and need to allocate more pages
       // when the node is reopened.
       output_batch->MarkFlushResources();
     } else {
       // We held onto the last fixed or var len blocks without transferring them to the
       // caller. We signalled MarkNeedsDeepCopy() to the caller, so we can safely delete
       // them now to free memory.
-      if (!fixed_len_blocks_.empty()) DCHECK_EQ(NumNonNullBlocks(fixed_len_blocks_), 1);
-      if (!var_len_blocks_.empty()) DCHECK_EQ(NumNonNullBlocks(var_len_blocks_), 1);
+      if (!fixed_len_pages_.empty()) DCHECK_EQ(NumOpenPages(fixed_len_pages_), 1);
+      if (!var_len_pages_.empty()) DCHECK_EQ(NumOpenPages(var_len_pages_), 1);
     }
-    DeleteAllBlocks();
+    CloseAllPages();
     *eos = true;
     DCHECK_EQ(num_tuples_returned_, num_tuples_);
     return Status::OK();
   }
 
-  // Advance the fixed or var len block if we reached the end in the previous call to
+  // Advance the fixed or var len page if we reached the end in the previous call to
   // GetNext().
-  if (end_of_fixed_len_block_) {
-    RETURN_IF_ERROR(PinNextReadBlock(&fixed_len_blocks_, fixed_len_blocks_index_));
-    ++fixed_len_blocks_index_;
-    fixed_len_block_offset_ = 0;
-    end_of_fixed_len_block_ = false;
-  }
-  if (end_of_var_len_block_) {
-    RETURN_IF_ERROR(PinNextReadBlock(&var_len_blocks_, var_len_blocks_index_));
-    ++var_len_blocks_index_;
-    end_of_var_len_block_ = false;
-  }
-
-  // Fills rows into the output batch until a block boundary is reached.
-  BufferedBlockMgr::Block* fixed_len_block = fixed_len_blocks_[fixed_len_blocks_index_];
-  DCHECK(fixed_len_block != NULL);
-  while (!output_batch->AtCapacity() &&
-      fixed_len_block_offset_ < fixed_len_block->valid_data_len()) {
-    DCHECK(fixed_len_block != NULL);
-    Tuple* input_tuple = reinterpret_cast<Tuple*>(
-        fixed_len_block->buffer() + fixed_len_block_offset_);
+  if (end_of_fixed_len_page_) {
+    RETURN_IF_ERROR(PinNextReadPage(&fixed_len_pages_, fixed_len_pages_index_));
+    ++fixed_len_pages_index_;
+    fixed_len_page_offset_ = 0;
+    end_of_fixed_len_page_ = false;
+  }
+  if (end_of_var_len_page_) {
+    RETURN_IF_ERROR(PinNextReadPage(&var_len_pages_, var_len_pages_index_));
+    ++var_len_pages_index_;
+    end_of_var_len_page_ = false;
+  }
+
+  // Fills rows into the output batch until a page boundary is reached.
+  Page* fixed_len_page = &fixed_len_pages_[fixed_len_pages_index_];
+  DCHECK(fixed_len_page != NULL);
+
+  // Ensure we have a reference to the fixed-length page's buffer.
+  RETURN_IF_ERROR(fixed_len_page->WaitForBuffer());
+
+  // If we're converting offsets into unpinned var-len pages, make sure the
+  // current var-len page is in memory.
+  if (CONVERT_OFFSET_TO_PTR && HasVarLenPages()) {
+    RETURN_IF_ERROR(var_len_pages_[var_len_pages_index_].WaitForBuffer());
+  }
+
+  while (!output_batch->AtCapacity()
+      && fixed_len_page_offset_ < fixed_len_page->valid_data_len()) {
+    DCHECK(fixed_len_page != NULL);
+    Tuple* input_tuple =
+        reinterpret_cast<Tuple*>(fixed_len_page->data() + fixed_len_page_offset_);
 
     if (CONVERT_OFFSET_TO_PTR && !ConvertOffsetsToPtrs(input_tuple)) {
       DCHECK(!is_pinned_);
-      // The var-len data is in the next block. We are done with the current block, so
-      // return rows we've accumulated so far and advance to the next block in the next
-      // GetNext() call. This is needed for the unpinned case where we need to exchange
-      // this block for the next in the next GetNext() call. So therefore we must hold
-      // onto the current var-len block and signal to the caller that the block is going
+      // The var-len data is in the next page. We are done with the current page, so
+      // return rows we've accumulated so far and advance to the next page in the next
+      // GetNext() call. This is needed for the unpinned case where we will exchange
+      // this page for the next in the next GetNext() call. Therefore we must hold
+      // onto the current var-len page and signal to the caller that the page is going
       // to be deleted.
       output_batch->MarkNeedsDeepCopy();
-      end_of_var_len_block_ = true;
+      end_of_var_len_page_ = true;
       break;
     }
     output_batch->GetRow(output_batch->AddRow())->SetTuple(0, input_tuple);
     output_batch->CommitLastRow();
-    fixed_len_block_offset_ += sort_tuple_size_;
+    fixed_len_page_offset_ += sort_tuple_size_;
     ++num_tuples_returned_;
   }
 
-  if (fixed_len_block_offset_ >= fixed_len_block->valid_data_len()) {
-    // Reached the block boundary, need to move to the next block.
+  if (fixed_len_page_offset_ >= fixed_len_page->valid_data_len()) {
+    // Reached the page boundary, need to move to the next page.
     if (is_pinned_) {
-      // Attach block to batch. Caller can delete the block when it wants to.
-      output_batch->AddBlock(fixed_len_blocks_[fixed_len_blocks_index_],
+      BufferPool::ClientHandle* client = sorter_->buffer_pool_client_;
+      // Attach page to batch. Caller can delete the page when it wants to.
+      output_batch->AddBuffer(client,
+          fixed_len_pages_[fixed_len_pages_index_].ExtractBuffer(client),
           RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-      fixed_len_blocks_[fixed_len_blocks_index_] = NULL;
 
-      // Attach the var-len blocks at eos once no more rows will reference the blocks.
-      if (fixed_len_blocks_index_ == fixed_len_blocks_.size() - 1) {
-        for (BufferedBlockMgr::Block* var_len_block: var_len_blocks_) {
-          DCHECK(var_len_block != NULL);
-          output_batch->AddBlock(var_len_block, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+      // Attach the var-len pages at eos once no more rows will reference the pages.
+      if (fixed_len_pages_index_ == fixed_len_pages_.size() - 1) {
+        for (Page& var_len_page : var_len_pages_) {
+          DCHECK(var_len_page.is_open());
+          output_batch->AddBuffer(client, var_len_page.ExtractBuffer(client),
+              RowBatch::FlushMode::NO_FLUSH_RESOURCES);
         }
-        var_len_blocks_.clear();
+        var_len_pages_.clear();
       }
     } else {
-      // To iterate over unpinned runs, we need to exchange this block for the next
-      // in the next GetNext() call, so we need to hold onto the block and signal to
-      // the caller that the block is going to be deleted.
+      // To iterate over unpinned runs, we need to exchange this page for the next
+      // in the next GetNext() call, so we need to hold onto the page and signal to
+      // the caller that the page is going to be deleted.
       output_batch->MarkNeedsDeepCopy();
     }
-    end_of_fixed_len_block_ = true;
+    end_of_fixed_len_page_ = true;
   }
   *eos = false;
   return Status::OK();
 }
 
-Status Sorter::Run::PinNextReadBlock(vector<BufferedBlockMgr::Block*>* blocks,
-    int block_index) {
-  DCHECK_GE(block_index, 0);
-  DCHECK_LT(block_index, blocks->size() - 1);
-  BufferedBlockMgr::Block* curr_block = (*blocks)[block_index];
-  BufferedBlockMgr::Block* next_block = (*blocks)[block_index + 1];
-  DCHECK_EQ(is_pinned_, next_block->is_pinned());
+Status Sorter::Run::PinNextReadPage(vector<Page>* pages, int page_index) {
+  DCHECK_GE(page_index, 0);
+  DCHECK_LT(page_index, pages->size() - 1);
+  Page* curr_page = &(*pages)[page_index];
+  Page* next_page = &(*pages)[page_index + 1];
+  DCHECK_EQ(is_pinned_, next_page->is_pinned());
   if (is_pinned_) {
-    // The current block was attached to a batch and 'next_block' is already pinned.
-    DCHECK(curr_block == NULL);
+    // The current page was attached to a batch and 'next_page' is already pinned.
+    DCHECK(!curr_page->is_open());
     return Status::OK();
   }
-  bool pinned;
-  // Atomically delete the previous block and pin this one. Should not fail due to lack
-  // of memory. Pin() deletes the block even in error cases, so we need to remove it from
-  // the vector first to avoid an inconsistent state.
-  (*blocks)[block_index] = NULL;
-  RETURN_IF_ERROR(next_block->Pin(&pinned, curr_block, false));
-  DCHECK(pinned) << "Atomic delete and pin should not fail without error.";
+  // Close the previous page to free memory and pin the next page. Should always succeed
+  // since the pages are the same size.
+  curr_page->Close(sorter_->buffer_pool_client_);
+  RETURN_IF_ERROR(next_page->Pin(sorter_->buffer_pool_client_));
   return Status::OK();
 }
 
@@ -948,28 +1072,29 @@ void Sorter::Run::CollectNonNullVarSlots(Tuple* src,
   }
 }
 
-Status Sorter::Run::TryAddBlock(AddBlockMode mode,
-    vector<BufferedBlockMgr::Block*>* block_sequence, bool* added) {
-  DCHECK(!block_sequence->empty());
-  BufferedBlockMgr::Block* prev_block;
+Status Sorter::Run::TryAddPage(
+    AddPageMode mode, vector<Page>* page_sequence, bool* added) {
+  DCHECK(!page_sequence->empty());
   if (mode == KEEP_PREV_PINNED) {
-    prev_block = NULL;
+    if (!sorter_->buffer_pool_client_->IncreaseReservationToFit(sorter_->page_len_)) {
+      *added = false;
+      return Status::OK();
+    }
   } else {
     DCHECK(mode == UNPIN_PREV);
-    // Swap the prev block with the next, to guarantee success.
-    prev_block = block_sequence->back();
+    // Unpin the prev page to free up the memory required to pin the next page.
+    page_sequence->back().Unpin(sorter_->buffer_pool_client_);
   }
 
-  BufferedBlockMgr::Block* new_block;
-  RETURN_IF_ERROR(sorter_->block_mgr_->GetNewBlock(
-      sorter_->block_mgr_client_, prev_block, &new_block));
-  if (new_block != NULL) {
-    *added = true;
-    block_sequence->push_back(new_block);
-  } else {
-    DCHECK_EQ(mode, KEEP_PREV_PINNED);
-    *added = false;
-  }
+  RETURN_IF_ERROR(AddPage(page_sequence));
+  *added = true;
+  return Status::OK();
+}
+
+Status Sorter::Run::AddPage(vector<Page>* page_sequence) {
+  Page new_page;
+  RETURN_IF_ERROR(new_page.Init(sorter_));
+  page_sequence->push_back(move(new_page));
   return Status::OK();
 }
 
@@ -983,27 +1108,26 @@ void Sorter::Run::CopyVarLenData(const vector<StringValue*>& string_values,
 }
 
 void Sorter::Run::CopyVarLenDataConvertOffset(const vector<StringValue*>& string_values,
-    int block_index, const uint8_t* block_start, uint8_t* dest) {
-  DCHECK_GE(block_index, 0);
-  DCHECK_GE(dest - block_start, 0);
+    int page_index, const uint8_t* page_start, uint8_t* dest) {
+  DCHECK_GE(page_index, 0);
+  DCHECK_GE(dest - page_start, 0);
 
-  for (StringValue* string_val: string_values) {
+  for (StringValue* string_val : string_values) {
     memcpy(dest, string_val->ptr, string_val->len);
-    DCHECK_LE(dest - block_start, sorter_->block_mgr_->max_block_size());
-    DCHECK_LE(dest - block_start, INT_MAX);
-    int block_offset = dest - block_start;
-    uint64_t packed_offset =
-        (static_cast<uint64_t>(block_index) << 32) | block_offset;
+    DCHECK_LE(dest - page_start, sorter_->page_len_);
+    DCHECK_LE(dest - page_start, numeric_limits<uint32_t>::max());
+    uint32_t page_offset = dest - page_start;
+    uint64_t packed_offset = (static_cast<uint64_t>(page_index) << 32) | page_offset;
     string_val->ptr = reinterpret_cast<char*>(packed_offset);
     dest += string_val->len;
   }
 }
 
 bool Sorter::Run::ConvertOffsetsToPtrs(Tuple* tuple) {
-  // We need to be careful to handle the case where var_len_blocks_ is empty,
+  // We need to be careful to handle the case where var_len_pages_ is empty,
   // e.g. if all strings are NULL.
-  uint8_t* block_start = var_len_blocks_.empty() ? NULL :
-      var_len_blocks_[var_len_blocks_index_]->buffer();
+  uint8_t* page_start =
+      var_len_pages_.empty() ? NULL : var_len_pages_[var_len_pages_index_].data();
 
   const vector<SlotDescriptor*>& string_slots = sort_tuple_desc_->string_slots();
   int num_non_null_string_slots = 0;
@@ -1015,47 +1139,47 @@ bool Sorter::Run::ConvertOffsetsToPtrs(Tuple* tuple) {
     DCHECK(slot_desc->type().IsVarLenStringType());
     StringValue* value = reinterpret_cast<StringValue*>(
         tuple->GetSlot(slot_desc->tuple_offset()));
-    // packed_offset includes the block index in the upper 32 bits and the block
+    // packed_offset includes the page index in the upper 32 bits and the page
     // offset in the lower 32 bits. See CopyVarLenDataConvertOffset().
     uint64_t packed_offset = reinterpret_cast<uint64_t>(value->ptr);
-    int block_index = packed_offset >> 32;
-    int block_offset = packed_offset & 0xFFFFFFFF;
+    uint32_t page_index = packed_offset >> 32;
+    uint32_t page_offset = packed_offset & 0xFFFFFFFF;
 
-    if (block_index > var_len_blocks_index_) {
-      // We've reached the block boundary for the current var-len block.
+    if (page_index > var_len_pages_index_) {
+      // We've reached the page boundary for the current var-len page.
       // This tuple will be returned in the next call to GetNext().
-      DCHECK_GE(block_index, 0);
-      DCHECK_LE(block_index, var_len_blocks_.size());
-      DCHECK_EQ(block_index, var_len_blocks_index_ + 1);
-      DCHECK_EQ(block_offset, 0); // The data is the first thing in the next block.
+      DCHECK_GE(page_index, 0);
+      DCHECK_LE(page_index, var_len_pages_.size());
+      DCHECK_EQ(page_index, var_len_pages_index_ + 1);
+      DCHECK_EQ(page_offset, 0); // The data is the first thing in the next page.
       // This must be the first slot with var len data for the tuple. Var len data
       // for tuple shouldn't be split across blocks.
       DCHECK_EQ(num_non_null_string_slots, 1);
       return false;
     }
 
-    DCHECK_EQ(block_index, var_len_blocks_index_);
-    if (var_len_blocks_.empty()) {
+    DCHECK_EQ(page_index, var_len_pages_index_);
+    if (var_len_pages_.empty()) {
       DCHECK_EQ(value->len, 0);
     } else {
-      DCHECK_LE(block_offset + value->len, var_len_blocks_[block_index]->valid_data_len());
+      DCHECK_LE(page_offset + value->len, var_len_pages_[page_index].valid_data_len());
     }
     // Calculate the address implied by the offset and assign it. May be NULL for
-    // zero-length strings if there are no blocks in the run since block_start is NULL.
-    DCHECK(block_start != NULL || block_offset == 0);
-    value->ptr = reinterpret_cast<char*>(block_start + block_offset);
+    // zero-length strings if there are no pages in the run since page_start is NULL.
+    DCHECK(page_start != NULL || page_offset == 0);
+    value->ptr = reinterpret_cast<char*>(page_start + page_offset);
   }
   return true;
 }
 
 int64_t Sorter::Run::TotalBytes() const {
   int64_t total_bytes = 0;
-  for (BufferedBlockMgr::Block* block: fixed_len_blocks_) {
-    if (block != NULL) total_bytes += block->valid_data_len();
+  for (const Page& page : fixed_len_pages_) {
+    if (page.is_open()) total_bytes += page.valid_data_len();
   }
 
-  for (BufferedBlockMgr::Block* block: var_len_blocks_) {
-    if (block != NULL) total_bytes += block->valid_data_len();
+  for (const Page& page : var_len_pages_) {
+    if (page.is_open()) total_bytes += page.valid_data_len();
   }
   return total_bytes;
 }
@@ -1072,61 +1196,61 @@ Sorter::TupleIterator::TupleIterator(Sorter::Run* run, int64_t index)
   }
 
   const int tuple_size = run->sort_tuple_size_;
-  int block_offset;
+  uint32_t page_offset;
   if (UNLIKELY(index == run->num_tuples())) {
     // If the iterator is initialized past the end, set up buffer_start_index_,
-    // 'buffer_end_index_' and 'block_index_' for the last block, then set 'tuple' to
+    // 'buffer_end_index_' and 'page_index_' for the last page, then set 'tuple' to
     // 'tuple_size' bytes past the last tuple, so everything is correct when Prev() is
     // invoked.
-    block_index_ = run->fixed_len_blocks_.size() - 1;
-    block_offset = ((index - 1) % run->block_capacity_) * tuple_size + tuple_size;
+    page_index_ = run->fixed_len_pages_.size() - 1;
+    page_offset = ((index - 1) % run->page_capacity_) * tuple_size + tuple_size;
   } else {
-    block_index_ = index / run->block_capacity_;
-    block_offset = (index % run->block_capacity_) * tuple_size;
+    page_index_ = index / run->page_capacity_;
+    page_offset = (index % run->page_capacity_) * tuple_size;
   }
-  buffer_start_index_ = block_index_ * run->block_capacity_;
-  buffer_end_index_ = buffer_start_index_ + run->block_capacity_;
-  tuple_ = run->fixed_len_blocks_[block_index_]->buffer() + block_offset;
+  buffer_start_index_ = page_index_ * run->page_capacity_;
+  buffer_end_index_ = buffer_start_index_ + run->page_capacity_;
+  tuple_ = run->fixed_len_pages_[page_index_].data() + page_offset;
 }
 
 void Sorter::TupleIterator::Next(Sorter::Run* run, int tuple_size) {
   DCHECK_LT(index_, run->num_tuples()) << "Can only advance one past end of run";
   tuple_ += tuple_size;
   ++index_;
-  if (UNLIKELY(index_ >= buffer_end_index_)) NextBlock(run, tuple_size);
+  if (UNLIKELY(index_ >= buffer_end_index_)) NextPage(run, tuple_size);
 }
 
-void Sorter::TupleIterator::NextBlock(Sorter::Run* run, int tuple_size) {
-  // When moving after the last tuple, stay at the last block.
+void Sorter::TupleIterator::NextPage(Sorter::Run* run, int tuple_size) {
+  // When moving after the last tuple, stay at the last page.
   if (index_ >= run->num_tuples()) return;
-  ++block_index_;
-  DCHECK_LT(block_index_, run->fixed_len_blocks_.size());
-  buffer_start_index_ = block_index_ * run->block_capacity_;
+  ++page_index_;
+  DCHECK_LT(page_index_, run->fixed_len_pages_.size());
+  buffer_start_index_ = page_index_ * run->page_capacity_;
   DCHECK_EQ(index_, buffer_start_index_);
-  buffer_end_index_ = buffer_start_index_ + run->block_capacity_;
-  tuple_ = run->fixed_len_blocks_[block_index_]->buffer();
+  buffer_end_index_ = buffer_start_index_ + run->page_capacity_;
+  tuple_ = run->fixed_len_pages_[page_index_].data();
 }
 
 void Sorter::TupleIterator::Prev(Sorter::Run* run, int tuple_size) {
   DCHECK_GE(index_, 0) << "Can only advance one before start of run";
   tuple_ -= tuple_size;
   --index_;
-  if (UNLIKELY(index_ < buffer_start_index_)) PrevBlock(run, tuple_size);
+  if (UNLIKELY(index_ < buffer_start_index_)) PrevPage(run, tuple_size);
 }
 
-void Sorter::TupleIterator::PrevBlock(Sorter::Run* run, int tuple_size) {
-  // When moving before the first tuple, stay at the first block.
+void Sorter::TupleIterator::PrevPage(Sorter::Run* run, int tuple_size) {
+  // When moving before the first tuple, stay at the first page.
   if (index_ < 0) return;
-  --block_index_;
-  DCHECK_GE(block_index_, 0);
-  buffer_start_index_ = block_index_ * run->block_capacity_;
-  buffer_end_index_ = buffer_start_index_ + run->block_capacity_;
+  --page_index_;
+  DCHECK_GE(page_index_, 0);
+  buffer_start_index_ = page_index_ * run->page_capacity_;
+  buffer_end_index_ = buffer_start_index_ + run->page_capacity_;
   DCHECK_EQ(index_, buffer_end_index_ - 1);
-  int last_tuple_block_offset = run->sort_tuple_size_ * (run->block_capacity_ - 1);
-  tuple_ = run->fixed_len_blocks_[block_index_]->buffer() + last_tuple_block_offset;
+  int last_tuple_page_offset = run->sort_tuple_size_ * (run->page_capacity_ - 1);
+  tuple_ = run->fixed_len_pages_[page_index_].data() + last_tuple_page_offset;
 }
 
-Sorter::TupleSorter::TupleSorter(const TupleRowComparator& comp, int64_t block_size,
+Sorter::TupleSorter::TupleSorter(const TupleRowComparator& comp, int64_t page_size,
     int tuple_size, RuntimeState* state)
   : tuple_size_(tuple_size),
     comparator_(comp),
@@ -1340,13 +1464,15 @@ inline void Sorter::TupleSorter::Swap(Tuple* left, Tuple* right, Tuple* swap_tup
 
 Sorter::Sorter(const TupleRowComparator& compare_less_than,
     const vector<ScalarExpr*>& sort_tuple_exprs, RowDescriptor* output_row_desc,
-    MemTracker* mem_tracker, RuntimeProfile* profile, RuntimeState* state,
+    MemTracker* mem_tracker, BufferPool::ClientHandle* buffer_pool_client,
+    int64_t page_len, RuntimeProfile* profile, RuntimeState* state, int node_id,
     bool enable_spilling)
-  : state_(state),
+  : node_id_(node_id),
+    state_(state),
     compare_less_than_(compare_less_than),
     in_mem_tuple_sorter_(NULL),
-    block_mgr_(state->block_mgr()),
-    block_mgr_client_(NULL),
+    buffer_pool_client_(buffer_pool_client),
+    page_len_(page_len),
     has_var_len_slots_(false),
     sort_tuple_exprs_(sort_tuple_exprs),
     mem_tracker_(mem_tracker),
@@ -1370,10 +1496,24 @@ Sorter::~Sorter() {
 
 Status Sorter::Prepare(ObjectPool* obj_pool, MemPool* expr_mem_pool) {
   DCHECK(in_mem_tuple_sorter_ == NULL) << "Already prepared";
+  // Page byte offsets are packed into uint32_t values, which limits the supported
+  // page size.
+  if (page_len_ > numeric_limits<uint32_t>::max()) {
+    return Status(Substitute(
+          "Page size $0 exceeded maximum supported in sorter ($1)",
+          PrettyPrinter::PrintBytes(page_len_),
+          PrettyPrinter::PrintBytes(numeric_limits<uint32_t>::max())));
+  }
+
   TupleDescriptor* sort_tuple_desc = output_row_desc_->tuple_descriptors()[0];
+  if (sort_tuple_desc->byte_size() > page_len_) {
+    return Status(TErrorCode::MAX_ROW_SIZE,
+        PrettyPrinter::Print(sort_tuple_desc->byte_size(), TUnit::BYTES), node_id_,
+        PrettyPrinter::Print(0, TUnit::BYTES));
+  }
   has_var_len_slots_ = sort_tuple_desc->HasVarlenSlots();
-  in_mem_tuple_sorter_.reset(new TupleSorter(compare_less_than_,
-      block_mgr_->max_block_size(), sort_tuple_desc->byte_size(), state_));
+  in_mem_tuple_sorter_.reset(new TupleSorter(compare_less_than_, page_len_,
+      sort_tuple_desc->byte_size(), state_));
 
   initial_runs_counter_ = ADD_COUNTER(profile_, "InitialRunsCreated", TUnit::UNIT);
   spilled_runs_counter_ = ADD_COUNTER(profile_, "SpilledRuns", TUnit::UNIT);
@@ -1382,17 +1522,6 @@ Status Sorter::Prepare(ObjectPool* obj_pool, MemPool* expr_mem_pool) {
   sorted_data_size_ = ADD_COUNTER(profile_, "SortDataSize", TUnit::BYTES);
   run_sizes_ = ADD_SUMMARY_STATS_COUNTER(profile_, "NumRowsPerRun", TUnit::UNIT);
 
-  // If spilling is enabled, we need enough buffers to perform merges. Otherwise, there
-  // won't be any merges and we only need 1 buffer.
-  // Must be kept in sync with SortNode.computeResourceProfile() in fe.
-  int min_buffers_required = enable_spilling_ ? MIN_BUFFERS_PER_MERGE : 1;
-  // Fixed and var-length blocks are separate, so we need twice as many blocks for both if
-  // there is var-length data.
-  if (sort_tuple_desc->HasVarlenSlots()) min_buffers_required *= 2;
-
-  RETURN_IF_ERROR(block_mgr_->RegisterClient(Substitute("Sorter ptr=$0", this),
-      min_buffers_required, false, mem_tracker_, state_, &block_mgr_client_));
-
   RETURN_IF_ERROR(ScalarExprEvaluator::Create(sort_tuple_exprs_, state_, obj_pool,
       expr_mem_pool, &sort_tuple_expr_evals_));
   return Status::OK();
@@ -1413,6 +1542,15 @@ void Sorter::FreeLocalAllocations() {
   ScalarExprEvaluator::FreeLocalAllocations(sort_tuple_expr_evals_);
 }
 
+int64_t Sorter::ComputeMinReservation() {
+  // Must be kept in sync with SortNode.computeNodeResourceProfile() in fe.
+  int min_buffers_required = enable_spilling_ ? MIN_BUFFERS_PER_MERGE : 1;
+  // Fixed and var-length pages are separate, so we need double the pages
+  // if there is var-length data.
+  if (output_row_desc_->HasVarlenSlots()) min_buffers_required *= 2;
+  return min_buffers_required * page_len_;
+}
+
 Status Sorter::AddBatch(RowBatch* batch) {
   DCHECK(unsorted_run_ != NULL);
   DCHECK(batch != NULL);
@@ -1424,11 +1562,12 @@ Status Sorter::AddBatch(RowBatch* batch) {
 
     cur_batch_index += num_processed;
     if (cur_batch_index < batch->num_rows()) {
-      // The current run is full. Sort it and begin the next one.
+      // The current run is full. Sort it, spill it and begin the next one.
+      RETURN_IF_ERROR(state_->StartSpilling(mem_tracker_));
       RETURN_IF_ERROR(SortCurrentInputRun());
-      RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllBlocks());
-      unsorted_run_ = obj_pool_.Add(
-          new Run(this, output_row_desc_->tuple_descriptors()[0], true));
+      RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllPages());
+      unsorted_run_ =
+          obj_pool_.Add(new Run(this, output_row_desc_->tuple_descriptors()[0], true));
       RETURN_IF_ERROR(unsorted_run_->Init());
     }
   }
@@ -1459,7 +1598,7 @@ Status Sorter::InputDone() {
   // Unpin the final run to free up memory for the merge.
   // TODO: we could keep it in memory in some circumstances as an optimisation, once
   // we have a buffer pool with more reliable reservations (IMPALA-3200).
-  RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllBlocks());
+  RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllPages());
 
   // Merge intermediate runs until we have a final merge set-up.
   // TODO: Attempt to allocate more memory before doing intermediate merges. This may
@@ -1487,7 +1626,6 @@ void Sorter::Reset() {
 
 void Sorter::Close(RuntimeState* state) {
   CleanupAllRuns();
-  block_mgr_->ClearReservations(block_mgr_client_);
   obj_pool_.Clear();
   ScalarExprEvaluator::Close(sort_tuple_expr_evals_, state);
 }
@@ -1495,9 +1633,9 @@ void Sorter::Close(RuntimeState* state) {
 void Sorter::CleanupAllRuns() {
   Run::CleanupRuns(&sorted_runs_);
   Run::CleanupRuns(&merging_runs_);
-  if (unsorted_run_ != NULL) unsorted_run_->DeleteAllBlocks();
+  if (unsorted_run_ != NULL) unsorted_run_->CloseAllPages();
   unsorted_run_ = NULL;
-  if (merge_output_run_ != NULL) merge_output_run_->DeleteAllBlocks();
+  if (merge_output_run_ != NULL) merge_output_run_->CloseAllPages();
   merge_output_run_ = NULL;
 }
 
@@ -1519,10 +1657,10 @@ Status Sorter::SortCurrentInputRun() {
 
 Status Sorter::MergeIntermediateRuns() {
   DCHECK_GE(sorted_runs_.size(), 2);
-  int pinned_blocks_per_run = has_var_len_slots_ ? 2 : 1;
-  int max_runs_per_final_merge = MAX_BUFFERS_PER_MERGE / pinned_blocks_per_run;
+  int pinned_pages_per_run = has_var_len_slots_ ? 2 : 1;
+  int max_runs_per_final_merge = MAX_BUFFERS_PER_MERGE / pinned_pages_per_run;
 
-  // During an intermediate merge, the one or two blocks from the output sorted run
+  // During an intermediate merge, the one or two pages from the output sorted run
   // that are being written must be pinned.
   int max_runs_per_intermediate_merge = max_runs_per_final_merge - 1;
   DCHECK_GT(max_runs_per_intermediate_merge, 1);
@@ -1549,7 +1687,7 @@ Status Sorter::MergeIntermediateRuns() {
     if (sorted_runs_.empty()) {
       // Don't need intermediate run for final merge.
       if (merge_output_run_ != NULL) {
-        merge_output_run_->DeleteAllBlocks();
+        merge_output_run_->CloseAllPages();
         merge_output_run_ = NULL;
       }
       return Status::OK();
@@ -1604,7 +1742,8 @@ Status Sorter::CreateMerger(int max_num_runs) {
 }
 
 Status Sorter::ExecuteIntermediateMerge(Sorter::Run* merged_run) {
-  RowBatch intermediate_merge_batch(output_row_desc_, state_->batch_size(), mem_tracker_);
+  RowBatch intermediate_merge_batch(
+      output_row_desc_, state_->batch_size(), mem_tracker_);
   bool eos = false;
   while (!eos) {
     // Copy rows into the new run until done.
@@ -1621,5 +1760,4 @@ Status Sorter::ExecuteIntermediateMerge(Sorter::Run* merged_run) {
   RETURN_IF_ERROR(merged_run->FinalizeInput());
   return Status::OK();
 }
-
 } // namespace impala


[11/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
IMPALA-4674: Part 2: port backend exec to BufferPool

Always create global BufferPool at startup using 80% of memory and
limit reservations to 80% of query memory (same as BufferedBlockMgr).
The query's initial reservation is computed in the planner, claimed
centrally (managed by the InitialReservations class) and distributed
to query operators from there.

min_spillable_buffer_size and default_spillable_buffer_size query
options control the buffer size that the planner selects for
spilling operators.

Port ExecNodes to use BufferPool:
  * Each ExecNode has to claim its reservation during Open()
  * Port Sorter to use BufferPool.
  * Switch from BufferedTupleStream to BufferedTupleStreamV2
  * Port HashTable to use BufferPool via a Suballocator.

This also makes PAGG memory consumption more efficient (avoid wasting buffers)
and improves the spilling algorithm:
* Allow preaggs to execute with 0 reservation - if streams and hash tables
  cannot be allocated, it will pass through rows.
* Halve the buffer requirement for spilling aggs - avoid allocating
  buffers for aggregated and unaggregated streams simultaneously.
* Rebuild spilled partitions instead of repartitioning (IMPALA-2708)

TODO in follow-up patches:
* Rename BufferedTupleStreamV2 to BufferedTupleStream
* Implement max_row_size query option.

Testing:
* Updated tests to reflect new memory requirements

Change-Id: I7fc7fe1c04e9dfb1a0c749fb56a5e0f2bf9c6c3e
Reviewed-on: http://gerrit.cloudera.org:8080/5801
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a98b90bd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a98b90bd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a98b90bd

Branch: refs/heads/master
Commit: a98b90bd3877886e97dc2385cfdf5e3f95245533
Parents: d5b0c6b
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed Mar 16 16:09:36 2016 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Aug 5 01:03:02 2017 +0000

----------------------------------------------------------------------
 be/src/codegen/gen_ir_descriptions.py           |    2 +-
 be/src/exec/analytic-eval-node.cc               |   53 +-
 be/src/exec/analytic-eval-node.h                |   25 +-
 be/src/exec/exec-node.cc                        |   45 +-
 be/src/exec/exec-node.h                         |   17 +
 be/src/exec/hash-table-test.cc                  |  251 ++-
 be/src/exec/hash-table.cc                       |  149 +-
 be/src/exec/hash-table.h                        |  140 +-
 be/src/exec/hash-table.inline.h                 |   20 +-
 be/src/exec/nested-loop-join-builder.cc         |    3 +-
 be/src/exec/partial-sort-node.cc                |    7 +-
 be/src/exec/partial-sort-node.h                 |    1 -
 be/src/exec/partitioned-aggregation-node-ir.cc  |   20 +-
 be/src/exec/partitioned-aggregation-node.cc     |  639 ++++----
 be/src/exec/partitioned-aggregation-node.h      |  192 ++-
 be/src/exec/partitioned-hash-join-builder-ir.cc |   12 +-
 be/src/exec/partitioned-hash-join-builder.cc    |  159 +-
 be/src/exec/partitioned-hash-join-builder.h     |   76 +-
 be/src/exec/partitioned-hash-join-node-ir.cc    |    7 +-
 be/src/exec/partitioned-hash-join-node.cc       |  136 +-
 be/src/exec/partitioned-hash-join-node.h        |   26 +-
 be/src/exec/partitioned-hash-join-node.inline.h |    2 +-
 be/src/exec/sort-node.cc                        |   15 +-
 be/src/exec/sort-node.h                         |    3 +-
 be/src/runtime/CMakeLists.txt                   |    5 +-
 be/src/runtime/buffered-block-mgr-test.cc       | 1547 ------------------
 be/src/runtime/buffered-block-mgr.cc            | 1254 --------------
 be/src/runtime/buffered-block-mgr.h             |  606 -------
 be/src/runtime/buffered-tuple-stream-test.cc    | 1264 --------------
 be/src/runtime/buffered-tuple-stream.cc         |  903 ----------
 be/src/runtime/buffered-tuple-stream.h          |  561 -------
 be/src/runtime/buffered-tuple-stream.inline.h   |   59 -
 be/src/runtime/bufferpool/buffer-pool.cc        |   12 +-
 be/src/runtime/bufferpool/buffer-pool.h         |    8 +
 be/src/runtime/bufferpool/reservation-tracker.h |    4 +
 be/src/runtime/disk-io-mgr.cc                   |    7 +-
 be/src/runtime/exec-env.cc                      |   35 +-
 be/src/runtime/exec-env.h                       |    4 +-
 be/src/runtime/fragment-instance-state.cc       |    2 -
 be/src/runtime/initial-reservations.cc          |   90 +
 be/src/runtime/initial-reservations.h           |   79 +
 be/src/runtime/query-exec-mgr.cc                |    2 +
 be/src/runtime/query-state.cc                   |   91 +-
 be/src/runtime/query-state.h                    |   51 +-
 be/src/runtime/row-batch.cc                     |   19 -
 be/src/runtime/row-batch.h                      |   13 -
 be/src/runtime/runtime-filter.h                 |    1 +
 be/src/runtime/runtime-state.cc                 |   52 +-
 be/src/runtime/runtime-state.h                  |   32 +-
 be/src/runtime/sorter.cc                        | 1058 ++++++------
 be/src/runtime/sorter.h                         |   65 +-
 be/src/runtime/test-env.cc                      |   23 +-
 be/src/runtime/test-env.h                       |    9 +-
 be/src/runtime/tmp-file-mgr-test.cc             |   10 +-
 be/src/runtime/tmp-file-mgr.h                   |   23 +-
 be/src/service/client-request-state.cc          |    4 +-
 be/src/service/query-options.cc                 |   28 +-
 be/src/service/query-options.h                  |    6 +-
 be/src/util/bloom-filter.h                      |    2 +-
 be/src/util/static-asserts.cc                   |    2 -
 common/thrift/Frontend.thrift                   |   16 +-
 common/thrift/ImpalaInternalService.thrift      |   22 +-
 common/thrift/ImpalaService.thrift              |    8 +-
 common/thrift/PlanNodes.thrift                  |   18 +
 common/thrift/generate_error_codes.py           |   10 +-
 .../org/apache/impala/common/RuntimeEnv.java    |   10 -
 .../apache/impala/planner/AggregationNode.java  |   11 +-
 .../apache/impala/planner/AnalyticEvalNode.java |    7 +-
 .../impala/planner/DataSourceScanNode.java      |    2 +-
 .../apache/impala/planner/DataStreamSink.java   |    2 +-
 .../org/apache/impala/planner/EmptySetNode.java |    2 +-
 .../org/apache/impala/planner/ExchangeNode.java |    2 +-
 .../apache/impala/planner/HBaseScanNode.java    |    2 +-
 .../apache/impala/planner/HBaseTableSink.java   |    2 +-
 .../org/apache/impala/planner/HashJoinNode.java |    8 +-
 .../org/apache/impala/planner/HdfsScanNode.java |    4 +-
 .../apache/impala/planner/HdfsTableSink.java    |    2 +-
 .../apache/impala/planner/JoinBuildSink.java    |    2 +-
 .../org/apache/impala/planner/KuduScanNode.java |    2 +-
 .../apache/impala/planner/KuduTableSink.java    |    2 +-
 .../impala/planner/NestedLoopJoinNode.java      |    5 +-
 .../org/apache/impala/planner/PlanNode.java     |   14 +-
 .../org/apache/impala/planner/PlanRootSink.java |    2 +-
 .../java/org/apache/impala/planner/Planner.java |   12 +-
 .../apache/impala/planner/ResourceProfile.java  |   72 +-
 .../org/apache/impala/planner/SelectNode.java   |    2 +-
 .../impala/planner/SingularRowSrcNode.java      |    2 +-
 .../org/apache/impala/planner/SortNode.java     |   47 +-
 .../org/apache/impala/planner/SubplanNode.java  |    2 +-
 .../org/apache/impala/planner/UnionNode.java    |    2 +-
 .../org/apache/impala/planner/UnnestNode.java   |    2 +-
 .../org/apache/impala/service/Frontend.java     |    2 +-
 .../org/apache/impala/planner/PlannerTest.java  |    2 -
 .../queries/PlannerTest/constant-folding.test   |   32 +-
 .../queries/PlannerTest/disable-codegen.test    |    2 +-
 .../PlannerTest/fk-pk-join-detection.test       |   52 +-
 .../queries/PlannerTest/mt-dop-validation.test  |   30 +-
 .../queries/PlannerTest/parquet-filtering.test  |    6 +-
 .../PlannerTest/resource-requirements.test      |  418 ++---
 .../PlannerTest/sort-expr-materialization.test  |   30 +-
 .../PlannerTest/spillable-buffer-sizing.test    |  112 +-
 .../queries/PlannerTest/tablesample.test        |    4 +-
 .../queries/QueryTest/analytic-fns.test         |   12 +-
 .../queries/QueryTest/explain-level0.test       |    2 +-
 .../queries/QueryTest/explain-level1.test       |    2 +-
 .../queries/QueryTest/explain-level2.test       |    6 +-
 .../queries/QueryTest/explain-level3.test       |    6 +-
 .../queries/QueryTest/nested-types-tpch.test    |   24 +-
 .../QueryTest/runtime_row_filters_phj.test      |    5 +-
 ...ingle-node-joins-with-limits-exhaustive.test |    2 +-
 .../QueryTest/single-node-large-sorts.test      |    2 +-
 .../queries/QueryTest/spilling.test             |   87 +-
 .../targeted-stress/queries/agg_stress.test     |    2 +-
 .../workloads/tpch/queries/insert_parquet.test  |    2 +
 tests/comparison/discrepancy_searcher.py        |    4 +-
 tests/custom_cluster/test_scratch_disk.py       |   12 +-
 tests/custom_cluster/test_spilling.py           |   47 -
 tests/query_test/test_cancellation.py           |   10 +-
 tests/query_test/test_mem_usage_scaling.py      |   31 +-
 tests/query_test/test_nested_types.py           |    1 -
 tests/query_test/test_scratch_limit.py          |   12 +-
 tests/query_test/test_sort.py                   |   26 +-
 tests/query_test/test_spilling.py               |   39 +
 123 files changed, 2885 insertions(+), 8366 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/codegen/gen_ir_descriptions.py
----------------------------------------------------------------------
diff --git a/be/src/codegen/gen_ir_descriptions.py b/be/src/codegen/gen_ir_descriptions.py
index 94dc86a..be4be80 100755
--- a/be/src/codegen/gen_ir_descriptions.py
+++ b/be/src/codegen/gen_ir_descriptions.py
@@ -119,7 +119,7 @@ ir_functions = [
   ["PHJ_PROCESS_PROBE_BATCH_FULL_OUTER_JOIN",
    "_ZN6impala23PartitionedHashJoinNode17ProcessProbeBatchILi8EEEiNS_13TPrefetchMode4typeEPNS_8RowBatchEPNS_12HashTableCtxEPNS_6StatusE"],
   ["PHJ_INSERT_BATCH",
-   "_ZN6impala10PhjBuilder9Partition11InsertBatchENS_13TPrefetchMode4typeEPNS_12HashTableCtxEPNS_8RowBatchERKSt6vectorINS_19BufferedTupleStream6RowIdxESaISA_EE"],
+   "_ZN6impala10PhjBuilder9Partition11InsertBatchENS_13TPrefetchMode4typeEPNS_12HashTableCtxEPNS_8RowBatchERKSt6vectorIPhSaIS9_EEPNS_6StatusE"],
   ["HASH_TABLE_GET_HASH_SEED",
    "_ZNK6impala12HashTableCtx11GetHashSeedEv"],
   ["HASH_TABLE_GET_BUILD_EXPR_EVALUATORS",

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/analytic-eval-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/analytic-eval-node.cc b/be/src/exec/analytic-eval-node.cc
index b789188..f6d96ae 100644
--- a/be/src/exec/analytic-eval-node.cc
+++ b/be/src/exec/analytic-eval-node.cc
@@ -23,9 +23,10 @@
 #include "exprs/agg-fn-evaluator.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
 #include "udf/udf-internal.h"
@@ -34,13 +35,14 @@
 #include "common/names.h"
 
 static const int MAX_TUPLE_POOL_SIZE = 8 * 1024 * 1024; // 8MB
+static const int MIN_REQUIRED_BUFFERS = 2;
 
 using namespace strings;
 
 namespace impala {
 
-AnalyticEvalNode::AnalyticEvalNode(ObjectPool* pool, const TPlanNode& tnode,
-    const DescriptorTbl& descs)
+AnalyticEvalNode::AnalyticEvalNode(
+    ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs)
   : ExecNode(pool, tnode, descs),
     window_(tnode.analytic_node.window),
     intermediate_tuple_desc_(
@@ -51,7 +53,6 @@ AnalyticEvalNode::AnalyticEvalNode(ObjectPool* pool, const TPlanNode& tnode,
     rows_end_offset_(0),
     has_first_val_null_offset_(false),
     first_val_null_offset_(0),
-    client_(nullptr),
     child_tuple_cmp_row_(nullptr),
     last_result_idx_(-1),
     prev_pool_last_result_idx_(-1),
@@ -110,6 +111,7 @@ AnalyticEvalNode::~AnalyticEvalNode() {
 Status AnalyticEvalNode::Init(const TPlanNode& tnode, RuntimeState* state) {
   RETURN_IF_ERROR(ExecNode::Init(tnode, state));
   DCHECK_EQ(conjunct_evals_.size(), 0);
+  state_ = state;
   const TAnalyticNode& analytic_node = tnode.analytic_node;
   bool has_lead_fn = false;
 
@@ -154,6 +156,8 @@ Status AnalyticEvalNode::Prepare(RuntimeState* state) {
   SCOPED_TIMER(runtime_profile_->total_time_counter());
   RETURN_IF_ERROR(ExecNode::Prepare(state));
   DCHECK(child(0)->row_desc()->IsPrefixOf(*row_desc()));
+  DCHECK_GE(resource_profile_.min_reservation,
+      resource_profile_.spillable_buffer_size * MIN_REQUIRED_BUFFERS);
   curr_tuple_pool_.reset(new MemPool(mem_tracker()));
   prev_tuple_pool_.reset(new MemPool(mem_tracker()));
   mem_pool_.reset(new MemPool(mem_tracker()));
@@ -175,12 +179,6 @@ Status AnalyticEvalNode::Prepare(RuntimeState* state) {
         fn_pool_.get(), &order_by_eq_expr_eval_));
     AddEvaluatorToFree(order_by_eq_expr_eval_);
   }
-
-  // Must be kept in sync with AnalyticEvalNode.computeResourceProfile() in fe.
-  const int MIN_REQUIRED_BUFFERS = 2;
-  RETURN_IF_ERROR(state->block_mgr()->RegisterClient(
-      Substitute("AnalyticEvalNode id=$0 ptr=$1", id_, this),
-      MIN_REQUIRED_BUFFERS, false, mem_tracker(), state, &client_));
   return Status::OK();
 }
 
@@ -190,22 +188,20 @@ Status AnalyticEvalNode::Open(RuntimeState* state) {
   RETURN_IF_CANCELLED(state);
   RETURN_IF_ERROR(QueryMaintenance(state));
   RETURN_IF_ERROR(child(0)->Open(state));
-  DCHECK(client_ != nullptr);
-  DCHECK(input_stream_ == nullptr);
-  input_stream_.reset(
-      new BufferedTupleStream(state, child(0)->row_desc(), state->block_mgr(), client_,
-          false /* use_initial_small_buffers */, true /* read_write */));
-  RETURN_IF_ERROR(input_stream_->Init(id(), runtime_profile(), true));
-  bool got_write_buffer;
-  RETURN_IF_ERROR(input_stream_->PrepareForWrite(&got_write_buffer));
-  if (!got_write_buffer) {
-    return state->block_mgr()->MemLimitTooLowError(client_, id());
-  }
-  bool got_read_buffer;
-  RETURN_IF_ERROR(input_stream_->PrepareForRead(true, &got_read_buffer));
-  if (!got_read_buffer) {
-    return state->block_mgr()->MemLimitTooLowError(client_, id());
+
+  // Claim reservation after the child has been opened to reduce the peak reservation
+  // requirement.
+  if (!buffer_pool_client_.is_registered()) {
+    RETURN_IF_ERROR(ClaimBufferReservation(state));
   }
+  DCHECK(input_stream_ == nullptr);
+  input_stream_.reset(new BufferedTupleStreamV2(state, child(0)->row_desc(),
+      &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+      resource_profile_.spillable_buffer_size));
+  RETURN_IF_ERROR(input_stream_->Init(id(), true));
+  bool success;
+  RETURN_IF_ERROR(input_stream_->PrepareForReadWrite(true, &success));
+  DCHECK(success) << "Had reservation: " << buffer_pool_client_.DebugString();
 
   for (int i = 0; i < analytic_fn_evals_.size(); ++i) {
     RETURN_IF_ERROR(analytic_fn_evals_[i]->Open(state));
@@ -366,8 +362,8 @@ inline Status AnalyticEvalNode::AddRow(int64_t stream_idx, TupleRow* row) {
     // the stream and continue writing/reading in unpinned mode.
     // TODO: Consider re-pinning later if the output stream is fully consumed.
     RETURN_IF_ERROR(status);
-    RETURN_IF_ERROR(
-        input_stream_->UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
+    RETURN_IF_ERROR(state_->StartSpilling(mem_tracker()));
+    input_stream_->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
     VLOG_FILE << id() << " Unpin input stream while adding row idx=" << stream_idx;
     if (!input_stream_->AddRow(row, &status)) {
       // Rows should be added in unpinned mode unless an error occurs.
@@ -627,7 +623,7 @@ Status AnalyticEvalNode::ProcessChildBatch(RuntimeState* state) {
             << " tuple pool size:" << curr_tuple_pool_->total_allocated_bytes();
   SCOPED_TIMER(evaluation_timer_);
 
-  // BufferedTupleStream::num_rows() returns the total number of rows that have been
+  // BufferedTupleStreamV2::num_rows() returns the total number of rows that have been
   // inserted into the stream (it does not decrease when we read rows), so the index of
   // the next input row that will be inserted will be the current size of the stream.
   int64_t stream_idx = input_stream_->num_rows();
@@ -857,7 +853,6 @@ Status AnalyticEvalNode::Reset(RuntimeState* state) {
 
 void AnalyticEvalNode::Close(RuntimeState* state) {
   if (is_closed()) return;
-  if (client_ != nullptr) state->block_mgr()->ClearReservations(client_);
   // We may need to clean up input_stream_ if an error occurred at some point.
   if (input_stream_ != nullptr) {
     input_stream_->Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/analytic-eval-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/analytic-eval-node.h b/be/src/exec/analytic-eval-node.h
index 89c5cf3..eab9198 100644
--- a/be/src/exec/analytic-eval-node.h
+++ b/be/src/exec/analytic-eval-node.h
@@ -19,8 +19,7 @@
 #define IMPALA_EXEC_ANALYTIC_EVAL_NODE_H
 
 #include "exec/exec-node.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.h"
+#include "runtime/buffered-tuple-stream-v2.h"
 #include "runtime/tuple.h"
 
 namespace impala {
@@ -189,6 +188,10 @@ class AnalyticEvalNode : public ExecNode {
   /// Debug string containing the window definition.
   std::string DebugWindowString() const;
 
+  /// The RuntimeState for the fragment instance containing this AnalyticEvalNode. Set
+  /// in Init().
+  RuntimeState* state_;
+
   /// Window over which the analytic functions are evaluated. Only used if fn_scope_
   /// is ROWS or RANGE.
   /// TODO: fn_scope_ and window_ are candidates to be removed during codegen
@@ -254,9 +257,6 @@ class AnalyticEvalNode : public ExecNode {
   boost::scoped_ptr<MemPool> curr_tuple_pool_;
   boost::scoped_ptr<MemPool> prev_tuple_pool_;
 
-  /// Block manager client used by input_stream_. Not owned.
-  BufferedBlockMgr::Client* client_ = nullptr;
-
   /////////////////////////////////////////
   /// BEGIN: Members that must be Reset()
 
@@ -330,15 +330,16 @@ class AnalyticEvalNode : public ExecNode {
 
   /// Buffers input rows added in ProcessChildBatch() until enough rows are able to
   /// be returned by GetNextOutputBatch(), in which case row batches are returned from
-  /// the front of the stream and the underlying buffered blocks are deleted once read.
+  /// the front of the stream and the underlying buffers are deleted once read.
   /// The number of rows that must be buffered may vary from an entire partition (e.g.
-  /// no order by clause) to a single row (e.g. ROWS windows). When the amount of
-  /// buffered data exceeds the available memory in the underlying BufferedBlockMgr,
-  /// input_stream_ is unpinned (i.e., possibly spilled to disk if necessary).
-  /// The input stream owns tuple data backing rows returned in GetNext(). The blocks
-  /// with tuple data are attached to an output row batch on eos or ReachedLimit().
+  /// no order by clause) to a single row (e.g. ROWS windows). If the amount of buffered
+  /// data in 'input_stream_' exceeds the ExecNode's buffer reservation and the stream
+  /// cannot increase the reservation, then 'input_stream_' is unpinned (i.e., spilled to
+  /// disk). The input stream owns tuple data backing rows returned in GetNext(). The
+  /// buffers with tuple data are attached to an output row batch on eos or
+  /// ReachedLimit().
   /// TODO: Consider re-pinning unpinned streams when possible.
-  boost::scoped_ptr<BufferedTupleStream> input_stream_;
+  boost::scoped_ptr<BufferedTupleStreamV2> input_stream_;
 
   /// Pool used for O(1) allocations that live until Close() or Reset().
   /// Does not own data backing tuples returned in GetNext(), so it does not

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/exec-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/exec-node.cc b/be/src/exec/exec-node.cc
index c3d9946..61c8d40 100644
--- a/be/src/exec/exec-node.cc
+++ b/be/src/exec/exec-node.cc
@@ -34,10 +34,10 @@
 #include "exec/empty-set-node.h"
 #include "exec/exchange-node.h"
 #include "exec/hbase-scan-node.h"
-#include "exec/hdfs-scan-node.h"
 #include "exec/hdfs-scan-node-mt.h"
-#include "exec/kudu-scan-node.h"
+#include "exec/hdfs-scan-node.h"
 #include "exec/kudu-scan-node-mt.h"
+#include "exec/kudu-scan-node.h"
 #include "exec/kudu-util.h"
 #include "exec/nested-loop-join-node.h"
 #include "exec/partial-sort-node.h"
@@ -50,9 +50,14 @@
 #include "exec/topn-node.h"
 #include "exec/union-node.h"
 #include "exec/unnest-node.h"
+#include "exprs/expr.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/descriptors.h"
-#include "runtime/mem-tracker.h"
+#include "runtime/exec-env.h"
+#include "runtime/initial-reservations.h"
 #include "runtime/mem-pool.h"
+#include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
 #include "util/debug-util.h"
@@ -61,7 +66,10 @@
 #include "common/names.h"
 
 using namespace llvm;
+using strings::Substitute;
 
+DECLARE_int32(be_port);
+DECLARE_string(hostname);
 DEFINE_bool(enable_partitioned_hash_join, true, "Deprecated - has no effect");
 DEFINE_bool(enable_partitioned_aggregation, true, "Deprecated - has no effect");
 
@@ -116,6 +124,7 @@ ExecNode::ExecNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl
     type_(tnode.node_type),
     pool_(pool),
     row_descriptor_(descs, tnode.row_tuples, tnode.nullable_tuples),
+    resource_profile_(tnode.resource_profile),
     debug_phase_(TExecNodePhase::INVALID),
     debug_action_(TDebugAction::WAIT),
     limit_(tnode.limit),
@@ -195,7 +204,12 @@ void ExecNode::Close(RuntimeState* state) {
   ScalarExprEvaluator::Close(conjunct_evals_, state);
   ScalarExpr::Close(conjuncts_);
   if (expr_mem_pool() != nullptr) expr_mem_pool_->FreeAll();
-
+  if (buffer_pool_client_.is_registered()) {
+    VLOG_FILE << id_ << " returning reservation " << resource_profile_.min_reservation;
+    state->query_state()->initial_reservations()->Return(
+        &buffer_pool_client_, resource_profile_.min_reservation);
+    state->exec_env()->buffer_pool()->DeregisterClient(&buffer_pool_client_);
+  }
   if (mem_tracker() != NULL && mem_tracker()->consumption() != 0) {
     LOG(WARNING) << "Query " << state->query_id() << " may have leaked memory." << endl
                  << state->instance_mem_tracker()->LogUsage();
@@ -204,6 +218,29 @@ void ExecNode::Close(RuntimeState* state) {
   }
 }
 
+Status ExecNode::ClaimBufferReservation(RuntimeState* state) {
+  DCHECK(!buffer_pool_client_.is_registered());
+  BufferPool* buffer_pool = ExecEnv::GetInstance()->buffer_pool();
+  // Check the minimum buffer size in case the minimum buffer size used by the planner
+  // doesn't match this backend's.
+  if (resource_profile_.__isset.spillable_buffer_size &&
+      resource_profile_.spillable_buffer_size < buffer_pool->min_buffer_len()) {
+    return Status(Substitute("Spillable buffer size for node $0 of $1 bytes is less "
+                             "than the minimum buffer pool buffer size of $2 bytes",
+        id_, resource_profile_.spillable_buffer_size, buffer_pool->min_buffer_len()));
+  }
+
+  RETURN_IF_ERROR(buffer_pool->RegisterClient(
+      Substitute("$0 id=$1 ptr=$2", PrintPlanNodeType(type_), id_, this),
+      state->query_state()->file_group(), state->instance_buffer_reservation(),
+      mem_tracker(), resource_profile_.max_reservation, runtime_profile(),
+      &buffer_pool_client_));
+  VLOG_FILE << id_ << " claiming reservation " << resource_profile_.min_reservation;
+  state->query_state()->initial_reservations()->Claim(
+      &buffer_pool_client_, resource_profile_.min_reservation);
+  return Status::OK();
+}
+
 Status ExecNode::CreateTree(
     RuntimeState* state, const TPlan& plan, const DescriptorTbl& descs, ExecNode** root) {
   if (plan.nodes.size() == 0) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/exec-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/exec-node.h b/be/src/exec/exec-node.h
index a107f62..60efff0 100644
--- a/be/src/exec/exec-node.h
+++ b/be/src/exec/exec-node.h
@@ -26,6 +26,8 @@
 #include "common/status.h"
 #include "exprs/scalar-expr-evaluator.h"
 #include "gen-cpp/PlanNodes_types.h"
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/reservation-tracker.h"
 #include "runtime/descriptors.h" // for RowDescriptor
 #include "util/blocking-queue.h"
 #include "util/runtime-profile.h"
@@ -227,6 +229,12 @@ class ExecNode {
  protected:
   friend class DataSink;
 
+  /// Initialize 'buffer_pool_client_' and claim the initial reservation for this
+  /// ExecNode. Only needs to be called by ExecNodes that will use the client.
+  /// The client is automatically cleaned up in Close(). Should not be called if
+  /// the client is already open.
+  Status ClaimBufferReservation(RuntimeState* state);
+
   /// Extends blocking queue for row batches. Row batches have a property that
   /// they must be processed in the order they were produced, even in cancellation
   /// paths. Preceding row batches can contain ptrs to memory in subsequent row batches
@@ -276,6 +284,9 @@ class ExecNode {
   std::vector<ExecNode*> children_;
   RowDescriptor row_descriptor_;
 
+  /// Resource information sent from the frontend.
+  const TBackendResourceProfile resource_profile_;
+
   /// debug-only: if debug_action_ is not INVALID, node will perform action in
   /// debug_phase_
   TExecNodePhase::type debug_phase_;
@@ -298,6 +309,12 @@ class ExecNode {
   /// Created in Prepare().
   boost::scoped_ptr<MemPool> expr_mem_pool_;
 
+  /// Buffer pool client for this node. Initialized with the node's minimum reservation
+  /// in ClaimBufferReservation(). After initialization, the client must hold onto at
+  /// least the minimum reservation so that it can be returned to the initial
+  /// reservations pool in Close().
+  BufferPool::ClientHandle buffer_pool_client_;
+
   bool is_closed() const { return is_closed_; }
 
   /// Pointer to the containing SubplanNode or NULL if not inside a subplan.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/hash-table-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table-test.cc b/be/src/exec/hash-table-test.cc
index 42bc7e1..7a6ec9d 100644
--- a/be/src/exec/hash-table-test.cc
+++ b/be/src/exec/hash-table-test.cc
@@ -17,24 +17,27 @@
 
 #include <boost/scoped_ptr.hpp>
 
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <iostream>
+#include <limits>
 #include <vector>
 
-#include "testutil/gtest-util.h"
 #include "common/compiler-util.h"
 #include "common/init.h"
 #include "exec/hash-table.inline.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
 #include "exprs/slot-ref.h"
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/reservation-tracker.h"
 #include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/string-value.h"
 #include "runtime/test-env.h"
 #include "runtime/tuple-row.h"
 #include "service/fe-support.h"
+#include "testutil/gtest-util.h"
 #include "util/cpu-info.h"
 #include "util/runtime-profile-counters.h"
 #include "util/test-info.h"
@@ -51,9 +54,16 @@ class HashTableTest : public testing::Test {
   HashTableTest() : mem_pool_(&tracker_) {}
 
  protected:
+  /// Temporary runtime environment for the hash table.
   scoped_ptr<TestEnv> test_env_;
   RuntimeState* runtime_state_;
+
+  /// Hash tables and associated clients - automatically closed in TearDown().
+  vector<BufferPool::ClientHandle*> clients_;
+  vector<HashTable*> hash_tables_;
+
   ObjectPool pool_;
+  /// A dummy MemTracker used for exprs and other things we don't need to have limits on.
   MemTracker tracker_;
   MemPool mem_pool_;
   vector<ScalarExpr*> build_exprs_;
@@ -83,6 +93,8 @@ class HashTableTest : public testing::Test {
     ASSERT_OK(ScalarExprEvaluator::Create(probe_exprs_, nullptr, &pool_, &mem_pool_,
         &probe_expr_evals_));
     ASSERT_OK(ScalarExprEvaluator::Open(probe_expr_evals_, nullptr));
+
+    CreateTestEnv();
   }
 
   virtual void TearDown() {
@@ -90,9 +102,34 @@ class HashTableTest : public testing::Test {
     ScalarExprEvaluator::Close(probe_expr_evals_, nullptr);
     ScalarExpr::Close(build_exprs_);
     ScalarExpr::Close(probe_exprs_);
+
+    for (HashTable* hash_table : hash_tables_) hash_table->Close();
+    hash_tables_.clear();
+
+    for (BufferPool::ClientHandle* client : clients_) {
+      test_env_->exec_env()->buffer_pool()->DeregisterClient(client);
+    }
+    clients_.clear();
+
     runtime_state_ = nullptr;
     test_env_.reset();
     mem_pool_.FreeAll();
+    pool_.Clear();
+  }
+
+  /// Initialize test_env_ and runtime_state_ with the given page size and capacity
+  /// for the given number of pages. If test_env_ was already created, then re-creates it.
+  void CreateTestEnv(int64_t min_page_size = 64 * 1024,
+      int64_t buffer_bytes_limit = 4L * 1024 * 1024 * 1024) {
+    test_env_.reset(new TestEnv());
+    test_env_->SetBufferPoolArgs(min_page_size, buffer_bytes_limit);
+    ASSERT_OK(test_env_->Init());
+
+    TQueryOptions query_options;
+    query_options.__set_default_spillable_buffer_size(min_page_size);
+    query_options.__set_min_spillable_buffer_size(min_page_size);
+    query_options.__set_buffer_pool_limit(buffer_bytes_limit);
+    ASSERT_OK(test_env_->CreateQueryState(0, &query_options, &runtime_state_));
   }
 
   TupleRow* CreateTupleRow(int32_t val) {
@@ -116,8 +153,9 @@ class HashTableTest : public testing::Test {
 
   // Wrapper to call private methods on HashTable
   // TODO: understand google testing, there must be a more natural way to do this
-  void ResizeTable(HashTable* table, int64_t new_size, HashTableCtx* ht_ctx) {
-    table->ResizeBuckets(new_size, ht_ctx);
+  Status ResizeTable(
+      HashTable* table, int64_t new_size, HashTableCtx* ht_ctx, bool* success) {
+    return table->ResizeBuckets(new_size, ht_ctx, success);
   }
 
   // Do a full table scan on table.  All values should be between [min,max).  If
@@ -188,24 +226,41 @@ class HashTableTest : public testing::Test {
     }
   }
 
-  // Construct hash table with custom block manager. Returns result of HashTable::Init()
-  bool CreateHashTable(bool quadratic, int64_t initial_num_buckets,
-      scoped_ptr<HashTable>* table, int block_size = 8 * 1024 * 1024,
-      int max_num_blocks = 100, int reserved_blocks = 10) {
-    EXPECT_OK(test_env_->CreateQueryStateWithBlockMgr(
-        next_query_id_++, max_num_blocks, block_size, nullptr, &runtime_state_));
+  /// Construct hash table and buffer pool client.
+  /// Returns true if HashTable::Init() was successful. Created objects
+  /// and resources (e.g. reservations) are automatically freed in TearDown().
+  bool CreateHashTable(bool quadratic, int64_t initial_num_buckets, HashTable** table,
+      int64_t block_size = 8 * 1024 * 1024, int max_num_blocks = 100,
+      int initial_reserved_blocks = 10, int64_t suballocator_buffer_len = 64 * 1024) {
+    BufferPool* buffer_pool = test_env_->exec_env()->buffer_pool();
+    RuntimeProfile* profile = pool_.Add(new RuntimeProfile(&pool_, "ht"));
+
+    // Set up memory tracking for the hash table.
     MemTracker* client_tracker =
         pool_.Add(new MemTracker(-1, "client", runtime_state_->instance_mem_tracker()));
-    BufferedBlockMgr::Client* client;
-    EXPECT_OK(runtime_state_->block_mgr()->RegisterClient(
-        "", reserved_blocks, false, client_tracker, runtime_state_, &client));
+    int64_t initial_reservation_bytes = block_size * initial_reserved_blocks;
+    int64_t max_reservation_bytes = block_size * max_num_blocks;
+
+    // Set up the memory allocator.
+    BufferPool::ClientHandle* client = pool_.Add(new BufferPool::ClientHandle);
+    clients_.push_back(client);
+    EXPECT_OK(buffer_pool->RegisterClient("", nullptr,
+        runtime_state_->instance_buffer_reservation(), client_tracker,
+        max_reservation_bytes, profile, client));
+    EXPECT_TRUE(client->IncreaseReservation(initial_reservation_bytes));
+    Suballocator* allocator =
+        pool_.Add(new Suballocator(buffer_pool, client, suballocator_buffer_len));
 
     // Initial_num_buckets must be a power of two.
     EXPECT_EQ(initial_num_buckets, BitUtil::RoundUpToPowerOfTwo(initial_num_buckets));
     int64_t max_num_buckets = 1L << 31;
-    table->reset(new HashTable(quadratic, runtime_state_, client, true, 1, nullptr,
-          max_num_buckets, initial_num_buckets));
-    return (*table)->Init();
+    *table = pool_.Add(new HashTable(
+        quadratic, allocator, true, 1, nullptr, max_num_buckets, initial_num_buckets));
+    hash_tables_.push_back(*table);
+    bool success;
+    Status status = (*table)->Init(&success);
+    EXPECT_OK(status);
+    return status.ok() && success;
   }
 
   // Constructs and closes a hash table.
@@ -229,14 +284,12 @@ class HashTableTest : public testing::Test {
     EXPECT_EQ(*val_row4, 4);
 
     // Create and close the hash table.
-    scoped_ptr<HashTable> hash_table;
+    HashTable* hash_table;
     bool initialized = CreateHashTable(quadratic, initial_num_buckets, &hash_table);
     EXPECT_EQ(too_big, !initialized);
     if (initialized && initial_num_buckets > 0) {
       EXPECT_NE(hash_table->ByteSize(), 0);
     }
-
-    hash_table->Close();
   }
 
   // IMPALA-2897: Build rows that are equivalent (where nullptrs are counted as equivalent)
@@ -246,7 +299,7 @@ class HashTableTest : public testing::Test {
     for (int i = 0; i < 2; ++i) build_rows[i] = CreateNullTupleRow();
 
     // Create the hash table and insert the build rows
-    scoped_ptr<HashTable> hash_table;
+    HashTable* hash_table;
     ASSERT_TRUE(CreateHashTable(true, 1024, &hash_table));
     scoped_ptr<HashTableCtx> ht_ctx;
     EXPECT_OK(HashTableCtx::Create(&pool_, runtime_state_,
@@ -256,13 +309,15 @@ class HashTableTest : public testing::Test {
 
     for (int i = 0; i < 2; ++i) {
       if (!ht_ctx->EvalAndHashBuild(build_rows[i])) continue;
-      BufferedTupleStream::RowIdx dummy_row_idx;
+      BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
       EXPECT_TRUE(hash_table->stores_tuples_);
-      bool inserted = hash_table->Insert(ht_ctx.get(), dummy_row_idx, build_rows[i]);
+      Status status;
+      bool inserted =
+          hash_table->Insert(ht_ctx.get(), dummy_flat_row, build_rows[i], &status);
       EXPECT_TRUE(inserted);
+      ASSERT_OK(status);
     }
     EXPECT_EQ(hash_table->num_buckets() - hash_table->EmptyBuckets(), 1);
-    hash_table->Close();
     ht_ctx->Close(runtime_state_);
   }
 
@@ -282,7 +337,7 @@ class HashTableTest : public testing::Test {
     }
 
     // Create the hash table and insert the build rows
-    scoped_ptr<HashTable> hash_table;
+    HashTable* hash_table;
     ASSERT_TRUE(CreateHashTable(quadratic, initial_num_buckets, &hash_table));
     scoped_ptr<HashTableCtx> ht_ctx;
     Status status = HashTableCtx::Create(&pool_, runtime_state_, build_exprs_,
@@ -290,52 +345,57 @@ class HashTableTest : public testing::Test {
         vector<bool>(build_exprs_.size(), false), 1, 0, 1, &mem_pool_, &ht_ctx);
     EXPECT_OK(status);
     EXPECT_OK(ht_ctx->Open(runtime_state_));
-    bool success = hash_table->CheckAndResize(5, ht_ctx.get());
+    bool success;
+    EXPECT_OK(hash_table->CheckAndResize(5, ht_ctx.get(), &success));
     ASSERT_TRUE(success);
     for (int i = 0; i < 5; ++i) {
       if (!ht_ctx->EvalAndHashBuild(build_rows[i])) continue;
-      BufferedTupleStream::RowIdx dummy_row_idx;
+      BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
       EXPECT_TRUE(hash_table->stores_tuples_);
-      bool inserted = hash_table->Insert(ht_ctx.get(), dummy_row_idx, build_rows[i]);
+      bool inserted =
+          hash_table->Insert(ht_ctx.get(), dummy_flat_row, build_rows[i], &status);
       EXPECT_TRUE(inserted);
+      ASSERT_OK(status);
     }
     EXPECT_EQ(hash_table->size(), 5);
 
     // Do a full table scan and validate returned pointers
-    FullScan(hash_table.get(), ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
-    ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, 10, false);
+    FullScan(hash_table, ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
+    ProbeTest(hash_table, ht_ctx.get(), probe_rows, 10, false);
 
     // Double the size of the hash table and scan again.
-    ResizeTable(hash_table.get(), 2048, ht_ctx.get());
+    EXPECT_OK(ResizeTable(hash_table, 2048, ht_ctx.get(), &success));
+    EXPECT_TRUE(success);
     EXPECT_EQ(hash_table->num_buckets(), 2048);
     EXPECT_EQ(hash_table->size(), 5);
     memset(scan_rows, 0, sizeof(scan_rows));
-    FullScan(hash_table.get(), ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
-    ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, 10, false);
+    FullScan(hash_table, ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
+    ProbeTest(hash_table, ht_ctx.get(), probe_rows, 10, false);
 
     // Try to shrink and scan again.
-    ResizeTable(hash_table.get(), 64, ht_ctx.get());
+    EXPECT_OK(ResizeTable(hash_table, 64, ht_ctx.get(), &success));
+    EXPECT_TRUE(success);
     EXPECT_EQ(hash_table->num_buckets(), 64);
     EXPECT_EQ(hash_table->size(), 5);
     memset(scan_rows, 0, sizeof(scan_rows));
-    FullScan(hash_table.get(), ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
-    ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, 10, false);
+    FullScan(hash_table, ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
+    ProbeTest(hash_table, ht_ctx.get(), probe_rows, 10, false);
 
     // Resize to 8, which is the smallest value to fit the number of filled buckets.
-    ResizeTable(hash_table.get(), 8, ht_ctx.get());
+    EXPECT_OK(ResizeTable(hash_table, 8, ht_ctx.get(), &success));
+    EXPECT_TRUE(success);
     EXPECT_EQ(hash_table->num_buckets(), 8);
     EXPECT_EQ(hash_table->size(), 5);
     memset(scan_rows, 0, sizeof(scan_rows));
-    FullScan(hash_table.get(), ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
-    ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, 10, false);
+    FullScan(hash_table, ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
+    ProbeTest(hash_table, ht_ctx.get(), probe_rows, 10, false);
 
-    hash_table->Close();
     ht_ctx->Close(runtime_state_);
   }
 
-  void ScanTest(bool quadratic, int initial_size, int rows_to_insert,
-                int additional_rows) {
-    scoped_ptr<HashTable> hash_table;
+  void ScanTest(
+      bool quadratic, int initial_size, int rows_to_insert, int additional_rows) {
+    HashTable* hash_table;
     ASSERT_TRUE(CreateHashTable(quadratic, initial_size, &hash_table));
 
     int total_rows = rows_to_insert + additional_rows;
@@ -347,19 +407,21 @@ class HashTableTest : public testing::Test {
     EXPECT_OK(ht_ctx->Open(runtime_state_));
 
     // Add 1 row with val 1, 2 with val 2, etc.
+    bool success;
     vector<TupleRow*> build_rows;
     ProbeTestData* probe_rows = new ProbeTestData[total_rows];
     probe_rows[0].probe_row = CreateTupleRow(0);
     for (int val = 1; val <= rows_to_insert; ++val) {
-      bool success = hash_table->CheckAndResize(val, ht_ctx.get());
+      EXPECT_OK(hash_table->CheckAndResize(val, ht_ctx.get(), &success));
       EXPECT_TRUE(success) << " failed to resize: " << val;
       probe_rows[val].probe_row = CreateTupleRow(val);
       for (int i = 0; i < val; ++i) {
         TupleRow* row = CreateTupleRow(val);
         if (!ht_ctx->EvalAndHashBuild(row)) continue;
-        BufferedTupleStream::RowIdx dummy_row_idx;
+        BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
         EXPECT_TRUE(hash_table->stores_tuples_);
-        hash_table->Insert(ht_ctx.get(), dummy_row_idx, row);
+        ASSERT_TRUE(hash_table->Insert(ht_ctx.get(), dummy_flat_row, row, &status));
+        ASSERT_OK(status);
         build_rows.push_back(row);
         probe_rows[val].expected_build_rows.push_back(row);
       }
@@ -371,21 +433,22 @@ class HashTableTest : public testing::Test {
     }
 
     // Test that all the builds were found.
-    ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, total_rows, true);
+    ProbeTest(hash_table, ht_ctx.get(), probe_rows, total_rows, true);
 
     // Resize and try again.
     int target_size = BitUtil::RoundUpToPowerOfTwo(2 * total_rows);
-    ResizeTable(hash_table.get(), target_size, ht_ctx.get());
+    EXPECT_OK(ResizeTable(hash_table, target_size, ht_ctx.get(), &success));
+    EXPECT_TRUE(success);
     EXPECT_EQ(hash_table->num_buckets(), target_size);
-    ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, total_rows, true);
+    ProbeTest(hash_table, ht_ctx.get(), probe_rows, total_rows, true);
 
     target_size = BitUtil::RoundUpToPowerOfTwo(total_rows + 1);
-    ResizeTable(hash_table.get(), target_size, ht_ctx.get());
+    EXPECT_OK(ResizeTable(hash_table, target_size, ht_ctx.get(), &success));
+    EXPECT_TRUE(success);
     EXPECT_EQ(hash_table->num_buckets(), target_size);
-    ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, total_rows, true);
+    ProbeTest(hash_table, ht_ctx.get(), probe_rows, total_rows, true);
 
     delete [] probe_rows;
-    hash_table->Close();
     ht_ctx->Close(runtime_state_);
   }
 
@@ -395,9 +458,11 @@ class HashTableTest : public testing::Test {
     uint64_t num_to_add = 4;
     int expected_size = 0;
 
-    MemTracker tracker(100 * 1024 * 1024);
-    scoped_ptr<HashTable> hash_table;
-    ASSERT_TRUE(CreateHashTable(quadratic, num_to_add, &hash_table));
+    // Need enough memory for two hash table bucket directories during resize.
+    const int64_t mem_limit_mb = 128 + 64;
+    HashTable* hash_table;
+    ASSERT_TRUE(
+        CreateHashTable(quadratic, num_to_add, &hash_table, 1024 * 1024, mem_limit_mb));
     scoped_ptr<HashTableCtx> ht_ctx;
     Status status = HashTableCtx::Create(&pool_, runtime_state_, build_exprs_,
         probe_exprs_, false /* !stores_nulls_ */,
@@ -408,27 +473,32 @@ class HashTableTest : public testing::Test {
     // entries. When num_to_add == 4, then the total number of inserts is 4194300.
     int build_row_val = 0;
     for (int i = 0; i < 20; ++i) {
-      // Currently the mem used for the bucket is not being tracked by the mem tracker.
-      // Thus the resize is expected to be successful.
-      // TODO: Keep track of the mem used for the buckets and test cases where we actually
-      // hit OOM.
-      // TODO: Insert duplicates to also hit OOM.
-      bool success = hash_table->CheckAndResize(num_to_add, ht_ctx.get());
-      EXPECT_TRUE(success) << " failed to resize: " << num_to_add;
+      bool success;
+      EXPECT_OK(hash_table->CheckAndResize(num_to_add, ht_ctx.get(), &success));
+      EXPECT_TRUE(success) << " failed to resize: " << num_to_add << "\n"
+                           << tracker_.LogUsage() << "\n"
+                           << clients_.back()->DebugString();
       for (int j = 0; j < num_to_add; ++build_row_val, ++j) {
         TupleRow* row = CreateTupleRow(build_row_val);
         if (!ht_ctx->EvalAndHashBuild(row)) continue;
-        BufferedTupleStream::RowIdx dummy_row_idx;
+        BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
         EXPECT_TRUE(hash_table->stores_tuples_);
-        bool inserted = hash_table->Insert(ht_ctx.get(), dummy_row_idx, row);
+        bool inserted = hash_table->Insert(ht_ctx.get(), dummy_flat_row, row, &status);
+        ASSERT_OK(status);
         if (!inserted) goto done_inserting;
       }
       expected_size += num_to_add;
       num_to_add *= 2;
     }
- done_inserting:
-    EXPECT_FALSE(tracker.LimitExceeded());
+  done_inserting:
     EXPECT_EQ(hash_table->size(), 4194300);
+
+    // The next allocation should put us over the limit, since we'll need 128MB for
+    // the old buckets and 256MB for the new buckets.
+    bool success;
+    EXPECT_OK(hash_table->CheckAndResize(num_to_add * 2, ht_ctx.get(), &success));
+    EXPECT_FALSE(success);
+
     // Validate that we can find the entries before we went over the limit
     for (int i = 0; i < expected_size * 5; i += 100000) {
       TupleRow* probe_row = CreateTupleRow(i);
@@ -441,7 +511,34 @@ class HashTableTest : public testing::Test {
         EXPECT_TRUE(iter.AtEnd()) << " i: " << i;
       }
     }
-    hash_table->Close();
+
+    // Insert duplicates to also hit OOM.
+    int64_t num_duplicates_inserted = 0;
+    const int DUPLICATE_VAL = 1234;
+    while (true) {
+      TupleRow* duplicate_row = CreateTupleRow(DUPLICATE_VAL);
+      if (!ht_ctx->EvalAndHashBuild(duplicate_row)) continue;
+      BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
+      bool inserted =
+          hash_table->Insert(ht_ctx.get(), dummy_flat_row, duplicate_row, &status);
+      ASSERT_OK(status);
+      if (!inserted) break;
+      ++num_duplicates_inserted;
+    }
+
+    // Check that the duplicates that we successfully inserted are all present.
+    TupleRow* duplicate_row = CreateTupleRow(DUPLICATE_VAL);
+    ASSERT_TRUE(ht_ctx->EvalAndHashProbe(duplicate_row));
+    HashTable::Iterator iter = hash_table->FindProbeRow(ht_ctx.get());
+    ValidateMatch(duplicate_row, iter.GetRow());
+    for (int64_t i = 0; i < num_duplicates_inserted; ++i) {
+      ASSERT_FALSE(iter.AtEnd());
+      iter.NextDuplicate();
+      ValidateMatch(duplicate_row, iter.GetRow());
+    }
+    iter.NextDuplicate();
+    EXPECT_TRUE(iter.AtEnd());
+
     ht_ctx->Close(runtime_state_);
   }
 
@@ -450,7 +547,7 @@ class HashTableTest : public testing::Test {
   // enough space in the hash table (it is also expected to be slow). It also expects that
   // a probe for a N+1 element will return BUCKET_NOT_FOUND.
   void InsertFullTest(bool quadratic, int table_size) {
-    scoped_ptr<HashTable> hash_table;
+    HashTable* hash_table;
     ASSERT_TRUE(CreateHashTable(quadratic, table_size, &hash_table));
     EXPECT_EQ(hash_table->EmptyBuckets(), table_size);
     scoped_ptr<HashTableCtx> ht_ctx;
@@ -472,10 +569,11 @@ class HashTableTest : public testing::Test {
 
       // Insert using both Insert() and FindBucket() methods.
       if (build_row_val % 2 == 0) {
-        BufferedTupleStream::RowIdx dummy_row_idx;
+        BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
         EXPECT_TRUE(hash_table->stores_tuples_);
-        bool inserted = hash_table->Insert(ht_ctx.get(), dummy_row_idx, row);
+        bool inserted = hash_table->Insert(ht_ctx.get(), dummy_flat_row, row, &status);
         EXPECT_TRUE(inserted);
+        ASSERT_OK(status);
       } else {
         iter = hash_table->FindBuildRowBucket(ht_ctx.get(), &found);
         EXPECT_FALSE(iter.AtEnd());
@@ -511,20 +609,20 @@ class HashTableTest : public testing::Test {
     EXPECT_TRUE(iter.AtEnd());
     EXPECT_FALSE(found);
 
-    hash_table->Close();
     ht_ctx->Close(runtime_state_);
   }
 
   // This test makes sure we can tolerate the low memory case where we do not have enough
   // memory to allocate the array of buckets for the hash table.
   void VeryLowMemTest(bool quadratic) {
-    const int block_size = 2 * 1024;
+    const int64_t block_size = 2 * 1024;
     const int max_num_blocks = 1;
-    const int reserved_blocks = 0;
     const int table_size = 1024;
-    scoped_ptr<HashTable> hash_table;
-    ASSERT_FALSE(CreateHashTable(quadratic, table_size, &hash_table, block_size,
-          max_num_blocks, reserved_blocks));
+    CreateTestEnv(block_size, block_size * max_num_blocks);
+
+    HashTable* hash_table;
+    ASSERT_FALSE(CreateHashTable(
+        quadratic, table_size, &hash_table, block_size, max_num_blocks, 0, 1024));
     scoped_ptr<HashTableCtx> ht_ctx;
     Status status = HashTableCtx::Create(&pool_, runtime_state_, build_exprs_,
         probe_exprs_, false /* !stores_nulls_ */, vector<bool>(build_exprs_.size(), false), 1, 0, 1,
@@ -532,7 +630,6 @@ class HashTableTest : public testing::Test {
     EXPECT_OK(status);
     HashTable::Iterator iter = hash_table->Begin(ht_ctx.get());
     EXPECT_TRUE(iter.AtEnd());
-    hash_table->Close();
     ht_ctx->Close(runtime_state_);
   }
 };
@@ -612,8 +709,6 @@ TEST_F(HashTableTest, QuadraticInsertFullTest) {
 
 // Test that hashing empty string updates hash value.
 TEST_F(HashTableTest, HashEmpty) {
-  EXPECT_OK(test_env_->CreateQueryStateWithBlockMgr(
-      0, 100, 8 * 1024 * 1024, nullptr, &runtime_state_));
   scoped_ptr<HashTableCtx> ht_ctx;
   Status status = HashTableCtx::Create(&pool_, runtime_state_, build_exprs_,
       probe_exprs_, false /* !stores_nulls_ */,

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/hash-table.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc
index a4856e9..aacedc2 100644
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc
@@ -26,7 +26,7 @@
 #include "exprs/slot-ref.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-block-mgr.h"
+#include "runtime/bufferpool/reservation-tracker.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/raw-value.inline.h"
 #include "runtime/runtime-state.h"
@@ -37,8 +37,17 @@
 #include "common/names.h"
 
 using namespace impala;
-using namespace llvm;
-using namespace strings;
+using llvm::APFloat;
+using llvm::ArrayRef;
+using llvm::BasicBlock;
+using llvm::ConstantFP;
+using llvm::Function;
+using llvm::LLVMContext;
+using llvm::PHINode;
+using llvm::PointerType;
+using llvm::Type;
+using llvm::Value;
+using strings::Substitute;
 
 DEFINE_bool(enable_quadratic_probing, true, "Enable quadratic probing hash table");
 
@@ -85,12 +94,6 @@ static int64_t NULL_VALUE[] = {
 static_assert(sizeof(NULL_VALUE) >= ColumnType::MAX_CHAR_LENGTH,
     "NULL_VALUE must be at least as large as the largest possible slot");
 
-// The first NUM_SMALL_BLOCKS of nodes_ are made of blocks less than the IO size (of 8MB)
-// to reduce the memory footprint of small queries. In particular, we always first use a
-// 64KB and a 512KB block before starting using IO-sized blocks.
-static const int64_t INITIAL_DATA_PAGE_SIZES[] = { 64 * 1024, 512 * 1024 };
-static const int NUM_SMALL_DATA_PAGES = sizeof(INITIAL_DATA_PAGE_SIZES) / sizeof(int64_t);
-
 HashTableCtx::HashTableCtx(const std::vector<ScalarExpr*>& build_exprs,
     const std::vector<ScalarExpr*>& probe_exprs, bool stores_nulls,
     const std::vector<bool>& finds_nulls, int32_t initial_seed,
@@ -378,21 +381,20 @@ void HashTableCtx::ExprValuesCache::ResetForRead() {
   ResetIterators();
 }
 
-const double HashTable::MAX_FILL_FACTOR = 0.75f;
+constexpr double HashTable::MAX_FILL_FACTOR;
+constexpr int64_t HashTable::DATA_PAGE_SIZE;
 
-HashTable* HashTable::Create(RuntimeState* state,
-    BufferedBlockMgr::Client* client, bool stores_duplicates, int num_build_tuples,
-    BufferedTupleStream* tuple_stream, int64_t max_num_buckets,
+HashTable* HashTable::Create(Suballocator* allocator, bool stores_duplicates,
+    int num_build_tuples, BufferedTupleStreamV2* tuple_stream, int64_t max_num_buckets,
     int64_t initial_num_buckets) {
-  return new HashTable(FLAGS_enable_quadratic_probing, state, client, stores_duplicates,
+  return new HashTable(FLAGS_enable_quadratic_probing, allocator, stores_duplicates,
       num_build_tuples, tuple_stream, max_num_buckets, initial_num_buckets);
 }
 
-HashTable::HashTable(bool quadratic_probing, RuntimeState* state,
-    BufferedBlockMgr::Client* client, bool stores_duplicates, int num_build_tuples,
-    BufferedTupleStream* stream, int64_t max_num_buckets, int64_t num_buckets)
-  : state_(state),
-    block_mgr_client_(client),
+HashTable::HashTable(bool quadratic_probing, Suballocator* allocator,
+    bool stores_duplicates, int num_build_tuples, BufferedTupleStreamV2* stream,
+    int64_t max_num_buckets, int64_t num_buckets)
+  : allocator_(allocator),
     tuple_stream_(stream),
     stores_tuples_(num_build_tuples == 1),
     stores_duplicates_(stores_duplicates),
@@ -410,26 +412,23 @@ HashTable::HashTable(bool quadratic_probing, RuntimeState* state,
     has_matches_(false),
     num_probes_(0), num_failed_probes_(0), travel_length_(0), num_hash_collisions_(0),
     num_resizes_(0) {
-  DCHECK_EQ((num_buckets & (num_buckets-1)), 0) << "num_buckets must be a power of 2";
+  DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) << "num_buckets must be a power of 2";
   DCHECK_GT(num_buckets, 0) << "num_buckets must be larger than 0";
   DCHECK(stores_tuples_ || stream != NULL);
-  DCHECK(client != NULL);
 }
 
-bool HashTable::Init() {
+Status HashTable::Init(bool* got_memory) {
   int64_t buckets_byte_size = num_buckets_ * sizeof(Bucket);
-  if (!state_->block_mgr()->ConsumeMemory(block_mgr_client_, buckets_byte_size)) {
-    num_buckets_ = 0;
-    return false;
-  }
-  buckets_ = reinterpret_cast<Bucket*>(malloc(buckets_byte_size));
-  if (buckets_ == NULL) {
-    state_->block_mgr()->ReleaseMemory(block_mgr_client_, buckets_byte_size);
+  RETURN_IF_ERROR(allocator_->Allocate(buckets_byte_size, &bucket_allocation_));
+  if (bucket_allocation_ == nullptr) {
     num_buckets_ = 0;
-    return false;
+    *got_memory = false;
+    return Status::OK();
   }
+  buckets_ = reinterpret_cast<Bucket*>(bucket_allocation_->data());
   memset(buckets_, 0, buckets_byte_size);
-  return true;
+  *got_memory = true;
+  return Status::OK();
 }
 
 void HashTable::Close() {
@@ -439,36 +438,39 @@ void HashTable::Close() {
   const int64_t HEAVILY_USED = 1024 * 1024;
   // TODO: These statistics should go to the runtime profile as well.
   if ((num_buckets_ > LARGE_HT) || (num_probes_ > HEAVILY_USED)) VLOG(2) << PrintStats();
-  for (int i = 0; i < data_pages_.size(); ++i) {
-    data_pages_[i]->Delete();
-  }
+  for (auto& data_page : data_pages_) allocator_->Free(move(data_page));
+  data_pages_.clear();
   if (ImpaladMetrics::HASH_TABLE_TOTAL_BYTES != NULL) {
     ImpaladMetrics::HASH_TABLE_TOTAL_BYTES->Increment(-total_data_page_size_);
   }
-  data_pages_.clear();
-  if (buckets_ != NULL) free(buckets_);
-  state_->block_mgr()->ReleaseMemory(block_mgr_client_, num_buckets_ * sizeof(Bucket));
+  if (bucket_allocation_ != nullptr) allocator_->Free(move(bucket_allocation_));
 }
 
-bool HashTable::CheckAndResize(uint64_t buckets_to_fill, const HashTableCtx* ht_ctx) {
+Status HashTable::CheckAndResize(
+    uint64_t buckets_to_fill, const HashTableCtx* ht_ctx, bool* got_memory) {
   uint64_t shift = 0;
   while (num_filled_buckets_ + buckets_to_fill >
          (num_buckets_ << shift) * MAX_FILL_FACTOR) {
-    // TODO: next prime instead of double?
     ++shift;
   }
-  if (shift > 0) return ResizeBuckets(num_buckets_ << shift, ht_ctx);
-  return true;
+  if (shift > 0) return ResizeBuckets(num_buckets_ << shift, ht_ctx, got_memory);
+  *got_memory = true;
+  return Status::OK();
 }
 
-bool HashTable::ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx) {
-  DCHECK_EQ((num_buckets & (num_buckets-1)), 0)
+Status HashTable::ResizeBuckets(
+    int64_t num_buckets, const HashTableCtx* ht_ctx, bool* got_memory) {
+  DCHECK_EQ((num_buckets & (num_buckets - 1)), 0)
       << "num_buckets=" << num_buckets << " must be a power of 2";
-  DCHECK_GT(num_buckets, num_filled_buckets_) << "Cannot shrink the hash table to "
-      "smaller number of buckets than the number of filled buckets.";
-  VLOG(2) << "Resizing hash table from "
-          << num_buckets_ << " to " << num_buckets << " buckets.";
-  if (max_num_buckets_ != -1 && num_buckets > max_num_buckets_) return false;
+  DCHECK_GT(num_buckets, num_filled_buckets_)
+    << "Cannot shrink the hash table to smaller number of buckets than the number of "
+    << "filled buckets.";
+  VLOG(2) << "Resizing hash table from " << num_buckets_ << " to " << num_buckets
+          << " buckets.";
+  if (max_num_buckets_ != -1 && num_buckets > max_num_buckets_) {
+    *got_memory = false;
+    return Status::OK();
+  }
   ++num_resizes_;
 
   // All memory that can grow proportional to the input should come from the block mgrs
@@ -476,14 +478,16 @@ bool HashTable::ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx) {
   // Note that while we copying over the contents of the old hash table, we need to have
   // allocated both the old and the new hash table. Once we finish, we return the memory
   // of the old hash table.
-  int64_t old_size = num_buckets_ * sizeof(Bucket);
+  // int64_t old_size = num_buckets_ * sizeof(Bucket);
   int64_t new_size = num_buckets * sizeof(Bucket);
-  if (!state_->block_mgr()->ConsumeMemory(block_mgr_client_, new_size)) return false;
-  Bucket* new_buckets = reinterpret_cast<Bucket*>(malloc(new_size));
-  if (new_buckets == NULL) {
-    state_->block_mgr()->ReleaseMemory(block_mgr_client_, new_size);
-    return false;
+
+  unique_ptr<Suballocation> new_allocation;
+  RETURN_IF_ERROR(allocator_->Allocate(new_size, &new_allocation));
+  if (new_allocation == NULL) {
+    *got_memory = false;
+    return Status::OK();
   }
+  Bucket* new_buckets = reinterpret_cast<Bucket*>(new_allocation->data());
   memset(new_buckets, 0, new_size);
 
   // Walk the old table and copy all the filled buckets to the new (resized) table.
@@ -503,28 +507,22 @@ bool HashTable::ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx) {
   }
 
   num_buckets_ = num_buckets;
-  free(buckets_);
-  buckets_ = new_buckets;
-  state_->block_mgr()->ReleaseMemory(block_mgr_client_, old_size);
-  return true;
+  allocator_->Free(move(bucket_allocation_));
+  bucket_allocation_ = move(new_allocation);
+  buckets_ = reinterpret_cast<Bucket*>(bucket_allocation_->data());
+  *got_memory = true;
+  return Status::OK();
 }
 
-bool HashTable::GrowNodeArray() {
-  int64_t page_size = 0;
-  page_size = state_->block_mgr()->max_block_size();
-  if (data_pages_.size() < NUM_SMALL_DATA_PAGES) {
-    page_size = min(page_size, INITIAL_DATA_PAGE_SIZES[data_pages_.size()]);
-  }
-  BufferedBlockMgr::Block* block = NULL;
-  Status status = state_->block_mgr()->GetNewBlock(
-      block_mgr_client_, NULL, &block, page_size);
-  DCHECK(status.ok() || block == NULL);
-  if (block == NULL) return false;
-  data_pages_.push_back(block);
-  next_node_ = block->Allocate<DuplicateNode>(page_size);
-  ImpaladMetrics::HASH_TABLE_TOTAL_BYTES->Increment(page_size);
-  node_remaining_current_page_ = page_size / sizeof(DuplicateNode);
-  total_data_page_size_ += page_size;
+bool HashTable::GrowNodeArray(Status* status) {
+  unique_ptr<Suballocation> allocation;
+  *status = allocator_->Allocate(DATA_PAGE_SIZE, &allocation);
+  if (!status->ok() || allocation == nullptr) return false;
+  next_node_ = reinterpret_cast<DuplicateNode*>(allocation->data());
+  data_pages_.push_back(move(allocation));
+  ImpaladMetrics::HASH_TABLE_TOTAL_BYTES->Increment(DATA_PAGE_SIZE);
+  node_remaining_current_page_ = DATA_PAGE_SIZE / sizeof(DuplicateNode);
+  total_data_page_size_ += DATA_PAGE_SIZE;
   return true;
 }
 
@@ -533,8 +531,7 @@ void HashTable::DebugStringTuple(stringstream& ss, HtData& htdata,
   if (stores_tuples_) {
     ss << "(" << htdata.tuple << ")";
   } else {
-    ss << "(" << htdata.idx.block() << ", " << htdata.idx.idx()
-       << ", " << htdata.idx.offset() << ")";
+    ss << "(" << htdata.flat_row << ")";
   }
   if (desc != NULL) {
     Tuple* row[num_build_tuples_];

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/hash-table.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.h b/be/src/exec/hash-table.h
index 9ba5b04..297e619 100644
--- a/be/src/exec/hash-table.h
+++ b/be/src/exec/hash-table.h
@@ -15,19 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.
 
-
 #ifndef IMPALA_EXEC_HASH_TABLE_H
 #define IMPALA_EXEC_HASH_TABLE_H
 
+#include <memory>
 #include <vector>
 #include <boost/cstdint.hpp>
 #include <boost/scoped_ptr.hpp>
+
 #include "codegen/impala-ir.h"
-#include "common/logging.h"
 #include "common/compiler-util.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "common/logging.h"
+#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/suballocator.h"
 #include "runtime/tuple-row.h"
 #include "util/bitmap.h"
 #include "util/hash-util.h"
@@ -101,7 +103,6 @@ class HashTable;
 /// Inserts().  We may want to optimize joins more heavily for Inserts() (in particular
 /// growing).
 /// TODO: Batched interface for inserts and finds.
-/// TODO: Do we need to check mem limit exceeded so often. Check once per batch?
 /// TODO: as an optimization, compute variable-length data size for the agg node.
 
 /// Control block for a hash table. This class contains the logic as well as the variables
@@ -525,13 +526,15 @@ class HashTableCtx {
 /// nodes do not contain the hash value, because all the linked nodes have the same hash
 /// value, the one in the bucket. The data is either a tuple stream index or a Tuple*.
 /// This array of buckets is sparse, we are shooting for up to 3/4 fill factor (75%). The
-/// data allocated by the hash table comes from the BufferedBlockMgr.
+/// data allocated by the hash table comes from the BufferPool.
 class HashTable {
  private:
-
-  /// Either the row in the tuple stream or a pointer to the single tuple of this row.
+  /// Rows are represented as pointers into the BufferedTupleStream data with one
+  /// of two formats, depending on the number of tuples in the row.
   union HtData {
-    BufferedTupleStream::RowIdx idx;
+    // For rows with multiple tuples per row, a pointer to the flattened TupleRow.
+    BufferedTupleStreamV2::FlatRowPtr flat_row;
+    // For rows with one tuple per row, a pointer to the Tuple itself.
     Tuple* tuple;
   };
 
@@ -584,7 +587,7 @@ class HashTable {
 
   /// Returns a newly allocated HashTable. The probing algorithm is set by the
   /// FLAG_enable_quadratic_probing.
-  ///  - client: block mgr client to allocate data pages from.
+  ///  - allocator: allocator to allocate bucket directory and data pages from.
   ///  - stores_duplicates: true if rows with duplicate keys may be inserted into the
   ///    hash table.
   ///  - num_build_tuples: number of Tuples in the build tuple row.
@@ -596,31 +599,35 @@ class HashTable {
   ///    -1, if it unlimited.
   ///  - initial_num_buckets: number of buckets that the hash table should be initialized
   ///    with.
-  static HashTable* Create(RuntimeState* state, BufferedBlockMgr::Client* client,
-      bool stores_duplicates, int num_build_tuples, BufferedTupleStream* tuple_stream,
-      int64_t max_num_buckets, int64_t initial_num_buckets);
+  static HashTable* Create(Suballocator* allocator, bool stores_duplicates,
+      int num_build_tuples, BufferedTupleStreamV2* tuple_stream, int64_t max_num_buckets,
+      int64_t initial_num_buckets);
 
-  /// Allocates the initial bucket structure. Returns false if OOM.
-  bool Init();
+  /// Allocates the initial bucket structure. Returns a non-OK status if an error is
+  /// encountered. If an OK status is returned , 'got_memory' is set to indicate whether
+  /// enough memory for the initial buckets was allocated from the Suballocator.
+  Status Init(bool* got_memory) WARN_UNUSED_RESULT;
 
   /// Call to cleanup any resources. Must be called once.
   void Close();
 
-  /// Inserts the row to the hash table. Returns true if the insertion was successful.
-  /// Always returns true if the table has free buckets and the key is not a duplicate.
-  /// The caller is responsible for ensuring that the table has free buckets
-  /// 'idx' is the index into tuple_stream_ for this row. If the row contains more than
-  /// one tuple, the 'idx' is stored instead of the 'row'. The 'row' is not copied by the
-  /// hash table and the caller must guarantee it stays in memory. This will not grow the
-  /// hash table. In the case that there is a need to insert a duplicate node, instead of
-  /// filling a new bucket, and there is not enough memory to insert a duplicate node,
-  /// the insert fails and this function returns false.
-  /// Used during the build phase of hash joins.
+  /// Inserts the row to the hash table. The caller is responsible for ensuring that the
+  /// table has free buckets. Returns true if the insertion was successful. Always
+  /// returns true if the table has free buckets and the key is not a duplicate. If the
+  /// key was a duplicate and memory could not be allocated for the new duplicate node,
+  /// returns false. If an error is encountered while creating a duplicate node, returns
+  /// false and sets 'status' to the error.
+  ///
+  /// 'flat_row' is a pointer to the flattened row in 'tuple_stream_'. If the row contains
+  /// only one tuple, a pointer to that tuple is stored. Otherwise the 'flat_row' pointer
+  /// is stored. The 'row' is not copied by the hash table and the caller must guarantee
+  /// it stays in memory. This will not grow the hash table.
   bool IR_ALWAYS_INLINE Insert(HashTableCtx* ht_ctx,
-      const BufferedTupleStream::RowIdx& idx, TupleRow* row);
+      BufferedTupleStreamV2::FlatRowPtr flat_row, TupleRow* row,
+      Status* status) WARN_UNUSED_RESULT;
 
   /// Prefetch the hash table bucket which the given hash value 'hash' maps to.
-  template<const bool READ>
+  template <const bool READ>
   void IR_ALWAYS_INLINE PrefetchBucket(uint32_t hash);
 
   /// Returns an iterator to the bucket that matches the probe expression results that
@@ -680,12 +687,17 @@ class HashTable {
   /// Calculates the fill factor if 'buckets_to_fill' additional buckets were to be
   /// filled and resizes the hash table so that the projected fill factor is below the
   /// max fill factor.
-  /// If it returns true, then it is guaranteed at least 'rows_to_add' rows can be
-  /// inserted without need to resize.
-  bool CheckAndResize(uint64_t buckets_to_fill, const HashTableCtx* ht_ctx);
+  /// If 'got_memory' is true, then it is guaranteed at least 'rows_to_add' rows can be
+  /// inserted without need to resize. If there is not enough memory available to
+  /// resize the hash table, Status::OK() is returned and 'got_memory' is false. If
+  /// another error occurs, an error status may be returned.
+  Status CheckAndResize(uint64_t buckets_to_fill, const HashTableCtx* ht_ctx,
+      bool* got_memory) WARN_UNUSED_RESULT;
 
   /// Returns the number of bytes allocated to the hash table from the block manager.
-  int64_t ByteSize() const { return num_buckets_ * sizeof(Bucket) + total_data_page_size_; }
+  int64_t ByteSize() const {
+    return num_buckets_ * sizeof(Bucket) + total_data_page_size_;
+  }
 
   /// Returns an iterator at the beginning of the hash table.  Advancing this iterator
   /// will traverse all elements.
@@ -792,7 +804,6 @@ class HashTable {
     TupleRow* scratch_row_;
 
     /// Current bucket idx.
-    /// TODO: Use uint32_t?
     int64_t bucket_idx_;
 
     /// Pointer to the current duplicate node.
@@ -807,9 +818,9 @@ class HashTable {
   /// of calling this constructor directly.
   ///  - quadratic_probing: set to true when the probing algorithm is quadratic, as
   ///    opposed to linear.
-  HashTable(bool quadratic_probing, RuntimeState* state, BufferedBlockMgr::Client* client,
-      bool stores_duplicates, int num_build_tuples, BufferedTupleStream* tuple_stream,
-      int64_t max_num_buckets, int64_t initial_num_buckets);
+  HashTable(bool quadratic_probing, Suballocator* allocator, bool stores_duplicates,
+      int num_build_tuples, BufferedTupleStreamV2* tuple_stream, int64_t max_num_buckets,
+      int64_t initial_num_buckets);
 
   /// Performs the probing operation according to the probing algorithm (linear or
   /// quadratic. Returns one of the following:
@@ -839,8 +850,10 @@ class HashTable {
       HashTableCtx* ht_ctx, uint32_t hash, bool* found);
 
   /// Performs the insert logic. Returns the HtData* of the bucket or duplicate node
-  /// where the data should be inserted. Returns NULL if the insert was not successful.
-  HtData* IR_ALWAYS_INLINE InsertInternal(HashTableCtx* ht_ctx);
+  /// where the data should be inserted. Returns NULL if the insert was not successful
+  /// and either sets 'status' to OK if it failed because not enough reservation was
+  /// available, or to the error status if an error was encountered.
+  HtData* IR_ALWAYS_INLINE InsertInternal(HashTableCtx* ht_ctx, Status* status);
 
   /// Updates 'bucket_idx' to the index of the next non-empty bucket. If the bucket has
   /// duplicates, 'node' will be pointing to the head of the linked list of duplicates.
@@ -848,8 +861,8 @@ class HashTable {
   /// 'bucket_idx' to BUCKET_NOT_FOUND.
   void NextFilledBucket(int64_t* bucket_idx, DuplicateNode** node);
 
-  /// Resize the hash table to 'num_buckets'. Returns false on OOM.
-  bool ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx);
+  /// Resize the hash table to 'num_buckets'. 'got_memory' is false on OOM.
+  Status ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx, bool* got_memory);
 
   /// Appends the DuplicateNode pointed by next_node_ to 'bucket' and moves the next_node_
   /// pointer to the next DuplicateNode in the page, updating the remaining node counter.
@@ -862,9 +875,10 @@ class HashTable {
   /// the bucket is converted to a DuplicateNode. That is, the contents of 'data' of the
   /// bucket are copied to a DuplicateNode and 'data' is updated to pointing to a
   /// DuplicateNode.
-  /// Returns NULL if the node array could not grow, i.e. there was not enough memory to
-  /// allocate a new DuplicateNode.
-  DuplicateNode* IR_ALWAYS_INLINE InsertDuplicateNode(int64_t bucket_idx);
+  /// Returns NULL and sets 'status' to OK if the node array could not grow, i.e. there
+  /// was not enough memory to allocate a new DuplicateNode. Returns NULL and sets
+  /// 'status' to an error if another error was encountered.
+  DuplicateNode* IR_ALWAYS_INLINE InsertDuplicateNode(int64_t bucket_idx, Status* status);
 
   /// Resets the contents of the empty bucket with index 'bucket_idx', in preparation for
   /// an insert. Sets all the fields of the bucket other than 'data'.
@@ -877,8 +891,10 @@ class HashTable {
   /// returns the content of the first chained duplicate node of the bucket.
   TupleRow* GetRow(Bucket* bucket, TupleRow* row) const;
 
-  /// Grow the node array. Returns false on OOM.
-  bool GrowNodeArray();
+  /// Grow the node array. Returns true and sets 'status' to OK on success. Returns false
+  /// and sets 'status' to OK if we can't get sufficient reservation to allocate the next
+  /// data page. Returns false and sets 'status' if another error is encountered.
+  bool GrowNodeArray(Status* status);
 
   /// Functions to be replaced by codegen to specialize the hash table.
   bool IR_NO_INLINE stores_tuples() const { return stores_tuples_; }
@@ -887,20 +903,26 @@ class HashTable {
 
   /// Load factor that will trigger growing the hash table on insert.  This is
   /// defined as the number of non-empty buckets / total_buckets
-  static const double MAX_FILL_FACTOR;
+  static constexpr double MAX_FILL_FACTOR = 0.75;
+
+  /// The size in bytes of each page of duplicate nodes. Should be large enough to fit
+  /// enough DuplicateNodes to amortise the overhead of allocating each page and low
+  /// enough to not waste excessive memory to internal fragmentation.
+  static constexpr int64_t DATA_PAGE_SIZE = 64L * 1024;
 
   RuntimeState* state_;
 
-  /// Client to allocate data pages with.
-  BufferedBlockMgr::Client* block_mgr_client_;
+  /// Suballocator to allocate data pages and hash table buckets with.
+  Suballocator* allocator_;
 
   /// Stream contains the rows referenced by the hash table. Can be NULL if the
   /// row only contains a single tuple, in which case the TupleRow indirection
   /// is removed by the hash table.
-  BufferedTupleStream* tuple_stream_;
+  BufferedTupleStreamV2* tuple_stream_;
 
-  /// Constants on how the hash table should behave. Joins and aggs have slightly
-  /// different behavior.
+  /// Constants on how the hash table should behave.
+
+  /// True if the HtData uses the Tuple* representation, or false if it uses FlatRowPtr.
   const bool stores_tuples_;
 
   /// True if duplicates may be inserted into hash table.
@@ -909,8 +931,9 @@ class HashTable {
   /// Quadratic probing enabled (as opposed to linear).
   const bool quadratic_probing_;
 
-  /// Data pages for all nodes. These are always pinned.
-  std::vector<BufferedBlockMgr::Block*> data_pages_;
+  /// Data pages for all nodes. Allocated from suballocator to reduce memory
+  /// consumption of small tables.
+  std::vector<std::unique_ptr<Suballocation>> data_pages_;
 
   /// Byte size of all buffers in data_pages_.
   int64_t total_data_page_size_;
@@ -926,8 +949,10 @@ class HashTable {
 
   const int64_t max_num_buckets_;
 
-  /// Array of all buckets. Owned by this node. Using c-style array to control
-  /// control memory footprint.
+  /// Allocation containing all buckets.
+  std::unique_ptr<Suballocation> bucket_allocation_;
+
+  /// Pointer to the 'buckets_' array from 'bucket_allocation_'.
   Bucket* buckets_;
 
   /// Total number of buckets (filled and empty).
@@ -943,9 +968,8 @@ class HashTable {
   /// Number of build tuples, used for constructing temp row* for probes.
   const int num_build_tuples_;
 
-  /// Flag used to disable spilling hash tables that already had matches in case of
-  /// right joins (IMPALA-1488).
-  /// TODO: Not fail when spilling hash tables with matches in right joins
+  /// Flag used to check that we don't lose stored matches when spilling hash tables
+  /// (IMPALA-1488).
   bool has_matches_;
 
   /// The stats below can be used for debugging perf.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/hash-table.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.inline.h b/be/src/exec/hash-table.inline.h
index aff7c14..ce2f784 100644
--- a/be/src/exec/hash-table.inline.h
+++ b/be/src/exec/hash-table.inline.h
@@ -90,7 +90,8 @@ inline int64_t HashTable::Probe(Bucket* buckets, int64_t num_buckets,
   return Iterator::BUCKET_NOT_FOUND;
 }
 
-inline HashTable::HtData* HashTable::InsertInternal(HashTableCtx* ht_ctx) {
+inline HashTable::HtData* HashTable::InsertInternal(
+    HashTableCtx* ht_ctx, Status* status) {
   ++num_probes_;
   bool found = false;
   uint32_t hash = ht_ctx->expr_values_cache()->CurExprValuesHash();
@@ -98,7 +99,7 @@ inline HashTable::HtData* HashTable::InsertInternal(HashTableCtx* ht_ctx) {
   DCHECK_NE(bucket_idx, Iterator::BUCKET_NOT_FOUND);
   if (found) {
     // We need to insert a duplicate node, note that this may fail to allocate memory.
-    DuplicateNode* new_node = InsertDuplicateNode(bucket_idx);
+    DuplicateNode* new_node = InsertDuplicateNode(bucket_idx, status);
     if (UNLIKELY(new_node == NULL)) return NULL;
     return &new_node->htdata;
   } else {
@@ -108,14 +109,14 @@ inline HashTable::HtData* HashTable::InsertInternal(HashTableCtx* ht_ctx) {
 }
 
 inline bool HashTable::Insert(HashTableCtx* ht_ctx,
-    const BufferedTupleStream::RowIdx& idx, TupleRow* row) {
-  HtData* htdata = InsertInternal(ht_ctx);
+    BufferedTupleStreamV2::FlatRowPtr flat_row, TupleRow* row, Status* status) {
+  HtData* htdata = InsertInternal(ht_ctx, status);
   // If successful insert, update the contents of the newly inserted entry with 'idx'.
   if (LIKELY(htdata != NULL)) {
     if (stores_tuples()) {
       htdata->tuple = row->GetTuple(0);
     } else {
-      htdata->idx = idx;
+      htdata->flat_row = flat_row;
     }
     return true;
   }
@@ -213,7 +214,8 @@ inline HashTable::DuplicateNode* HashTable::AppendNextNode(Bucket* bucket) {
   return next_node_++;
 }
 
-inline HashTable::DuplicateNode* HashTable::InsertDuplicateNode(int64_t bucket_idx) {
+inline HashTable::DuplicateNode* HashTable::InsertDuplicateNode(
+    int64_t bucket_idx, Status* status) {
   DCHECK_GE(bucket_idx, 0);
   DCHECK_LT(bucket_idx, num_buckets_);
   Bucket* bucket = &buckets_[bucket_idx];
@@ -222,12 +224,12 @@ inline HashTable::DuplicateNode* HashTable::InsertDuplicateNode(int64_t bucket_i
   // Allocate one duplicate node for the new data and one for the preexisting data,
   // if needed.
   while (node_remaining_current_page_ < 1 + !bucket->hasDuplicates) {
-    if (UNLIKELY(!GrowNodeArray())) return NULL;
+    if (UNLIKELY(!GrowNodeArray(status))) return NULL;
   }
   if (!bucket->hasDuplicates) {
     // This is the first duplicate in this bucket. It means that we need to convert
     // the current entry in the bucket to a node and link it from the bucket.
-    next_node_->htdata.idx = bucket->bucketData.htdata.idx;
+    next_node_->htdata.flat_row = bucket->bucketData.htdata.flat_row;
     DCHECK(!bucket->matched);
     next_node_->matched = false;
     next_node_->next = NULL;
@@ -246,7 +248,7 @@ inline TupleRow* IR_ALWAYS_INLINE HashTable::GetRow(HtData& htdata, TupleRow* ro
     return reinterpret_cast<TupleRow*>(&htdata.tuple);
   } else {
     // TODO: GetTupleRow() has interpreted code that iterates over the row's descriptor.
-    tuple_stream_->GetTupleRow(htdata.idx, row);
+    tuple_stream_->GetTupleRow(htdata.flat_row, row);
     return row;
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/nested-loop-join-builder.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/nested-loop-join-builder.cc b/be/src/exec/nested-loop-join-builder.cc
index 67e6ed6..fdd94ee 100644
--- a/be/src/exec/nested-loop-join-builder.cc
+++ b/be/src/exec/nested-loop-join-builder.cc
@@ -45,8 +45,7 @@ Status NljBuilder::Send(RuntimeState* state, RowBatch* batch) {
   build_batch->AcquireState(batch);
 
   AddBuildBatch(build_batch);
-  if (build_batch->needs_deep_copy() || build_batch->num_blocks() > 0
-      || build_batch->num_buffers() > 0) {
+  if (build_batch->needs_deep_copy() || build_batch->num_buffers() > 0) {
     // This batch and earlier batches may refer to resources passed from the child
     // that aren't owned by the row batch itself. Deep copying ensures that the row
     // batches are backed by memory owned by this node that is safe to hold on to.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partial-sort-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partial-sort-node.cc b/be/src/exec/partial-sort-node.cc
index 4f485d5..88b2f26 100644
--- a/be/src/exec/partial-sort-node.cc
+++ b/be/src/exec/partial-sort-node.cc
@@ -58,8 +58,10 @@ Status PartialSortNode::Prepare(RuntimeState* state) {
   RETURN_IF_ERROR(ExecNode::Prepare(state));
   less_than_.reset(new TupleRowComparator(ordering_exprs_, is_asc_order_, nulls_first_));
   sorter_.reset(new Sorter(*less_than_, sort_tuple_exprs_, &row_descriptor_,
-      mem_tracker(), runtime_profile(), state, false));
+      mem_tracker(), &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+      runtime_profile(), state, id(), false));
   RETURN_IF_ERROR(sorter_->Prepare(pool_, expr_mem_pool()));
+  DCHECK_GE(resource_profile_.min_reservation, sorter_->ComputeMinReservation());
   AddCodegenDisabledMessage(state);
   input_batch_.reset(
       new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker()));
@@ -81,6 +83,9 @@ Status PartialSortNode::Open(RuntimeState* state) {
   RETURN_IF_CANCELLED(state);
   RETURN_IF_ERROR(QueryMaintenance(state));
   RETURN_IF_ERROR(child(0)->Open(state));
+  if (!buffer_pool_client_.is_registered()) {
+    RETURN_IF_ERROR(ClaimBufferReservation(state));
+  }
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partial-sort-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partial-sort-node.h b/be/src/exec/partial-sort-node.h
index ab4c547..d40d653 100644
--- a/be/src/exec/partial-sort-node.h
+++ b/be/src/exec/partial-sort-node.h
@@ -19,7 +19,6 @@
 #define IMPALA_EXEC_PARTIAL_SORT_NODE_H
 
 #include "exec/exec-node.h"
-#include "runtime/buffered-block-mgr.h"
 #include "runtime/sorter.h"
 
 namespace impala {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-aggregation-node-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node-ir.cc b/be/src/exec/partitioned-aggregation-node-ir.cc
index cd5d336..126a2a5 100644
--- a/be/src/exec/partitioned-aggregation-node-ir.cc
+++ b/be/src/exec/partitioned-aggregation-node-ir.cc
@@ -21,7 +21,7 @@
 #include "exprs/agg-fn-evaluator.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
 #include "runtime/row-batch.h"
 #include "runtime/tuple-row.h"
 
@@ -46,7 +46,8 @@ Status PartitionedAggregationNode::ProcessBatch(RowBatch* batch,
   // will end up to the same partition.
   // TODO: Once we have a histogram with the number of rows per partition, we will have
   // accurate resize calls.
-  RETURN_IF_ERROR(CheckAndResizeHashPartitions(batch->num_rows(), ht_ctx));
+  RETURN_IF_ERROR(
+      CheckAndResizeHashPartitions(AGGREGATED_ROWS, batch->num_rows(), ht_ctx));
 
   HashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache();
   const int cache_size = expr_vals_cache->capacity();
@@ -108,6 +109,7 @@ Status PartitionedAggregationNode::ProcessRow(TupleRow* __restrict__ row,
   // so we can try again to insert the row.
   HashTable* hash_tbl = GetHashTable(partition_idx);
   Partition* dst_partition = hash_partitions_[partition_idx];
+  DCHECK(dst_partition != nullptr);
   DCHECK_EQ(dst_partition->is_spilled(), hash_tbl == NULL);
   if (hash_tbl == NULL) {
     // This partition is already spilled, just append the row.
@@ -155,24 +157,13 @@ Status PartitionedAggregationNode::AddIntermediateTuple(Partition* __restrict__
     }
 
     // We did not have enough memory to add intermediate_tuple to the stream.
-    RETURN_IF_ERROR(SpillPartition());
+    RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS));
     if (partition->is_spilled()) {
       return AppendSpilledRow<AGGREGATED_ROWS>(partition, row);
     }
   }
 }
 
-template<bool AGGREGATED_ROWS>
-Status PartitionedAggregationNode::AppendSpilledRow(Partition* __restrict__ partition,
-    TupleRow* __restrict__ row) {
-  DCHECK(!is_streaming_preagg_);
-  DCHECK(partition->is_spilled());
-  BufferedTupleStream* stream = AGGREGATED_ROWS ?
-      partition->aggregated_row_stream.get() :
-      partition->unaggregated_row_stream.get();
-  return AppendSpilledRow(stream, row);
-}
-
 Status PartitionedAggregationNode::ProcessBatchStreaming(bool needs_serialize,
     TPrefetchMode::type prefetch_mode, RowBatch* in_batch, RowBatch* out_batch,
     HashTableCtx* __restrict__ ht_ctx, int remaining_capacity[PARTITION_FANOUT]) {
@@ -230,6 +221,7 @@ bool PartitionedAggregationNode::TryAddToHashTable(
   DCHECK(remaining_capacity != NULL);
   DCHECK_EQ(hash_tbl, partition->hash_tbl.get());
   DCHECK_GE(*remaining_capacity, 0);
+  if (hash_tbl == nullptr) return false; // Hash table was not created - pass through.
   bool found;
   // This is called from ProcessBatchStreaming() so the rows are not aggregated.
   HashTable::Iterator it = hash_tbl->FindBuildRowBucket(ht_ctx, &found);


[09/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-builder.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder.h b/be/src/exec/partitioned-hash-join-builder.h
index e0393b5..912613d 100644
--- a/be/src/exec/partitioned-hash-join-builder.h
+++ b/be/src/exec/partitioned-hash-join-builder.h
@@ -26,8 +26,9 @@
 #include "exec/data-sink.h"
 #include "exec/filter-context.h"
 #include "exec/hash-table.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.h"
+#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/suballocator.h"
 
 #include "gen-cpp/PlanNodes_types.h"
 
@@ -56,7 +57,7 @@ class ScalarExprEvaluator;
 /// RepartitionBuildInput() to repartition a level n partition into multiple level n + 1
 /// partitions.
 ///
-/// Both the PartitionedHashJoinNode and the builder share a BufferedBlockMgr client
+/// Both the PartitionedHashJoinNode and the builder share a BufferPool client
 /// and the corresponding reservations. Different stages of the spilling algorithm
 /// require different mixes of build and probe buffers and hash tables, so we can
 /// share the reservation to minimize the combined memory requirement. Initial probe-side
@@ -72,7 +73,8 @@ class PhjBuilder : public DataSink {
   class Partition;
 
   PhjBuilder(int join_node_id, TJoinOp::type join_op, const RowDescriptor* probe_row_desc,
-      const RowDescriptor* build_row_desc, RuntimeState* state);
+      const RowDescriptor* build_row_desc, RuntimeState* state,
+      BufferPool::ClientHandle* buffer_pool_client, int64_t spillable_buffer_size);
 
   Status InitExprsAndFilters(RuntimeState* state,
       const std::vector<TEqJoinCondition>& eq_join_conjuncts,
@@ -101,7 +103,7 @@ class PhjBuilder : public DataSink {
   /// Transfer ownership of the probe streams to the caller. One stream was allocated per
   /// spilled partition in FlushFinal(). The probe streams are empty but prepared for
   /// writing with a write buffer allocated.
-  std::vector<std::unique_ptr<BufferedTupleStream>> TransferProbeStreams();
+  std::vector<std::unique_ptr<BufferedTupleStreamV2>> TransferProbeStreams();
 
   /// Clears the current list of hash partitions. Called after probing of the partitions
   /// is done. The partitions are not closed or destroyed, since they may be spilled or
@@ -122,7 +124,7 @@ class PhjBuilder : public DataSink {
   /// 'input_probe_rows' for reading in "delete_on_read" mode, so that the probe phase
   /// has enough buffers preallocated to execute successfully.
   Status RepartitionBuildInput(Partition* input_partition, int level,
-      BufferedTupleStream* input_probe_rows) WARN_UNUSED_RESULT;
+      BufferedTupleStreamV2* input_probe_rows) WARN_UNUSED_RESULT;
 
   /// Returns the largest build row count out of the current hash partitions.
   int64_t LargestPartitionRows() const;
@@ -132,7 +134,6 @@ class PhjBuilder : public DataSink {
   bool HashTableStoresNulls() const;
 
   /// Accessor functions, mainly required to expose state to PartitionedHashJoinNode.
-  inline BufferedBlockMgr::Client* block_mgr_client() const { return block_mgr_client_; }
   inline bool non_empty_build() const { return non_empty_build_; }
   inline const std::vector<bool>& is_not_distinct_from() const {
     return is_not_distinct_from_;
@@ -200,24 +201,27 @@ class PhjBuilder : public DataSink {
 
     /// Spills this partition, the partition's stream is unpinned with 'mode' and
     /// its hash table is destroyed if it was built.
-    Status Spill(BufferedTupleStream::UnpinMode mode) WARN_UNUSED_RESULT;
+    Status Spill(BufferedTupleStreamV2::UnpinMode mode) WARN_UNUSED_RESULT;
 
     bool ALWAYS_INLINE IsClosed() const { return build_rows_ == NULL; }
-    BufferedTupleStream* ALWAYS_INLINE build_rows() { return build_rows_.get(); }
+    BufferedTupleStreamV2* ALWAYS_INLINE build_rows() { return build_rows_.get(); }
     HashTable* ALWAYS_INLINE hash_tbl() const { return hash_tbl_.get(); }
     bool ALWAYS_INLINE is_spilled() const { return is_spilled_; }
     int ALWAYS_INLINE level() const { return level_; }
 
    private:
-    /// Inserts each row in 'batch' into 'hash_tbl_' using 'ctx'. 'indices' is an array
-    /// containing the index of each row's index into the hash table's tuple stream.
+    /// Inserts each row in 'batch' into 'hash_tbl_' using 'ctx'. 'flat_rows' is an array
+    /// containing the rows in the hash table's tuple stream.
     /// 'prefetch_mode' is the prefetching mode in use. If it's not PREFETCH_NONE, hash
     /// table buckets which the rows hashes to will be prefetched. This parameter is
     /// replaced with a constant during codegen time. This function may be replaced with
     /// a codegen'd version. Returns true if all rows in 'batch' are successfully
-    /// inserted.
+    /// inserted and false otherwise. If inserting failed, 'status' indicates why it
+    /// failed: if 'status' is ok, inserting failed because not enough reservation
+    /// was available and if 'status' is an error, inserting failed because of that error.
     bool InsertBatch(TPrefetchMode::type prefetch_mode, HashTableCtx* ctx,
-        RowBatch* batch, const std::vector<BufferedTupleStream::RowIdx>& indices);
+        RowBatch* batch, const std::vector<BufferedTupleStreamV2::FlatRowPtr>& flat_rows,
+        Status* status);
 
     const PhjBuilder* parent_;
 
@@ -235,16 +239,9 @@ class PhjBuilder : public DataSink {
     /// Stream of build tuples in this partition. Initially owned by this object but
     /// transferred to the parent exec node (via the row batch) when the partition
     /// is closed. If NULL, ownership has been transferred and the partition is closed.
-    std::unique_ptr<BufferedTupleStream> build_rows_;
+    std::unique_ptr<BufferedTupleStreamV2> build_rows_;
   };
 
- protected:
-  /// Init() function inherited from DataSink. Overridden to be a no-op for now.
-  /// TODO: Merge with InitExprsAndFilters() once this class becomes a true data sink.
-  virtual Status Init(const std::vector<TExpr>& thrift_output_exprs,
-      const TDataSink& tsink, RuntimeState* state) override;
-
- private:
   /// Computes the minimum number of buffers required to execute the spilling partitioned
   /// hash algorithm successfully for any input size (assuming enough disk space is
   /// available for spilled rows). The buffers are used for buffering both build and
@@ -255,15 +252,22 @@ class PhjBuilder : public DataSink {
   /// For NAAJ, we need 3 additional buffers for 'null_aware_partition_',
   /// 'null_aware_probe_partition_' and 'null_probe_rows_'.
   int MinRequiredBuffers() const {
-    // Must be kept in sync with HashJoinNode.computeResourceProfile() in fe.
+    // Must be kept in sync with HashJoinNode.computeNodeResourceProfile() in fe.
     int num_reserved_buffers = PARTITION_FANOUT + 1;
     if (join_op_ == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) num_reserved_buffers += 3;
     return num_reserved_buffers;
   }
 
+ protected:
+  /// Init() function inherited from DataSink. Overridden to be a no-op for now.
+  /// TODO: Merge with InitExprsAndFilters() once this class becomes a true data sink.
+  virtual Status Init(const std::vector<TExpr>& thrift_output_exprs,
+      const TDataSink& tsink, RuntimeState* state) override;
+
   /// Free local allocations made from expr evaluators during hash table construction.
   void FreeLocalAllocations() const;
 
+ private:
   /// Create and initialize a set of hash partitions for partitioning level 'level'.
   /// The previous hash partitions must have been cleared with ClearHashPartitions().
   /// After calling this, batches are added to the new partitions by calling Send().
@@ -284,19 +288,19 @@ class PhjBuilder : public DataSink {
   /// partitions. This odd return convention is used to avoid emitting unnecessary code
   /// for ~Status in perf-critical code.
   bool AppendRow(
-      BufferedTupleStream* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
+      BufferedTupleStreamV2* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
 
   /// Slow path for AppendRow() above. It is called when the stream has failed to append
   /// the row. We need to find more memory by either switching to IO-buffers, in case the
   /// stream still uses small buffers, or spilling a partition. Returns false and sets
   /// 'status' if it was unable to append the row, even after spilling partitions.
-  bool AppendRowStreamFull(BufferedTupleStream* stream, TupleRow* row,
+  bool AppendRowStreamFull(BufferedTupleStreamV2* stream, TupleRow* row,
       Status* status) noexcept WARN_UNUSED_RESULT;
 
   /// Frees memory by spilling one of the hash partitions. The 'mode' argument is passed
   /// to the Spill() call for the selected partition. The current policy is to spill the
   /// largest partition. Returns non-ok status if we couldn't spill a partition.
-  Status SpillPartition(BufferedTupleStream::UnpinMode mode) WARN_UNUSED_RESULT;
+  Status SpillPartition(BufferedTupleStreamV2::UnpinMode mode) WARN_UNUSED_RESULT;
 
   /// Tries to build hash tables for all unspilled hash partitions. Called after
   /// FlushFinal() when all build rows have been partitioned and added to the appropriate
@@ -358,14 +362,20 @@ class PhjBuilder : public DataSink {
   /// Pool for objects with same lifetime as builder.
   ObjectPool pool_;
 
-  /// Client to the buffered block mgr, used to allocate build partition buffers and hash
-  /// tables. When probing, the spilling algorithm keeps some build partitions in memory
-  /// while using memory for probe buffers for spilled partitions. To support dynamically
-  /// dividing memory between build and probe, this client is owned by the builder but
-  /// shared with the PartitionedHashJoinNode.
+  /// Client to the buffer pool, used to allocate build partition buffers and hash tables.
+  /// When probing, the spilling algorithm keeps some build partitions in memory while
+  /// using memory for probe buffers for spilled partitions. To support dynamically
+  /// dividing memory between build and probe, this client is shared between the builder
+  /// and the PartitionedHashJoinNode.
   /// TODO: this approach to sharing will not work for spilling broadcast joins with a
   /// 1:N relationship from builders to join nodes.
-  BufferedBlockMgr::Client* block_mgr_client_;
+  BufferPool::ClientHandle* buffer_pool_client_;
+
+  /// The size of buffers to use in the build and probe streams.
+  const int64_t spillable_buffer_size_;
+
+  /// Allocator for hash table memory.
+  boost::scoped_ptr<Suballocator> ht_allocator_;
 
   /// If true, the build side has at least one row.
   bool non_empty_build_;
@@ -454,7 +464,7 @@ class PhjBuilder : public DataSink {
   ///
   /// Because of this, at the end of the build phase, we always have sufficient memory
   /// to execute the probe phase of the algorithm without spilling more partitions.
-  std::vector<std::unique_ptr<BufferedTupleStream>> spilled_partition_probe_streams_;
+  std::vector<std::unique_ptr<BufferedTupleStreamV2>> spilled_partition_probe_streams_;
 
   /// END: Members that must be Reset()
   /////////////////////////////////////////
@@ -469,7 +479,7 @@ class PhjBuilder : public DataSink {
   ProcessBuildBatchFn process_build_batch_fn_level0_;
 
   typedef bool (*InsertBatchFn)(Partition*, TPrefetchMode::type, HashTableCtx*, RowBatch*,
-      const std::vector<BufferedTupleStream::RowIdx>&);
+      const std::vector<BufferedTupleStreamV2::FlatRowPtr>&, Status*);
   /// Jitted Partition::InsertBatch() function pointers. NULL if codegen is disabled.
   InsertBatchFn insert_batch_fn_;
   InsertBatchFn insert_batch_fn_level0_;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-node-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node-ir.cc b/be/src/exec/partitioned-hash-join-node-ir.cc
index 2c951d1..b890eb9 100644
--- a/be/src/exec/partitioned-hash-join-node-ir.cc
+++ b/be/src/exec/partitioned-hash-join-node-ir.cc
@@ -313,7 +313,7 @@ bool IR_ALWAYS_INLINE PartitionedHashJoinNode::NextProbeRow(
           // The partition is not in memory, spill the probe row and move to the next row.
           // Skip the current row if we manage to append to the spilled partition's BTS.
           // Otherwise, we need to bail out and report the failure.
-          BufferedTupleStream* probe_rows = probe_partition->probe_rows();
+          BufferedTupleStreamV2* probe_rows = probe_partition->probe_rows();
           if (UNLIKELY(!AppendProbeRow(probe_rows, current_probe_row_, status))) {
             DCHECK(!status->ok());
             return false;
@@ -438,9 +438,8 @@ int PartitionedHashJoinNode::ProcessProbeBatch(TPrefetchMode::type prefetch_mode
 }
 
 inline bool PartitionedHashJoinNode::AppendProbeRow(
-    BufferedTupleStream* stream, TupleRow* row, Status* status) {
-  DCHECK(stream->has_write_block());
-  DCHECK(!stream->using_small_buffers());
+    BufferedTupleStreamV2* stream, TupleRow* row, Status* status) {
+  DCHECK(stream->has_write_iterator());
   DCHECK(!stream->is_pinned());
   return stream->AddRow(row, status);
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.cc b/be/src/exec/partitioned-hash-join-node.cc
index a5c9897..2db9e00 100644
--- a/be/src/exec/partitioned-hash-join-node.cc
+++ b/be/src/exec/partitioned-hash-join-node.cc
@@ -27,8 +27,7 @@
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
 #include "exprs/slot-ref.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
@@ -47,9 +46,15 @@ static const string PREPARE_FOR_READ_FAILED_ERROR_MSG =
     "successfully.";
 
 using namespace impala;
-using namespace llvm;
-using namespace strings;
-using std::unique_ptr;
+using llvm::BasicBlock;
+using llvm::ConstantInt;
+using llvm::Function;
+using llvm::GlobalValue;
+using llvm::LLVMContext;
+using llvm::PointerType;
+using llvm::Type;
+using llvm::Value;
+using strings::Substitute;
 
 PartitionedHashJoinNode::PartitionedHashJoinNode(
     ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs)
@@ -77,8 +82,9 @@ Status PartitionedHashJoinNode::Init(const TPlanNode& tnode, RuntimeState* state
   // TODO: allow PhjBuilder to be the sink of a separate fragment. For now, PhjBuilder is
   // owned by this node, but duplicates some state (exprs, etc) in anticipation of it
   // being separated out further.
-  builder_.reset(
-      new PhjBuilder(id(), join_op_, child(0)->row_desc(), child(1)->row_desc(), state));
+  builder_.reset(new PhjBuilder(id(), join_op_, child(0)->row_desc(),
+        child(1)->row_desc(), state, &buffer_pool_client_,
+        resource_profile_.spillable_buffer_size));
   RETURN_IF_ERROR(
       builder_->InitExprsAndFilters(state, eq_join_conjuncts, tnode.runtime_filters));
 
@@ -177,6 +183,11 @@ Status PartitionedHashJoinNode::Open(RuntimeState* state) {
 }
 
 Status PartitionedHashJoinNode::AcquireResourcesForBuild(RuntimeState* state) {
+  DCHECK_GE(resource_profile_.min_reservation,
+      resource_profile_.spillable_buffer_size * builder_->MinRequiredBuffers());
+  if (!buffer_pool_client_.is_registered()) {
+    RETURN_IF_ERROR(ClaimBufferReservation(state));
+  }
   if (join_op_ == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) {
     // Initialize these partitions before doing the build so that the build does not
     // use the reservation intended for them.
@@ -254,12 +265,10 @@ void PartitionedHashJoinNode::Close(RuntimeState* state) {
 
 PartitionedHashJoinNode::ProbePartition::ProbePartition(RuntimeState* state,
     PartitionedHashJoinNode* parent, PhjBuilder::Partition* build_partition,
-    unique_ptr<BufferedTupleStream> probe_rows)
-  : parent_(parent),
-    build_partition_(build_partition),
+    unique_ptr<BufferedTupleStreamV2> probe_rows)
+  : build_partition_(build_partition),
     probe_rows_(std::move(probe_rows)) {
-  DCHECK(probe_rows_->has_write_block());
-  DCHECK(!probe_rows_->using_small_buffers());
+  DCHECK(probe_rows_->has_write_iterator());
   DCHECK(!probe_rows_->is_pinned());
 }
 
@@ -270,10 +279,7 @@ PartitionedHashJoinNode::ProbePartition::~ProbePartition() {
 Status PartitionedHashJoinNode::ProbePartition::PrepareForRead() {
   bool got_read_buffer;
   RETURN_IF_ERROR(probe_rows_->PrepareForRead(true, &got_read_buffer));
-  if (!got_read_buffer) {
-    return parent_->mem_tracker()->MemLimitExceeded(parent_->runtime_state_,
-        Substitute(PREPARE_FOR_READ_FAILED_ERROR_MSG, parent_->id_));
-  }
+  DCHECK(got_read_buffer) << "Accounted in min reservation";
   return Status::OK();
 }
 
@@ -322,7 +328,7 @@ Status PartitionedHashJoinNode::NextSpilledProbeRowBatch(
     probe_batch_pos_ = -1;
     return Status::OK();
   }
-  BufferedTupleStream* probe_rows = input_partition_->probe_rows();
+  BufferedTupleStreamV2* probe_rows = input_partition_->probe_rows();
   if (LIKELY(probe_rows->rows_returned() < probe_rows->num_rows())) {
     // Continue from the current probe stream.
     bool eos = false;
@@ -414,12 +420,11 @@ Status PartitionedHashJoinNode::PrepareSpilledPartitionForProbe(
     ht_ctx_->set_level(next_partition_level);
 
     // Spill to free memory from hash tables and pinned streams for use in new partitions.
-    RETURN_IF_ERROR(build_partition->Spill(BufferedTupleStream::UNPIN_ALL));
+    RETURN_IF_ERROR(build_partition->Spill(BufferedTupleStreamV2::UNPIN_ALL));
     // Temporarily free up the probe buffer to use when repartitioning.
-    RETURN_IF_ERROR(
-        input_partition_->probe_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL));
-    DCHECK_EQ(build_partition->build_rows()->blocks_pinned(), 0) << NodeDebugString();
-    DCHECK_EQ(input_partition_->probe_rows()->blocks_pinned(), 0) << NodeDebugString();
+    input_partition_->probe_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+    DCHECK_EQ(build_partition->build_rows()->BytesPinned(false), 0) << NodeDebugString();
+    DCHECK_EQ(input_partition_->probe_rows()->BytesPinned(false), 0) << NodeDebugString();
     int64_t num_input_rows = build_partition->build_rows()->num_rows();
     RETURN_IF_ERROR(builder_->RepartitionBuildInput(
         build_partition, next_partition_level, input_partition_->probe_rows()));
@@ -430,7 +435,8 @@ Status PartitionedHashJoinNode::PrepareSpilledPartitionForProbe(
                                                          "more rows than the input";
     if (UNLIKELY(num_input_rows == largest_partition_rows)) {
       return Status(TErrorCode::PARTITIONED_HASH_JOIN_REPARTITION_FAILS, id_,
-          next_partition_level, num_input_rows);
+          next_partition_level, num_input_rows, NodeDebugString(),
+          buffer_pool_client_.DebugString());
     }
 
     RETURN_IF_ERROR(PrepareForProbe());
@@ -816,18 +822,18 @@ static Status NullAwareAntiJoinError(bool build) {
 
 Status PartitionedHashJoinNode::InitNullAwareProbePartition() {
   RuntimeState* state = runtime_state_;
-  unique_ptr<BufferedTupleStream> probe_rows = std::make_unique<BufferedTupleStream>(
-      state, child(0)->row_desc(), state->block_mgr(), builder_->block_mgr_client(),
-      false /* use_initial_small_buffers */, false /* read_write */);
-  Status status = probe_rows->Init(id(), runtime_profile(), false);
+  unique_ptr<BufferedTupleStreamV2> probe_rows = make_unique<BufferedTupleStreamV2>(
+      state, child(0)->row_desc(), &buffer_pool_client_,
+      resource_profile_.spillable_buffer_size,
+      resource_profile_.spillable_buffer_size);
+  // TODO: this should be pinned if spilling is disabled.
+  Status status = probe_rows->Init(id(), false);
   if (!status.ok()) goto error;
   bool got_buffer;
   status = probe_rows->PrepareForWrite(&got_buffer);
   if (!status.ok()) goto error;
-  if (!got_buffer) {
-    status = state->block_mgr()->MemLimitTooLowError(builder_->block_mgr_client(), id());
-    goto error;
-  }
+  DCHECK(got_buffer)
+      << "Accounted in min reservation" << buffer_pool_client_.DebugString();
   null_aware_probe_partition_.reset(new ProbePartition(
       state, this, builder_->null_aware_partition(), std::move(probe_rows)));
   return Status::OK();
@@ -841,15 +847,15 @@ error:
 
 Status PartitionedHashJoinNode::InitNullProbeRows() {
   RuntimeState* state = runtime_state_;
-  null_probe_rows_ = std::make_unique<BufferedTupleStream>(state, child(0)->row_desc(),
-      state->block_mgr(), builder_->block_mgr_client(),
-      false /* use_initial_small_buffers */, false /* read_write */);
-  RETURN_IF_ERROR(null_probe_rows_->Init(id(), runtime_profile(), false));
+  null_probe_rows_ = make_unique<BufferedTupleStreamV2>(state, child(0)->row_desc(),
+      &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+      resource_profile_.spillable_buffer_size);
+  // TODO: we shouldn't start with this unpinned if spilling is disabled.
+  RETURN_IF_ERROR(null_probe_rows_->Init(id(), false));
   bool got_buffer;
   RETURN_IF_ERROR(null_probe_rows_->PrepareForWrite(&got_buffer));
-  if (!got_buffer) {
-    return state->block_mgr()->MemLimitTooLowError(builder_->block_mgr_client(), id());
-  }
+  DCHECK(got_buffer)
+      << "Accounted in min reservation" << buffer_pool_client_.DebugString();
   return Status::OK();
 }
 
@@ -860,8 +866,8 @@ Status PartitionedHashJoinNode::PrepareNullAwarePartition() {
   DCHECK_EQ(probe_batch_pos_, -1);
   DCHECK_EQ(probe_batch_->num_rows(), 0);
 
-  BufferedTupleStream* build_stream = builder_->null_aware_partition()->build_rows();
-  BufferedTupleStream* probe_stream = null_aware_probe_partition_->probe_rows();
+  BufferedTupleStreamV2* build_stream = builder_->null_aware_partition()->build_rows();
+  BufferedTupleStreamV2* probe_stream = null_aware_probe_partition_->probe_rows();
 
   if (build_stream->num_rows() == 0) {
     // There were no build rows. Nothing to do. Just prepare to output the null
@@ -874,7 +880,7 @@ Status PartitionedHashJoinNode::PrepareNullAwarePartition() {
 
   // Bring the entire spilled build stream into memory and read into a single batch.
   bool got_rows;
-  RETURN_IF_ERROR(build_stream->GetRows(&nulls_build_batch_, &got_rows));
+  RETURN_IF_ERROR(build_stream->GetRows(mem_tracker(), &nulls_build_batch_, &got_rows));
   if (!got_rows) return NullAwareAntiJoinError(true);
 
   // Initialize the streams for read.
@@ -898,7 +904,7 @@ Status PartitionedHashJoinNode::OutputNullAwareProbeRows(RuntimeState* state,
   int num_join_conjuncts = other_join_conjuncts_.size();
   DCHECK(probe_batch_ != NULL);
 
-  BufferedTupleStream* probe_stream = null_aware_probe_partition_->probe_rows();
+  BufferedTupleStreamV2* probe_stream = null_aware_probe_partition_->probe_rows();
   if (probe_batch_pos_ == probe_batch_->num_rows()) {
     probe_batch_pos_ = 0;
     probe_batch_->TransferResourceOwnership(out_batch);
@@ -946,7 +952,8 @@ Status PartitionedHashJoinNode::PrepareForProbe() {
   DCHECK(probe_hash_partitions_.empty());
 
   // Initialize the probe partitions, providing them with probe streams.
-  vector<unique_ptr<BufferedTupleStream>> probe_streams = builder_->TransferProbeStreams();
+  vector<unique_ptr<BufferedTupleStreamV2>> probe_streams =
+      builder_->TransferProbeStreams();
   probe_hash_partitions_.resize(PARTITION_FANOUT);
   for (int i = 0; i < PARTITION_FANOUT; ++i) {
     PhjBuilder::Partition* build_partition = builder_->hash_partition(i);
@@ -982,16 +989,16 @@ Status PartitionedHashJoinNode::PrepareForProbe() {
 }
 
 void PartitionedHashJoinNode::CreateProbePartition(
-    int partition_idx, unique_ptr<BufferedTupleStream> probe_rows) {
+    int partition_idx, unique_ptr<BufferedTupleStreamV2> probe_rows) {
   DCHECK_GE(partition_idx, 0);
   DCHECK_LT(partition_idx, probe_hash_partitions_.size());
   DCHECK(probe_hash_partitions_[partition_idx] == NULL);
-  probe_hash_partitions_[partition_idx] = std::make_unique<ProbePartition>(runtime_state_,
+  probe_hash_partitions_[partition_idx] = make_unique<ProbePartition>(runtime_state_,
       this, builder_->hash_partition(partition_idx), std::move(probe_rows));
 }
 
 Status PartitionedHashJoinNode::EvaluateNullProbe(
-    RuntimeState* state, BufferedTupleStream* build) {
+    RuntimeState* state, BufferedTupleStreamV2* build) {
   if (null_probe_rows_ == NULL || null_probe_rows_->num_rows() == 0) {
     return Status::OK();
   }
@@ -1000,10 +1007,10 @@ Status PartitionedHashJoinNode::EvaluateNullProbe(
   // Bring both the build and probe side into memory and do a pairwise evaluation.
   bool got_rows;
   scoped_ptr<RowBatch> build_rows;
-  RETURN_IF_ERROR(build->GetRows(&build_rows, &got_rows));
+  RETURN_IF_ERROR(build->GetRows(mem_tracker(), &build_rows, &got_rows));
   if (!got_rows) return NullAwareAntiJoinError(true);
   scoped_ptr<RowBatch> probe_rows;
-  RETURN_IF_ERROR(null_probe_rows_->GetRows(&probe_rows, &got_rows));
+  RETURN_IF_ERROR(null_probe_rows_->GetRows(mem_tracker(), &probe_rows, &got_rows));
   if (!got_rows) return NullAwareAntiJoinError(false);
 
   ScalarExprEvaluator* const* join_conjunct_evals = other_join_conjunct_evals_.data();
@@ -1060,11 +1067,9 @@ Status PartitionedHashJoinNode::CleanUpHashPartitions(
       // can recurse the algorithm and create new hash partitions from spilled partitions.
       // TODO: we shouldn't need to unpin the build stream if we stop spilling
       // while probing.
-      RETURN_IF_ERROR(
-          build_partition->build_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL));
-      DCHECK_EQ(build_partition->build_rows()->blocks_pinned(), 0);
-      RETURN_IF_ERROR(
-          probe_partition->probe_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL));
+      build_partition->build_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+      DCHECK_EQ(build_partition->build_rows()->BytesPinned(false), 0);
+      probe_partition->probe_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
 
       if (probe_partition->probe_rows()->num_rows() != 0
           || NeedToProcessUnmatchedBuildRows()) {
@@ -1102,9 +1107,9 @@ Status PartitionedHashJoinNode::CleanUpHashPartitions(
 
   // Just finished evaluating the null probe rows with all the non-spilled build
   // partitions. Unpin this now to free this memory for repartitioning.
-  if (null_probe_rows_ != NULL)
-    RETURN_IF_ERROR(
-        null_probe_rows_->UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
+  if (null_probe_rows_ != NULL) {
+    null_probe_rows_->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
+  }
 
   builder_->ClearHashPartitions();
   probe_hash_partitions_.clear();
@@ -1165,10 +1170,10 @@ string PartitionedHashJoinNode::NodeDebugString() const {
     ss << "  Probe hash partition " << i << ": ";
     if (probe_partition != NULL) {
       ss << "probe ptr=" << probe_partition;
-      BufferedTupleStream* probe_rows = probe_partition->probe_rows();
+      BufferedTupleStreamV2* probe_rows = probe_partition->probe_rows();
       if (probe_rows != NULL) {
-         ss << "    Probe Rows: " << probe_rows->num_rows()
-            << "    (Blocks pinned: " << probe_rows->blocks_pinned() << ")";
+        ss << "    Probe Rows: " << probe_rows->num_rows()
+           << "    (Bytes pinned: " << probe_rows->BytesPinned(false) << ")";
       }
     }
     ss << endl;
@@ -1189,12 +1194,15 @@ string PartitionedHashJoinNode::NodeDebugString() const {
     }
   }
   if (input_partition_ != NULL) {
-    DCHECK(input_partition_->build_partition()->build_rows() != NULL);
     DCHECK(input_partition_->probe_rows() != NULL);
-    ss << "InputPartition: " << input_partition_.get() << endl
-       << "   Spilled Build Rows: "
-       << input_partition_->build_partition()->build_rows()->num_rows() << endl
-       << "   Spilled Probe Rows: " << input_partition_->probe_rows()->num_rows() << endl;
+    ss << "InputPartition: " << input_partition_.get() << endl;
+    PhjBuilder::Partition* build_partition = input_partition_->build_partition();
+    if (build_partition->IsClosed()) {
+      ss << "   Build Partition Closed" << endl;
+    } else {
+      ss << "   Build Rows: " << build_partition->build_rows()->num_rows() << endl;
+    }
+    ss << "   Probe Rows: " << input_partition_->probe_rows()->num_rows() << endl;
   } else {
     ss << "InputPartition: NULL" << endl;
   }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.h b/be/src/exec/partitioned-hash-join-node.h
index 73e0dd5..b3f663e 100644
--- a/be/src/exec/partitioned-hash-join-node.h
+++ b/be/src/exec/partitioned-hash-join-node.h
@@ -15,28 +15,24 @@
 // specific language governing permissions and limitations
 // under the License.
 
-
 #ifndef IMPALA_EXEC_PARTITIONED_HASH_JOIN_NODE_H
 #define IMPALA_EXEC_PARTITIONED_HASH_JOIN_NODE_H
 
-#include <boost/scoped_ptr.hpp>
-#include <boost/thread.hpp>
 #include <list>
 #include <memory>
 #include <string>
+#include <boost/scoped_ptr.hpp>
+#include <boost/thread.hpp>
 
 #include "exec/blocking-join-node.h"
 #include "exec/exec-node.h"
 #include "exec/partitioned-hash-join-builder.h"
-#include "runtime/buffered-block-mgr.h"
 
 #include "gen-cpp/Types_types.h"
 
 namespace impala {
 
 class BloomFilter;
-class BufferedBlockMgr;
-class BufferedTupleStream;
 class MemPool;
 class RowBatch;
 class RuntimeFilter;
@@ -100,8 +96,6 @@ class TupleRow;
 /// NULLs into several different streams, which are processed in a separate step to
 /// produce additional output rows. The NAAJ algorithm is documented in more detail in
 /// header comments for the null aware functions and data structures.
-///
-/// TODO: don't copy tuple rows so often.
 class PartitionedHashJoinNode : public BlockingJoinNode {
  public:
   PartitionedHashJoinNode(ObjectPool* pool, const TPlanNode& tnode,
@@ -168,7 +162,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
   /// Creates an initialized probe partition at 'partition_idx' in
   /// 'probe_hash_partitions_'.
   void CreateProbePartition(
-      int partition_idx, std::unique_ptr<BufferedTupleStream> probe_rows);
+      int partition_idx, std::unique_ptr<BufferedTupleStreamV2> probe_rows);
 
   /// Append the probe row 'row' to 'stream'. The stream must be unpinned and must have
   /// a write buffer allocated, so this will succeed unless an error is encountered.
@@ -176,7 +170,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
   /// return convention is used to avoid emitting unnecessary code for ~Status in perf-
   /// critical code.
   bool AppendProbeRow(
-      BufferedTupleStream* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
+      BufferedTupleStreamV2* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
 
   /// Probes the hash table for rows matching the current probe row and appends
   /// all the matching build rows (with probe row) to output batch. Returns true
@@ -331,7 +325,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
   /// conjuncts pass (i.e. there is a match).
   /// This is used for NAAJ, when there are NULL probe rows.
   Status EvaluateNullProbe(
-      RuntimeState* state, BufferedTupleStream* build) WARN_UNUSED_RESULT;
+      RuntimeState* state, BufferedTupleStreamV2* build) WARN_UNUSED_RESULT;
 
   /// Prepares to output NULLs on the probe side for NAAJ. Before calling this,
   /// matched_null_probe_ should have been fully evaluated.
@@ -478,7 +472,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
 
   /// For NAAJ, this stream contains all probe rows that had NULL on the hash table
   /// conjuncts. Must be unique_ptr so we can release it and transfer to output batches.
-  std::unique_ptr<BufferedTupleStream> null_probe_rows_;
+  std::unique_ptr<BufferedTupleStreamV2> null_probe_rows_;
 
   /// For each row in null_probe_rows_, true if this row has matched any build row
   /// (i.e. the resulting joined row passes other_join_conjuncts).
@@ -510,7 +504,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
     /// that has been prepared for writing with an I/O-sized write buffer.
     ProbePartition(RuntimeState* state, PartitionedHashJoinNode* parent,
         PhjBuilder::Partition* build_partition,
-        std::unique_ptr<BufferedTupleStream> probe_rows);
+        std::unique_ptr<BufferedTupleStreamV2> probe_rows);
     ~ProbePartition();
 
     /// Prepare to read the probe rows. Allocates the first read block, so reads will
@@ -523,21 +517,19 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
     /// resources if 'batch' is NULL. Idempotent.
     void Close(RowBatch* batch);
 
-    BufferedTupleStream* ALWAYS_INLINE probe_rows() { return probe_rows_.get(); }
+    BufferedTupleStreamV2* ALWAYS_INLINE probe_rows() { return probe_rows_.get(); }
     PhjBuilder::Partition* build_partition() { return build_partition_; }
 
     inline bool IsClosed() const { return probe_rows_ == NULL; }
 
    private:
-    PartitionedHashJoinNode* parent_;
-
     /// The corresponding build partition. Not NULL. Owned by PhjBuilder.
     PhjBuilder::Partition* build_partition_;
 
     /// Stream of probe tuples in this partition. Initially owned by this object but
     /// transferred to the parent exec node (via the row batch) when the partition
     /// is complete. If NULL, ownership was transferred and the partition is closed.
-    std::unique_ptr<BufferedTupleStream> probe_rows_;
+    std::unique_ptr<BufferedTupleStreamV2> probe_rows_;
   };
 
   /// For the below codegen'd functions, xxx_fn_level0_ uses CRC hashing when available

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-node.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.inline.h b/be/src/exec/partitioned-hash-join-node.inline.h
index a53b40e..3441aac 100644
--- a/be/src/exec/partitioned-hash-join-node.inline.h
+++ b/be/src/exec/partitioned-hash-join-node.inline.h
@@ -20,7 +20,7 @@
 
 #include "exec/partitioned-hash-join-node.h"
 
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
 
 namespace impala {
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/sort-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/sort-node.cc b/be/src/exec/sort-node.cc
index fd42124..440f809 100644
--- a/be/src/exec/sort-node.cc
+++ b/be/src/exec/sort-node.cc
@@ -52,9 +52,12 @@ Status SortNode::Prepare(RuntimeState* state) {
   SCOPED_TIMER(runtime_profile_->total_time_counter());
   RETURN_IF_ERROR(ExecNode::Prepare(state));
   less_than_.reset(new TupleRowComparator(ordering_exprs_, is_asc_order_, nulls_first_));
-  sorter_.reset(new Sorter(*less_than_, sort_tuple_exprs_,
-      &row_descriptor_, mem_tracker(), runtime_profile(), state));
+  sorter_.reset(
+      new Sorter(*less_than_, sort_tuple_exprs_, &row_descriptor_, mem_tracker(),
+          &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+          runtime_profile(), state, id(), true));
   RETURN_IF_ERROR(sorter_->Prepare(pool_, expr_mem_pool()));
+  DCHECK_GE(resource_profile_.min_reservation, sorter_->ComputeMinReservation());
   AddCodegenDisabledMessage(state);
   return Status::OK();
 }
@@ -69,9 +72,13 @@ void SortNode::Codegen(RuntimeState* state) {
 
 Status SortNode::Open(RuntimeState* state) {
   SCOPED_TIMER(runtime_profile_->total_time_counter());
-  // Open the child before consuming resources in this node.
-  RETURN_IF_ERROR(child(0)->Open(state));
   RETURN_IF_ERROR(ExecNode::Open(state));
+  RETURN_IF_ERROR(child(0)->Open(state));
+  // Claim reservation after the child has been opened to reduce the peak reservation
+  // requirement.
+  if (!buffer_pool_client_.is_registered()) {
+    RETURN_IF_ERROR(ClaimBufferReservation(state));
+  }
   RETURN_IF_ERROR(less_than_->Open(pool_, state, expr_mem_pool()));
   RETURN_IF_ERROR(sorter_->Open());
   RETURN_IF_CANCELLED(state);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/sort-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/sort-node.h b/be/src/exec/sort-node.h
index 8b3de11..a11d424 100644
--- a/be/src/exec/sort-node.h
+++ b/be/src/exec/sort-node.h
@@ -20,13 +20,12 @@
 
 #include "exec/exec-node.h"
 #include "runtime/sorter.h"
-#include "runtime/buffered-block-mgr.h"
 
 namespace impala {
 
 /// Node that implements a full sort of its input with a fixed memory budget, spilling
 /// to disk if the input is larger than available memory.
-/// Uses Sorter and BufferedBlockMgr for the external sort implementation.
+/// Uses Sorter for the external sort implementation.
 /// Input rows to SortNode are materialized by the Sorter into a single tuple
 /// using the expressions specified in sort_tuple_exprs_.
 /// In GetNext(), SortNode passes in the output batch to the sorter instance created

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index 2de0f2e..92af968 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -24,8 +24,6 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime")
 set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime")
 
 add_library(Runtime
-  buffered-block-mgr.cc
-  buffered-tuple-stream.cc
   buffered-tuple-stream-v2.cc
   client-cache.cc
   coordinator.cc
@@ -45,6 +43,7 @@ add_library(Runtime
   hbase-table.cc
   hbase-table-factory.cc
   hdfs-fs-cache.cc
+  initial-reservations.cc
   lib-cache.cc
   mem-tracker.cc
   mem-pool.cc
@@ -83,7 +82,6 @@ ADD_BE_TEST(string-buffer-test)
 ADD_BE_TEST(data-stream-test)
 ADD_BE_TEST(timestamp-test)
 ADD_BE_TEST(disk-io-mgr-test)
-ADD_BE_TEST(buffered-block-mgr-test)
 ADD_BE_TEST(parallel-executor-test)
 ADD_BE_TEST(raw-value-test)
 ADD_BE_TEST(string-compare-test)
@@ -93,7 +91,6 @@ ADD_BE_TEST(thread-resource-mgr-test)
 ADD_BE_TEST(mem-tracker-test)
 ADD_BE_TEST(multi-precision-test)
 ADD_BE_TEST(decimal-test)
-ADD_BE_TEST(buffered-tuple-stream-test)
 ADD_BE_TEST(buffered-tuple-stream-v2-test)
 ADD_BE_TEST(hdfs-fs-cache-test)
 ADD_BE_TEST(tmp-file-mgr-test)



[02/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test b/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
index 8e8ddc0..54ce9b0 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
@@ -5,7 +5,7 @@ on ss_customer_sk = c_customer_sk
 where c_salutation = 'Mrs.'
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=180.46MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=180.46MB mem-reservation=8.50MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -13,7 +13,7 @@ PLAN-ROOT SINK
 |  hash predicates: ss_customer_sk = c_customer_sk
 |  fk/pk conjuncts: ss_customer_sk = c_customer_sk
 |  runtime filters: RF000 <- c_customer_sk
-|  mem-estimate=4.46MB mem-reservation=136.00MB
+|  mem-estimate=4.46MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=0,1 row-size=355B cardinality=529700
 |
 |--01:SCAN HDFS [tpcds.customer]
@@ -43,7 +43,7 @@ on ss_customer_sk = c_customer_sk
 where c_salutation = 'Mrs.'
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=180.46MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=180.46MB mem-reservation=8.50MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -51,7 +51,7 @@ PLAN-ROOT SINK
 |  hash predicates: ss_customer_sk = c_customer_sk
 |  fk/pk conjuncts: ss_customer_sk = c_customer_sk
 |  other predicates: c_salutation = 'Mrs.'
-|  mem-estimate=4.46MB mem-reservation=136.00MB
+|  mem-estimate=4.46MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=0,1N row-size=355B cardinality=2880404
 |
 |--01:SCAN HDFS [tpcds.customer]
@@ -80,7 +80,7 @@ on ss_customer_sk = c_customer_sk
 where c_salutation = 'Mrs.'
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=180.46MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=180.46MB mem-reservation=8.50MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -88,7 +88,7 @@ PLAN-ROOT SINK
 |  hash predicates: ss_customer_sk = c_customer_sk
 |  fk/pk conjuncts: ss_customer_sk = c_customer_sk
 |  runtime filters: RF000 <- c_customer_sk
-|  mem-estimate=4.46MB mem-reservation=136.00MB
+|  mem-estimate=4.46MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=0N,1 row-size=355B cardinality=529700
 |
 |--01:SCAN HDFS [tpcds.customer]
@@ -117,7 +117,7 @@ on ss_item_sk = sr_item_sk and ss_ticket_number = sr_ticket_number
 where sr_return_quantity < 10
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=210.65MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=210.65MB mem-reservation=4.25MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -125,7 +125,7 @@ PLAN-ROOT SINK
 |  hash predicates: ss_item_sk = sr_item_sk, ss_ticket_number = sr_ticket_number
 |  fk/pk conjuncts: ss_item_sk = sr_item_sk, ss_ticket_number = sr_ticket_number
 |  runtime filters: RF000 <- sr_item_sk, RF001 <- sr_ticket_number
-|  mem-estimate=2.65MB mem-reservation=136.00MB
+|  mem-estimate=2.65MB mem-reservation=4.25MB spill-buffer=256.00KB
 |  tuple-ids=0,1 row-size=188B cardinality=211838
 |
 |--01:SCAN HDFS [tpcds.store_returns]
@@ -153,7 +153,7 @@ tpcds.store_sales inner join tpcds.web_sales
 on ss_sold_time_sk = ws_sold_time_sk
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=396.67MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=396.67MB mem-reservation=34.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -161,7 +161,7 @@ PLAN-ROOT SINK
 |  hash predicates: ss_sold_time_sk = ws_sold_time_sk
 |  fk/pk conjuncts: none
 |  runtime filters: RF000 <- ws_sold_time_sk
-|  mem-estimate=108.67MB mem-reservation=136.00MB
+|  mem-estimate=108.67MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=244B cardinality=44136418
 |
 |--01:SCAN HDFS [tpcds.web_sales]
@@ -188,7 +188,7 @@ on a.d_date_sk = b.d_date_sk
 where a.d_holiday = "Y"
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=107.62MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=107.62MB mem-reservation=17.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -196,7 +196,7 @@ PLAN-ROOT SINK
 |  hash predicates: b.d_date_sk = a.d_date_sk
 |  fk/pk conjuncts: b.d_date_sk = a.d_date_sk
 |  runtime filters: RF000 <- a.d_date_sk
-|  mem-estimate=11.62MB mem-reservation=136.00MB
+|  mem-estimate=11.62MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=1,0 row-size=606B cardinality=36525
 |
 |--00:SCAN HDFS [tpcds.date_dim a]
@@ -229,7 +229,7 @@ where ss_item_sk = sr_item_sk and ss_ticket_number = sr_ticket_number
   and d1.d_fy_week_seq = 1000
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=352.73MB mem-reservation=544.00MB
+|  Per-Host Resources: mem-estimate=352.73MB mem-reservation=4.25MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -237,7 +237,7 @@ PLAN-ROOT SINK
 |  hash predicates: ss_addr_sk = c_current_addr_sk
 |  fk/pk conjuncts: none
 |  runtime filters: RF000 <- c_current_addr_sk
-|  mem-estimate=429.69KB mem-reservation=136.00MB
+|  mem-estimate=429.69KB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=1,0,3,4,2 row-size=60B cardinality=19358
 |
 |--02:SCAN HDFS [tpcds.customer]
@@ -252,7 +252,7 @@ PLAN-ROOT SINK
 |  hash predicates: sr_returned_date_sk = d2.d_date_sk
 |  fk/pk conjuncts: sr_returned_date_sk = d2.d_date_sk
 |  runtime filters: RF001 <- d2.d_date_sk
-|  mem-estimate=313.88KB mem-reservation=136.00MB
+|  mem-estimate=313.88KB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=1,0,3,4 row-size=56B cardinality=8131
 |
 |--04:SCAN HDFS [tpcds.date_dim d2]
@@ -267,14 +267,14 @@ PLAN-ROOT SINK
 |  hash predicates: sr_item_sk = ss_item_sk, sr_ticket_number = ss_ticket_number
 |  fk/pk conjuncts: sr_item_sk = ss_item_sk, sr_ticket_number = ss_ticket_number
 |  runtime filters: RF002 <- ss_item_sk, RF003 <- ss_ticket_number
-|  mem-estimate=380.02KB mem-reservation=136.00MB
+|  mem-estimate=380.02KB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=1,0,3 row-size=52B cardinality=8131
 |
 |--05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: ss_sold_date_sk = d1.d_date_sk
 |  |  fk/pk conjuncts: ss_sold_date_sk = d1.d_date_sk
 |  |  runtime filters: RF004 <- d1.d_date_sk
-|  |  mem-estimate=62B mem-reservation=136.00MB
+|  |  mem-estimate=62B mem-reservation=1.06MB spill-buffer=64.00KB
 |  |  tuple-ids=0,3 row-size=32B cardinality=11055
 |  |
 |  |--03:SCAN HDFS [tpcds.date_dim d1]
@@ -311,7 +311,7 @@ tpcds.store_sales inner join tpcds.customer
 on ss_customer_sk % 10 = c_customer_sk / 100
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=202.79MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=202.79MB mem-reservation=34.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -319,7 +319,7 @@ PLAN-ROOT SINK
 |  hash predicates: ss_customer_sk % 10 = c_customer_sk / 100
 |  fk/pk conjuncts: assumed fk/pk
 |  runtime filters: RF000 <- c_customer_sk / 100
-|  mem-estimate=26.79MB mem-reservation=136.00MB
+|  mem-estimate=26.79MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=355B cardinality=2880404
 |
 |--01:SCAN HDFS [tpcds.customer]
@@ -346,7 +346,7 @@ tpcds.store_sales inner join tpcds_seq_snap.customer
 on ss_customer_sk = c_customer_sk
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=2.17GB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=2.17GB mem-reservation=34.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -354,7 +354,7 @@ PLAN-ROOT SINK
 |  hash predicates: ss_customer_sk = c_customer_sk
 |  fk/pk conjuncts: assumed fk/pk
 |  runtime filters: RF000 <- c_customer_sk
-|  mem-estimate=2.00GB mem-reservation=136.00MB
+|  mem-estimate=2.00GB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=8B cardinality=2880404
 |
 |--01:SCAN HDFS [tpcds_seq_snap.customer]
@@ -380,7 +380,7 @@ tpcds_seq_snap.store_sales inner join tpcds.customer
 on ss_customer_sk = c_customer_sk
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=176.42MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=176.42MB mem-reservation=1.06MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -388,7 +388,7 @@ PLAN-ROOT SINK
 |  hash predicates: ss_customer_sk = c_customer_sk
 |  fk/pk conjuncts: assumed fk/pk
 |  runtime filters: RF000 <- c_customer_sk
-|  mem-estimate=429.69KB mem-reservation=136.00MB
+|  mem-estimate=429.69KB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=0,1 row-size=8B cardinality=unavailable
 |
 |--01:SCAN HDFS [tpcds.customer]
@@ -416,7 +416,7 @@ tpcds.store_sales inner join
 on ss_sold_time_sk = ws_sold_time_sk
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=298.00MB mem-reservation=400.00MB
+|  Per-Host Resources: mem-estimate=298.00MB mem-reservation=2.12MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -424,12 +424,12 @@ PLAN-ROOT SINK
 |  hash predicates: ss_sold_time_sk = ws_sold_time_sk
 |  fk/pk conjuncts: none
 |  runtime filters: RF000 <- ws_sold_time_sk
-|  mem-estimate=170.89KB mem-reservation=136.00MB
+|  mem-estimate=170.89KB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=0,2 row-size=104B cardinality=2440073
 |
 |--02:AGGREGATE [FINALIZE]
 |  |  group by: ws_sold_time_sk
-|  |  mem-estimate=10.00MB mem-reservation=264.00MB
+|  |  mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
 |  |  tuple-ids=2 row-size=4B cardinality=39771
 |  |
 |  01:SCAN HDFS [tpcds.web_sales]

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
index f22e359..8bd09be 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
@@ -40,7 +40,7 @@ order by cnt, bigint_col
 limit 10
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=264.00MB
+|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=34.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -52,7 +52,7 @@ PLAN-ROOT SINK
 01:AGGREGATE [FINALIZE]
 |  output: count(int_col)
 |  group by: bigint_col
-|  mem-estimate=128.00MB mem-reservation=264.00MB
+|  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=16B cardinality=unavailable
 |
 00:SCAN HDFS [functional_parquet.alltypes]
@@ -78,7 +78,7 @@ PLAN-ROOT SINK
 |  tuple-ids=2 row-size=16B cardinality=10
 |
 F01:PLAN FRAGMENT [HASH(bigint_col)] hosts=3 instances=9
-Per-Host Resources: mem-estimate=384.00MB mem-reservation=792.00MB
+Per-Host Resources: mem-estimate=384.00MB mem-reservation=102.00MB
 02:TOP-N [LIMIT=10]
 |  order by: count(int_col) ASC, bigint_col ASC
 |  mem-estimate=160B mem-reservation=0B
@@ -87,7 +87,7 @@ Per-Host Resources: mem-estimate=384.00MB mem-reservation=792.00MB
 04:AGGREGATE [FINALIZE]
 |  output: count:merge(int_col)
 |  group by: bigint_col
-|  mem-estimate=128.00MB mem-reservation=264.00MB
+|  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=16B cardinality=unavailable
 |
 03:EXCHANGE [HASH(bigint_col)]
@@ -99,7 +99,7 @@ Per-Host Resources: mem-estimate=432.00MB mem-reservation=0B
 01:AGGREGATE [STREAMING]
 |  output: count(int_col)
 |  group by: bigint_col
-|  mem-estimate=128.00MB mem-reservation=0B
+|  mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=16B cardinality=unavailable
 |
 00:SCAN HDFS [functional_parquet.alltypes, RANDOM]
@@ -119,7 +119,7 @@ from functional_parquet.alltypes
 where id < 10
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=16.00MB mem-reservation=40.00MB
+|  Per-Host Resources: mem-estimate=16.00MB mem-reservation=10.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -128,12 +128,12 @@ PLAN-ROOT SINK
 |  partition by: int_col
 |  order by: id ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=4,3 row-size=16B cardinality=unavailable
 |
 01:SORT
 |  order by: int_col ASC NULLS FIRST, id ASC
-|  mem-estimate=0B mem-reservation=24.00MB
+|  mem-estimate=0B mem-reservation=6.00MB spill-buffer=2.00MB
 |  tuple-ids=4 row-size=8B cardinality=unavailable
 |
 00:SCAN HDFS [functional_parquet.alltypes]
@@ -157,18 +157,18 @@ PLAN-ROOT SINK
 |  tuple-ids=4,3 row-size=16B cardinality=unavailable
 |
 F01:PLAN FRAGMENT [HASH(int_col)] hosts=3 instances=9
-Per-Host Resources: mem-estimate=0B mem-reservation=120.00MB
+Per-Host Resources: mem-estimate=0B mem-reservation=30.00MB
 02:ANALYTIC
 |  functions: row_number()
 |  partition by: int_col
 |  order by: id ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=4,3 row-size=16B cardinality=unavailable
 |
 01:SORT
 |  order by: int_col ASC NULLS FIRST, id ASC
-|  mem-estimate=0B mem-reservation=24.00MB
+|  mem-estimate=0B mem-reservation=6.00MB spill-buffer=2.00MB
 |  tuple-ids=4 row-size=8B cardinality=unavailable
 |
 03:EXCHANGE [HASH(int_col)]
@@ -313,7 +313,7 @@ from tpch_nested_parquet.customer c, c.c_orders o1, c.c_orders o2
 where o1.o_orderkey = o2.o_orderkey + 2 and o1.o_orderkey < 5
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=88.00MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=88.00MB mem-reservation=1.06MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -324,7 +324,7 @@ PLAN-ROOT SINK
 |--06:HASH JOIN [INNER JOIN]
 |  |  hash predicates: o1.o_orderkey = o2.o_orderkey + 2
 |  |  fk/pk conjuncts: assumed fk/pk
-|  |  mem-estimate=0B mem-reservation=136.00MB
+|  |  mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
 |  |  tuple-ids=1,0,2 row-size=286B cardinality=10
 |  |
 |  |--04:UNNEST [c.c_orders o2]
@@ -366,7 +366,7 @@ PLAN-ROOT SINK
 |  tuple-ids=1,0,2 row-size=286B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=9
-Per-Host Resources: mem-estimate=264.00MB mem-reservation=408.00MB
+Per-Host Resources: mem-estimate=264.00MB mem-reservation=3.19MB
 01:SUBPLAN
 |  mem-estimate=0B mem-reservation=0B
 |  tuple-ids=1,0,2 row-size=286B cardinality=1500000
@@ -374,7 +374,7 @@ Per-Host Resources: mem-estimate=264.00MB mem-reservation=408.00MB
 |--06:HASH JOIN [INNER JOIN]
 |  |  hash predicates: o1.o_orderkey = o2.o_orderkey + 2
 |  |  fk/pk conjuncts: assumed fk/pk
-|  |  mem-estimate=0B mem-reservation=136.00MB
+|  |  mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
 |  |  tuple-ids=1,0,2 row-size=286B cardinality=10
 |  |
 |  |--04:UNNEST [c.c_orders o2]

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test b/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
index 0de7109..4165e70 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
@@ -14,7 +14,7 @@ PLAN-ROOT SINK
 |
 01:AGGREGATE [FINALIZE]
 |  output: count(*)
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=8B cardinality=1
 |
 00:SCAN HDFS [functional_parquet.alltypes]
@@ -44,7 +44,7 @@ PLAN-ROOT SINK
 |
 01:AGGREGATE [FINALIZE]
 |  output: count(*)
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=8B cardinality=1
 |
 00:SCAN HDFS [functional_parquet.alltypes]
@@ -75,7 +75,7 @@ PLAN-ROOT SINK
 |
 01:AGGREGATE [FINALIZE]
 |  output: count(*)
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=8B cardinality=1
 |
 00:SCAN HDFS [functional_parquet.alltypes]

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
index f3dd19a..90a318e 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
@@ -353,18 +353,18 @@ select l_orderkey, count(*)
 from tpch_parquet.lineitem
 group by l_orderkey
 ---- PLAN
-Per-Host Resource Reservation: Memory=264.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=106.24MB
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=106.24MB mem-reservation=264.00MB
+|  Per-Host Resources: mem-estimate=106.24MB mem-reservation=34.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 01:AGGREGATE [FINALIZE]
 |  output: count(*)
 |  group by: l_orderkey
-|  mem-estimate=26.24MB mem-reservation=264.00MB
+|  mem-estimate=26.24MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=16B cardinality=1563438
 |
 00:SCAN HDFS [tpch_parquet.lineitem]
@@ -375,7 +375,7 @@ PLAN-ROOT SINK
    mem-estimate=80.00MB mem-reservation=0B
    tuple-ids=0 row-size=8B cardinality=6001215
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=264.00MB
+Per-Host Resource Reservation: Memory=8.50MB
 Per-Host Resource Estimates: Memory=116.24MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -388,11 +388,11 @@ PLAN-ROOT SINK
 |  tuple-ids=1 row-size=16B cardinality=1563438
 |
 F01:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=10.00MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=10.00MB mem-reservation=8.50MB
 03:AGGREGATE [FINALIZE]
 |  output: count:merge(*)
 |  group by: l_orderkey
-|  mem-estimate=10.00MB mem-reservation=264.00MB
+|  mem-estimate=10.00MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=1 row-size=16B cardinality=1563438
 |
 02:EXCHANGE [HASH(l_orderkey)]
@@ -404,7 +404,7 @@ Per-Host Resources: mem-estimate=106.24MB mem-reservation=0B
 01:AGGREGATE [STREAMING]
 |  output: count(*)
 |  group by: l_orderkey
-|  mem-estimate=26.24MB mem-reservation=0B
+|  mem-estimate=26.24MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=16B cardinality=1563438
 |
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -415,7 +415,7 @@ Per-Host Resources: mem-estimate=106.24MB mem-reservation=0B
    mem-estimate=80.00MB mem-reservation=0B
    tuple-ids=0 row-size=8B cardinality=6001215
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=528.00MB
+Per-Host Resource Reservation: Memory=8.50MB
 Per-Host Resource Estimates: Memory=232.48MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -428,11 +428,11 @@ PLAN-ROOT SINK
 |  tuple-ids=1 row-size=16B cardinality=1563438
 |
 F01:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=20.00MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=20.00MB mem-reservation=8.50MB
 03:AGGREGATE [FINALIZE]
 |  output: count:merge(*)
 |  group by: l_orderkey
-|  mem-estimate=10.00MB mem-reservation=264.00MB
+|  mem-estimate=10.00MB mem-reservation=4.25MB spill-buffer=256.00KB
 |  tuple-ids=1 row-size=16B cardinality=1563438
 |
 02:EXCHANGE [HASH(l_orderkey)]
@@ -444,7 +444,7 @@ Per-Host Resources: mem-estimate=212.48MB mem-reservation=0B
 01:AGGREGATE [STREAMING]
 |  output: count(*)
 |  group by: l_orderkey
-|  mem-estimate=26.24MB mem-reservation=0B
+|  mem-estimate=26.24MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=16B cardinality=1563438
 |
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -468,7 +468,7 @@ PLAN-ROOT SINK
 |
 01:AGGREGATE [FINALIZE]
 |  output: sum_init_zero(tpch_parquet.lineitem.parquet-stats: num_rows)
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=8B cardinality=1
 |
 00:SCAN HDFS [tpch_parquet.lineitem]
@@ -489,7 +489,7 @@ PLAN-ROOT SINK
 |
 03:AGGREGATE [FINALIZE]
 |  output: count:merge(*)
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=8B cardinality=1
 |
 02:EXCHANGE [UNPARTITIONED]
@@ -500,7 +500,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
 Per-Host Resources: mem-estimate=90.00MB mem-reservation=0B
 01:AGGREGATE
 |  output: sum_init_zero(tpch_parquet.lineitem.parquet-stats: num_rows)
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=8B cardinality=1
 |
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -521,7 +521,7 @@ PLAN-ROOT SINK
 |
 03:AGGREGATE [FINALIZE]
 |  output: count:merge(*)
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=8B cardinality=1
 |
 02:EXCHANGE [UNPARTITIONED]
@@ -532,7 +532,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
 Per-Host Resources: mem-estimate=180.00MB mem-reservation=0B
 01:AGGREGATE
 |  output: sum_init_zero(tpch_parquet.lineitem.parquet-stats: num_rows)
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=1 row-size=8B cardinality=1
 |
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -548,17 +548,17 @@ select *
 from tpch_parquet.lineitem
 order by l_comment
 ---- PLAN
-Per-Host Resource Reservation: Memory=48.00MB
-Per-Host Resource Estimates: Memory=240.00MB
+Per-Host Resource Reservation: Memory=12.00MB
+Per-Host Resource Estimates: Memory=120.00MB
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=240.00MB mem-reservation=48.00MB
+|  Per-Host Resources: mem-estimate=120.00MB mem-reservation=12.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 01:SORT
 |  order by: l_comment ASC
-|  mem-estimate=160.00MB mem-reservation=48.00MB
+|  mem-estimate=40.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 00:SCAN HDFS [tpch_parquet.lineitem]
@@ -569,8 +569,8 @@ PLAN-ROOT SINK
    mem-estimate=80.00MB mem-reservation=0B
    tuple-ids=0 row-size=263B cardinality=6001215
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=48.00MB
-Per-Host Resource Estimates: Memory=240.00MB
+Per-Host Resource Reservation: Memory=12.00MB
+Per-Host Resource Estimates: Memory=120.00MB
 
 F01:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
 |  Per-Host Resources: mem-estimate=0B mem-reservation=0B
@@ -583,10 +583,10 @@ PLAN-ROOT SINK
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=240.00MB mem-reservation=48.00MB
+Per-Host Resources: mem-estimate=120.00MB mem-reservation=12.00MB
 01:SORT
 |  order by: l_comment ASC
-|  mem-estimate=160.00MB mem-reservation=48.00MB
+|  mem-estimate=40.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -597,8 +597,8 @@ Per-Host Resources: mem-estimate=240.00MB mem-reservation=48.00MB
    mem-estimate=80.00MB mem-reservation=0B
    tuple-ids=0 row-size=263B cardinality=6001215
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=96.00MB
-Per-Host Resource Estimates: Memory=480.00MB
+Per-Host Resource Reservation: Memory=24.00MB
+Per-Host Resource Estimates: Memory=240.00MB
 
 F01:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
 |  Per-Host Resources: mem-estimate=0B mem-reservation=0B
@@ -611,10 +611,10 @@ PLAN-ROOT SINK
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=480.00MB mem-reservation=96.00MB
+Per-Host Resources: mem-estimate=240.00MB mem-reservation=24.00MB
 01:SORT
 |  order by: l_comment ASC
-|  mem-estimate=160.00MB mem-reservation=48.00MB
+|  mem-estimate=40.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=263B cardinality=6001215
 |
 00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -714,11 +714,11 @@ Per-Host Resources: mem-estimate=160.05MB mem-reservation=0B
 select *
 from tpch.lineitem inner join tpch.orders on l_orderkey = o_orderkey
 ---- PLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=476.41MB
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=476.41MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=476.41MB mem-reservation=34.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -726,7 +726,7 @@ PLAN-ROOT SINK
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=300.41MB mem-reservation=136.00MB
+|  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 |--01:SCAN HDFS [tpch.orders]
@@ -746,7 +746,7 @@ PLAN-ROOT SINK
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=263B cardinality=6001215
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=476.41MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -759,12 +759,12 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=388.41MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=388.41MB mem-reservation=34.00MB
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=300.41MB mem-reservation=136.00MB
+|  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 |--03:EXCHANGE [BROADCAST]
@@ -790,7 +790,7 @@ Per-Host Resources: mem-estimate=388.41MB mem-reservation=136.00MB
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=263B cardinality=6001215
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
 Per-Host Resource Estimates: Memory=952.83MB
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -803,13 +803,13 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=776.83MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=776.83MB mem-reservation=68.00MB
 02:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash-table-id=00
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=300.41MB mem-reservation=136.00MB
+|  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 |--F03:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -846,11 +846,11 @@ Per-Host Resources: mem-estimate=776.83MB mem-reservation=272.00MB
 select *
 from tpch.lineitem inner join /* +shuffle */ tpch.orders on l_orderkey = o_orderkey
 ---- PLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=476.41MB
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=476.41MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=476.41MB mem-reservation=34.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -858,7 +858,7 @@ PLAN-ROOT SINK
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=300.41MB mem-reservation=136.00MB
+|  mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 |--01:SCAN HDFS [tpch.orders]
@@ -878,7 +878,7 @@ PLAN-ROOT SINK
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=263B cardinality=6001215
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
 Per-Host Resource Estimates: Memory=276.14MB
 
 F03:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -891,12 +891,12 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=100.14MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=100.14MB mem-reservation=34.00MB
 02:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=100.14MB mem-reservation=136.00MB
+|  mem-estimate=100.14MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 |--04:EXCHANGE [HASH(o_orderkey)]
@@ -928,7 +928,7 @@ Per-Host Resources: mem-estimate=88.00MB mem-reservation=0B
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=263B cardinality=6001215
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
 Per-Host Resource Estimates: Memory=452.14MB
 
 F03:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -941,13 +941,13 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=100.14MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=100.14MB mem-reservation=68.00MB
 02:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash-table-id=00
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=50.07MB mem-reservation=136.00MB
+|  mem-estimate=50.07MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1 row-size=454B cardinality=5757710
 |
 |--F04:PLAN FRAGMENT [HASH(l_orderkey)] hosts=2 instances=4
@@ -1151,24 +1151,24 @@ PLAN-ROOT SINK
 select max(tinyint_col) over(partition by int_col)
 from functional.alltypes
 ---- PLAN
-Per-Host Resource Reservation: Memory=40.00MB
-Per-Host Resource Estimates: Memory=24.00MB
+Per-Host Resource Reservation: Memory=10.00MB
+Per-Host Resource Estimates: Memory=18.00MB
 Codegen disabled by planner
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=24.00MB mem-reservation=40.00MB
+|  Per-Host Resources: mem-estimate=18.00MB mem-reservation=10.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 02:ANALYTIC
 |  functions: max(tinyint_col)
 |  partition by: int_col
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=3,2 row-size=6B cardinality=7300
 |
 01:SORT
 |  order by: int_col ASC NULLS FIRST
-|  mem-estimate=8.00MB mem-reservation=24.00MB
+|  mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
 |  tuple-ids=3 row-size=5B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -1179,8 +1179,8 @@ PLAN-ROOT SINK
    mem-estimate=16.00MB mem-reservation=0B
    tuple-ids=0 row-size=5B cardinality=7300
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=40.00MB
-Per-Host Resource Estimates: Memory=24.00MB
+Per-Host Resource Reservation: Memory=10.00MB
+Per-Host Resource Estimates: Memory=18.00MB
 Codegen disabled by planner
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1193,16 +1193,16 @@ PLAN-ROOT SINK
 |  tuple-ids=3,2 row-size=6B cardinality=7300
 |
 F01:PLAN FRAGMENT [HASH(int_col)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=8.00MB mem-reservation=40.00MB
+Per-Host Resources: mem-estimate=2.00MB mem-reservation=10.00MB
 02:ANALYTIC
 |  functions: max(tinyint_col)
 |  partition by: int_col
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=3,2 row-size=6B cardinality=7300
 |
 01:SORT
 |  order by: int_col ASC NULLS FIRST
-|  mem-estimate=8.00MB mem-reservation=24.00MB
+|  mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
 |  tuple-ids=3 row-size=5B cardinality=7300
 |
 03:EXCHANGE [HASH(int_col)]
@@ -1219,8 +1219,8 @@ Per-Host Resources: mem-estimate=16.00MB mem-reservation=0B
    mem-estimate=16.00MB mem-reservation=0B
    tuple-ids=0 row-size=5B cardinality=7300
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=80.00MB
-Per-Host Resource Estimates: Memory=48.00MB
+Per-Host Resource Reservation: Memory=20.00MB
+Per-Host Resource Estimates: Memory=36.00MB
 Codegen disabled by planner
 
 F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1233,16 +1233,16 @@ PLAN-ROOT SINK
 |  tuple-ids=3,2 row-size=6B cardinality=7300
 |
 F01:PLAN FRAGMENT [HASH(int_col)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=16.00MB mem-reservation=80.00MB
+Per-Host Resources: mem-estimate=4.00MB mem-reservation=20.00MB
 02:ANALYTIC
 |  functions: max(tinyint_col)
 |  partition by: int_col
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=3,2 row-size=6B cardinality=7300
 |
 01:SORT
 |  order by: int_col ASC NULLS FIRST
-|  mem-estimate=8.00MB mem-reservation=24.00MB
+|  mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
 |  tuple-ids=3 row-size=5B cardinality=7300
 |
 03:EXCHANGE [HASH(int_col)]
@@ -1266,11 +1266,11 @@ select *, row_number() over (order by o_totalprice) rnum_price,
   row_number() over (order by o_orderpriority) rnum_priority
 from tpch_parquet.orders
 ---- PLAN
-Per-Host Resource Reservation: Memory=144.00MB
-Per-Host Resource Estimates: Memory=160.00MB
+Per-Host Resource Reservation: Memory=36.00MB
+Per-Host Resource Estimates: Memory=58.00MB
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=160.00MB mem-reservation=144.00MB
+|  Per-Host Resources: mem-estimate=58.00MB mem-reservation=36.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -1278,36 +1278,36 @@ PLAN-ROOT SINK
 |  functions: row_number()
 |  order by: o_orderpriority ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=10,5 row-size=215B cardinality=1500000
 |
 05:SORT
 |  order by: o_orderpriority ASC
-|  mem-estimate=80.00MB mem-reservation=48.00MB
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=10 row-size=207B cardinality=1500000
 |
 04:ANALYTIC
 |  functions: row_number()
 |  order by: o_orderdate ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=8,4 row-size=207B cardinality=1500000
 |
 03:SORT
 |  order by: o_orderdate ASC
-|  mem-estimate=80.00MB mem-reservation=48.00MB
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=8 row-size=199B cardinality=1500000
 |
 02:ANALYTIC
 |  functions: row_number()
 |  order by: o_totalprice ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=6,3 row-size=199B cardinality=1500000
 |
 01:SORT
 |  order by: o_totalprice ASC
-|  mem-estimate=80.00MB mem-reservation=48.00MB
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=6 row-size=191B cardinality=1500000
 |
 00:SCAN HDFS [tpch_parquet.orders]
@@ -1318,11 +1318,11 @@ PLAN-ROOT SINK
    mem-estimate=40.00MB mem-reservation=0B
    tuple-ids=0 row-size=191B cardinality=1500000
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=176.00MB
-Per-Host Resource Estimates: Memory=280.00MB
+Per-Host Resource Reservation: Memory=44.00MB
+Per-Host Resource Estimates: Memory=94.00MB
 
 F01:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=160.00MB mem-reservation=128.00MB
+|  Per-Host Resources: mem-estimate=36.00MB mem-reservation=32.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -1330,31 +1330,31 @@ PLAN-ROOT SINK
 |  functions: row_number()
 |  order by: o_orderpriority ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=10,5 row-size=215B cardinality=1500000
 |
 05:SORT
 |  order by: o_orderpriority ASC
-|  mem-estimate=80.00MB mem-reservation=48.00MB
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=10 row-size=207B cardinality=1500000
 |
 04:ANALYTIC
 |  functions: row_number()
 |  order by: o_orderdate ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=8,4 row-size=207B cardinality=1500000
 |
 03:SORT
 |  order by: o_orderdate ASC
-|  mem-estimate=80.00MB mem-reservation=48.00MB
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=8 row-size=199B cardinality=1500000
 |
 02:ANALYTIC
 |  functions: row_number()
 |  order by: o_totalprice ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=6,3 row-size=199B cardinality=1500000
 |
 07:MERGING-EXCHANGE [UNPARTITIONED]
@@ -1363,10 +1363,10 @@ PLAN-ROOT SINK
 |  tuple-ids=6 row-size=191B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=2
-Per-Host Resources: mem-estimate=120.00MB mem-reservation=48.00MB
+Per-Host Resources: mem-estimate=58.00MB mem-reservation=12.00MB
 01:SORT
 |  order by: o_totalprice ASC
-|  mem-estimate=80.00MB mem-reservation=48.00MB
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=6 row-size=191B cardinality=1500000
 |
 00:SCAN HDFS [tpch_parquet.orders, RANDOM]
@@ -1377,11 +1377,11 @@ Per-Host Resources: mem-estimate=120.00MB mem-reservation=48.00MB
    mem-estimate=40.00MB mem-reservation=0B
    tuple-ids=0 row-size=191B cardinality=1500000
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=224.00MB
-Per-Host Resource Estimates: Memory=400.00MB
+Per-Host Resource Reservation: Memory=56.00MB
+Per-Host Resource Estimates: Memory=152.00MB
 
 F01:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=160.00MB mem-reservation=128.00MB
+|  Per-Host Resources: mem-estimate=36.00MB mem-reservation=32.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -1389,31 +1389,31 @@ PLAN-ROOT SINK
 |  functions: row_number()
 |  order by: o_orderpriority ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=10,5 row-size=215B cardinality=1500000
 |
 05:SORT
 |  order by: o_orderpriority ASC
-|  mem-estimate=80.00MB mem-reservation=48.00MB
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=10 row-size=207B cardinality=1500000
 |
 04:ANALYTIC
 |  functions: row_number()
 |  order by: o_orderdate ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=8,4 row-size=207B cardinality=1500000
 |
 03:SORT
 |  order by: o_orderdate ASC
-|  mem-estimate=80.00MB mem-reservation=48.00MB
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=8 row-size=199B cardinality=1500000
 |
 02:ANALYTIC
 |  functions: row_number()
 |  order by: o_totalprice ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=6,3 row-size=199B cardinality=1500000
 |
 07:MERGING-EXCHANGE [UNPARTITIONED]
@@ -1422,10 +1422,10 @@ PLAN-ROOT SINK
 |  tuple-ids=6 row-size=191B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
-Per-Host Resources: mem-estimate=240.00MB mem-reservation=96.00MB
+Per-Host Resources: mem-estimate=116.00MB mem-reservation=24.00MB
 01:SORT
 |  order by: o_totalprice ASC
-|  mem-estimate=80.00MB mem-reservation=48.00MB
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=6 row-size=191B cardinality=1500000
 |
 00:SCAN HDFS [tpch_parquet.orders, RANDOM]
@@ -1449,11 +1449,11 @@ select l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
 from tpch_parquet.lineitem join tpch_parquet.orders on l_orderkey = o_orderkey
 where l_shipmode = 'F'
 ---- PLAN
-Per-Host Resource Reservation: Memory=400.00MB
+Per-Host Resource Reservation: Memory=51.00MB
 Per-Host Resource Estimates: Memory=135.17MB
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=135.17MB mem-reservation=400.00MB
+|  Per-Host Resources: mem-estimate=135.17MB mem-reservation=51.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -1466,7 +1466,7 @@ PLAN-ROOT SINK
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  |  runtime filters: RF002 <- o_orderkey
-|  |  mem-estimate=12.59MB mem-reservation=136.00MB
+|  |  mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=5,6 row-size=99B cardinality=822530
 |  |
 |  |--09:SCAN HDFS [tpch_parquet.orders]
@@ -1493,7 +1493,7 @@ PLAN-ROOT SINK
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  |  runtime filters: RF001 <- o_orderkey
-|  |  mem-estimate=10.20MB mem-reservation=136.00MB
+|  |  mem-estimate=10.20MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=3,4 row-size=103B cardinality=1151542
 |  |
 |  |--06:SCAN HDFS [tpch_parquet.orders]
@@ -1518,14 +1518,14 @@ PLAN-ROOT SINK
 |
 04:AGGREGATE [FINALIZE]
 |  group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-|  mem-estimate=42.58MB mem-reservation=264.00MB
+|  mem-estimate=42.58MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=2 row-size=70B cardinality=575772
 |
 03:HASH JOIN [INNER JOIN]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=12.59MB mem-reservation=136.00MB
+|  mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=0,1 row-size=86B cardinality=575772
 |
 |--02:SCAN HDFS [tpch_parquet.orders]
@@ -1548,7 +1548,7 @@ PLAN-ROOT SINK
    mem-estimate=80.00MB mem-reservation=0B
    tuple-ids=0 row-size=78B cardinality=600122
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=400.00MB
+Per-Host Resource Reservation: Memory=38.25MB
 Per-Host Resource Estimates: Memory=339.36MB
 
 F09:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1561,7 +1561,7 @@ PLAN-ROOT SINK
 |  tuple-ids=7 row-size=70B cardinality=2549844
 |
 F08:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=92.59MB mem-reservation=34.00MB
 00:UNION
 |  pass-through-operands: 14
 |  mem-estimate=0B mem-reservation=0B
@@ -1571,7 +1571,7 @@ Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  |  runtime filters: RF002 <- o_orderkey
-|  |  mem-estimate=12.59MB mem-reservation=136.00MB
+|  |  mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=5,6 row-size=99B cardinality=822530
 |  |
 |  |--16:EXCHANGE [BROADCAST]
@@ -1604,7 +1604,7 @@ Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  |  runtime filters: RF001 <- o_orderkey
-|  |  mem-estimate=10.20MB mem-reservation=136.00MB
+|  |  mem-estimate=10.20MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=3,4 row-size=103B cardinality=1151542
 |  |
 |  |--15:EXCHANGE [BROADCAST]
@@ -1635,7 +1635,7 @@ Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
 |
 14:AGGREGATE [FINALIZE]
 |  group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-|  mem-estimate=42.58MB mem-reservation=264.00MB
+|  mem-estimate=42.58MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=2 row-size=70B cardinality=575772
 |
 13:EXCHANGE [HASH(l_orderkey,l_partkey,l_suppkey,l_linenumber,l_comment)]
@@ -1643,17 +1643,17 @@ Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
 |  tuple-ids=2 row-size=70B cardinality=575772
 |
 F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=46.78MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=46.78MB mem-reservation=4.25MB
 04:AGGREGATE [STREAMING]
 |  group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-|  mem-estimate=42.58MB mem-reservation=0B
+|  mem-estimate=42.58MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=2 row-size=70B cardinality=575772
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=4.20MB mem-reservation=136.00MB
+|  mem-estimate=4.20MB mem-reservation=4.25MB spill-buffer=256.00KB
 |  tuple-ids=0,1 row-size=86B cardinality=575772
 |
 |--12:EXCHANGE [HASH(o_orderkey)]
@@ -1688,7 +1688,7 @@ Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
    mem-estimate=80.00MB mem-reservation=0B
    tuple-ids=0 row-size=78B cardinality=600122
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=800.00MB
+Per-Host Resource Reservation: Memory=72.25MB
 Per-Host Resource Estimates: Memory=674.53MB
 
 F09:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1701,7 +1701,7 @@ PLAN-ROOT SINK
 |  tuple-ids=7 row-size=70B cardinality=2549844
 |
 F08:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=185.18MB mem-reservation=68.00MB
 00:UNION
 |  pass-through-operands: 14
 |  mem-estimate=0B mem-reservation=0B
@@ -1712,7 +1712,7 @@ Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  |  runtime filters: RF002 <- o_orderkey
-|  |  mem-estimate=12.59MB mem-reservation=136.00MB
+|  |  mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=5,6 row-size=99B cardinality=822530
 |  |
 |  |--F11:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -1753,7 +1753,7 @@ Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
 |  |  hash predicates: l_orderkey = o_orderkey
 |  |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  |  runtime filters: RF001 <- o_orderkey
-|  |  mem-estimate=10.20MB mem-reservation=136.00MB
+|  |  mem-estimate=10.20MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=3,4 row-size=103B cardinality=1151542
 |  |
 |  |--F10:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -1791,7 +1791,7 @@ Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
 |
 14:AGGREGATE [FINALIZE]
 |  group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-|  mem-estimate=42.58MB mem-reservation=264.00MB
+|  mem-estimate=42.58MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=2 row-size=70B cardinality=575772
 |
 13:EXCHANGE [HASH(l_orderkey,l_partkey,l_suppkey,l_linenumber,l_comment)]
@@ -1799,10 +1799,10 @@ Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
 |  tuple-ids=2 row-size=70B cardinality=575772
 |
 F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=89.35MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=89.35MB mem-reservation=4.25MB
 04:AGGREGATE [STREAMING]
 |  group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-|  mem-estimate=42.58MB mem-reservation=0B
+|  mem-estimate=42.58MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=2 row-size=70B cardinality=575772
 |
 03:HASH JOIN [INNER JOIN, PARTITIONED]
@@ -1810,7 +1810,7 @@ Per-Host Resources: mem-estimate=89.35MB mem-reservation=272.00MB
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF000 <- o_orderkey
-|  mem-estimate=2.10MB mem-reservation=136.00MB
+|  mem-estimate=2.10MB mem-reservation=2.12MB spill-buffer=128.00KB
 |  tuple-ids=0,1 row-size=86B cardinality=575772
 |
 |--F12:PLAN FRAGMENT [HASH(l_orderkey)] hosts=2 instances=4
@@ -1888,11 +1888,11 @@ order by
   o_orderdate
 limit 100
 ---- PLAN
-Per-Host Resource Reservation: Memory=672.00MB
+Per-Host Resource Reservation: Memory=80.75MB
 Per-Host Resource Estimates: Memory=391.29MB
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=391.29MB mem-reservation=672.00MB
+|  Per-Host Resources: mem-estimate=391.29MB mem-reservation=80.75MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -1904,20 +1904,20 @@ PLAN-ROOT SINK
 08:AGGREGATE [FINALIZE]
 |  output: sum(l_quantity)
 |  group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-|  mem-estimate=60.40MB mem-reservation=264.00MB
+|  mem-estimate=60.40MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=6 row-size=100B cardinality=575772
 |
 07:HASH JOIN [LEFT SEMI JOIN]
 |  hash predicates: o_orderkey = l_orderkey
 |  runtime filters: RF000 <- l_orderkey
-|  mem-estimate=3.94MB mem-reservation=136.00MB
+|  mem-estimate=3.94MB mem-reservation=4.25MB spill-buffer=256.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=575772
 |
 |--04:AGGREGATE [FINALIZE]
 |  |  output: sum(l_quantity)
 |  |  group by: l_orderkey
 |  |  having: sum(l_quantity) > 300
-|  |  mem-estimate=10.00MB mem-reservation=264.00MB
+|  |  mem-estimate=10.00MB mem-reservation=4.25MB spill-buffer=256.00KB
 |  |  tuple-ids=4 row-size=24B cardinality=156344
 |  |
 |  03:SCAN HDFS [tpch.lineitem]
@@ -1932,7 +1932,7 @@ PLAN-ROOT SINK
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
 |  runtime filters: RF001 <- c_custkey
-|  mem-estimate=6.61MB mem-reservation=136.00MB
+|  mem-estimate=6.61MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=5757710
 |
 |--00:SCAN HDFS [tpch.customer]
@@ -1947,7 +1947,7 @@ PLAN-ROOT SINK
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF002 <- o_orderkey
-|  mem-estimate=78.68MB mem-reservation=136.00MB
+|  mem-estimate=78.68MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=2,1 row-size=66B cardinality=5757710
 |
 |--01:SCAN HDFS [tpch.orders]
@@ -1968,7 +1968,7 @@ PLAN-ROOT SINK
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=2 row-size=16B cardinality=6001215
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=936.00MB
+Per-Host Resource Reservation: Memory=82.88MB
 Per-Host Resource Estimates: Memory=500.32MB
 
 F07:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1983,7 +1983,7 @@ PLAN-ROOT SINK
 |  tuple-ids=7 row-size=100B cardinality=100
 |
 F06:PLAN FRAGMENT [HASH(c_name,c_custkey,o_orderkey,o_orderdate,o_totalprice)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=60.41MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=60.41MB mem-reservation=34.00MB
 09:TOP-N [LIMIT=100]
 |  order by: o_totalprice DESC, o_orderdate ASC
 |  mem-estimate=9.77KB mem-reservation=0B
@@ -1992,7 +1992,7 @@ Per-Host Resources: mem-estimate=60.41MB mem-reservation=264.00MB
 16:AGGREGATE [FINALIZE]
 |  output: sum:merge(l_quantity)
 |  group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-|  mem-estimate=60.40MB mem-reservation=264.00MB
+|  mem-estimate=60.40MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=6 row-size=100B cardinality=575772
 |
 15:EXCHANGE [HASH(c_name,c_custkey,o_orderkey,o_orderdate,o_totalprice)]
@@ -2000,24 +2000,24 @@ Per-Host Resources: mem-estimate=60.41MB mem-reservation=264.00MB
 |  tuple-ids=6 row-size=100B cardinality=575772
 |
 F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=104.55MB mem-reservation=672.00MB
+Per-Host Resources: mem-estimate=104.55MB mem-reservation=48.88MB
 08:AGGREGATE [STREAMING]
 |  output: sum(l_quantity)
 |  group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-|  mem-estimate=60.40MB mem-reservation=0B
+|  mem-estimate=60.40MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=6 row-size=100B cardinality=575772
 |
 07:HASH JOIN [LEFT SEMI JOIN, PARTITIONED]
 |  hash predicates: o_orderkey = l_orderkey
 |  runtime filters: RF000 <- l_orderkey
-|  mem-estimate=1.31MB mem-reservation=136.00MB
+|  mem-estimate=1.31MB mem-reservation=2.12MB spill-buffer=128.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=575772
 |
 |--14:AGGREGATE [FINALIZE]
 |  |  output: sum:merge(l_quantity)
 |  |  group by: l_orderkey
 |  |  having: sum(l_quantity) > 300
-|  |  mem-estimate=10.00MB mem-reservation=264.00MB
+|  |  mem-estimate=10.00MB mem-reservation=4.25MB spill-buffer=256.00KB
 |  |  tuple-ids=4 row-size=24B cardinality=156344
 |  |
 |  13:EXCHANGE [HASH(l_orderkey)]
@@ -2029,7 +2029,7 @@ Per-Host Resources: mem-estimate=104.55MB mem-reservation=672.00MB
 |  04:AGGREGATE [STREAMING]
 |  |  output: sum(l_quantity)
 |  |  group by: l_orderkey
-|  |  mem-estimate=39.36MB mem-reservation=0B
+|  |  mem-estimate=39.36MB mem-reservation=0B spill-buffer=2.00MB
 |  |  tuple-ids=4 row-size=24B cardinality=1563438
 |  |
 |  03:SCAN HDFS [tpch.lineitem, RANDOM]
@@ -2044,7 +2044,7 @@ Per-Host Resources: mem-estimate=104.55MB mem-reservation=672.00MB
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
 |  runtime filters: RF001 <- c_custkey
-|  mem-estimate=6.61MB mem-reservation=136.00MB
+|  mem-estimate=6.61MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=5757710
 |
 |--12:EXCHANGE [BROADCAST]
@@ -2065,7 +2065,7 @@ Per-Host Resources: mem-estimate=104.55MB mem-reservation=672.00MB
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF002 <- o_orderkey
-|  mem-estimate=26.23MB mem-reservation=136.00MB
+|  mem-estimate=26.23MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=2,1 row-size=66B cardinality=5757710
 |
 |--11:EXCHANGE [HASH(o_orderkey)]
@@ -2098,7 +2098,7 @@ Per-Host Resources: mem-estimate=88.00MB mem-reservation=0B
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=2 row-size=16B cardinality=6001215
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=1.31GB
+Per-Host Resource Reservation: Memory=121.12MB
 Per-Host Resource Estimates: Memory=953.10MB
 
 F07:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -2113,7 +2113,7 @@ PLAN-ROOT SINK
 |  tuple-ids=7 row-size=100B cardinality=100
 |
 F06:PLAN FRAGMENT [HASH(c_name,c_custkey,o_orderkey,o_orderdate,o_totalprice)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=120.82MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=120.82MB mem-reservation=68.00MB
 09:TOP-N [LIMIT=100]
 |  order by: o_totalprice DESC, o_orderdate ASC
 |  mem-estimate=9.77KB mem-reservation=0B
@@ -2122,7 +2122,7 @@ Per-Host Resources: mem-estimate=120.82MB mem-reservation=528.00MB
 16:AGGREGATE [FINALIZE]
 |  output: sum:merge(l_quantity)
 |  group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-|  mem-estimate=60.40MB mem-reservation=264.00MB
+|  mem-estimate=60.40MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=6 row-size=100B cardinality=575772
 |
 15:EXCHANGE [HASH(c_name,c_custkey,o_orderkey,o_orderdate,o_totalprice)]
@@ -2130,18 +2130,18 @@ Per-Host Resources: mem-estimate=120.82MB mem-reservation=528.00MB
 |  tuple-ids=6 row-size=100B cardinality=575772
 |
 F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
+Per-Host Resources: mem-estimate=161.56MB mem-reservation=53.12MB
 08:AGGREGATE [STREAMING]
 |  output: sum(l_quantity)
 |  group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-|  mem-estimate=60.40MB mem-reservation=0B
+|  mem-estimate=60.40MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=6 row-size=100B cardinality=575772
 |
 07:HASH JOIN [LEFT SEMI JOIN, PARTITIONED]
 |  hash-table-id=00
 |  hash predicates: o_orderkey = l_orderkey
 |  runtime filters: RF000 <- l_orderkey
-|  mem-estimate=671.79KB mem-reservation=136.00MB
+|  mem-estimate=671.79KB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=575772
 |
 |--F08:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
@@ -2155,7 +2155,7 @@ Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
 |  |  output: sum:merge(l_quantity)
 |  |  group by: l_orderkey
 |  |  having: sum(l_quantity) > 300
-|  |  mem-estimate=10.00MB mem-reservation=264.00MB
+|  |  mem-estimate=10.00MB mem-reservation=4.25MB spill-buffer=256.00KB
 |  |  tuple-ids=4 row-size=24B cardinality=156344
 |  |
 |  13:EXCHANGE [HASH(l_orderkey)]
@@ -2167,7 +2167,7 @@ Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
 |  04:AGGREGATE [STREAMING]
 |  |  output: sum(l_quantity)
 |  |  group by: l_orderkey
-|  |  mem-estimate=39.36MB mem-reservation=0B
+|  |  mem-estimate=39.36MB mem-reservation=0B spill-buffer=2.00MB
 |  |  tuple-ids=4 row-size=24B cardinality=1563438
 |  |
 |  03:SCAN HDFS [tpch.lineitem, RANDOM]
@@ -2183,7 +2183,7 @@ Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
 |  hash predicates: o_custkey = c_custkey
 |  fk/pk conjuncts: o_custkey = c_custkey
 |  runtime filters: RF001 <- c_custkey
-|  mem-estimate=6.61MB mem-reservation=136.00MB
+|  mem-estimate=6.61MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  tuple-ids=2,1,0 row-size=108B cardinality=5757710
 |
 |--F09:PLAN FRAGMENT [HASH(l_orderkey)] hosts=1 instances=2
@@ -2212,7 +2212,7 @@ Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
 |  hash predicates: l_orderkey = o_orderkey
 |  fk/pk conjuncts: l_orderkey = o_orderkey
 |  runtime filters: RF002 <- o_orderkey
-|  mem-estimate=13.11MB mem-reservation=136.00MB
+|  mem-estimate=13.11MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  tuple-ids=2,1 row-size=66B cardinality=5757710
 |
 |--F10:PLAN FRAGMENT [HASH(l_orderkey)] hosts=2 instances=4
@@ -2390,19 +2390,19 @@ from tpch_nested_parquet.customer c,
    join c.c_orders o2 on o1.o_orderkey = o2.o_orderkey
    order by o1.o_orderkey limit 100) v
 ---- PLAN
-Per-Host Resource Reservation: Memory=664.00MB
+Per-Host Resource Reservation: Memory=69.06MB
 Per-Host Resource Estimates: Memory=344.00MB
 WARNING: The following tables are missing relevant table and/or column statistics.
 tpch_nested_parquet.customer
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=344.00MB mem-reservation=664.00MB
+|  Per-Host Resources: mem-estimate=344.00MB mem-reservation=69.06MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 09:AGGREGATE [FINALIZE]
 |  group by: c_name, o1.o_orderkey, o2.o_orderstatus
-|  mem-estimate=128.00MB mem-reservation=264.00MB
+|  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=6 row-size=58B cardinality=1500000
 |
 01:SUBPLAN
@@ -2425,13 +2425,13 @@ PLAN-ROOT SINK
 |  |
 |  06:AGGREGATE [FINALIZE]
 |  |  group by: o1.o_orderkey, o2.o_orderstatus
-|  |  mem-estimate=128.00MB mem-reservation=264.00MB
+|  |  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  |  tuple-ids=3 row-size=24B cardinality=10
 |  |
 |  05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: o1.o_orderkey = o2.o_orderkey
 |  |  fk/pk conjuncts: assumed fk/pk
-|  |  mem-estimate=0B mem-reservation=136.00MB
+|  |  mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
 |  |  tuple-ids=1,2 row-size=32B cardinality=10
 |  |
 |  |--04:UNNEST [c.c_orders o2]
@@ -2452,7 +2452,7 @@ PLAN-ROOT SINK
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=66B cardinality=150000
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=664.00MB
+Per-Host Resource Reservation: Memory=69.06MB
 Per-Host Resource Estimates: Memory=472.00MB
 WARNING: The following tables are missing relevant table and/or column statistics.
 tpch_nested_parquet.customer
@@ -2467,10 +2467,10 @@ PLAN-ROOT SINK
 |  tuple-ids=6 row-size=58B cardinality=1500000
 |
 F01:PLAN FRAGMENT [HASH(c_name,v.o_orderkey,v.o_orderstatus)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=128.00MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=128.00MB mem-reservation=34.00MB
 11:AGGREGATE [FINALIZE]
 |  group by: c_name, v.o_orderkey, v.o_orderstatus
-|  mem-estimate=128.00MB mem-reservation=264.00MB
+|  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=6 row-size=58B cardinality=1500000
 |
 10:EXCHANGE [HASH(c_name,v.o_orderkey,v.o_orderstatus)]
@@ -2478,10 +2478,10 @@ Per-Host Resources: mem-estimate=128.00MB mem-reservation=264.00MB
 |  tuple-ids=6 row-size=58B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=344.00MB mem-reservation=400.00MB
+Per-Host Resources: mem-estimate=344.00MB mem-reservation=35.06MB
 09:AGGREGATE [STREAMING]
 |  group by: c_name, o1.o_orderkey, o2.o_orderstatus
-|  mem-estimate=128.00MB mem-reservation=0B
+|  mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=6 row-size=58B cardinality=1500000
 |
 01:SUBPLAN
@@ -2504,13 +2504,13 @@ Per-Host Resources: mem-estimate=344.00MB mem-reservation=400.00MB
 |  |
 |  06:AGGREGATE [FINALIZE]
 |  |  group by: o1.o_orderkey, o2.o_orderstatus
-|  |  mem-estimate=128.00MB mem-reservation=264.00MB
+|  |  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  |  tuple-ids=3 row-size=24B cardinality=10
 |  |
 |  05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: o1.o_orderkey = o2.o_orderkey
 |  |  fk/pk conjuncts: assumed fk/pk
-|  |  mem-estimate=0B mem-reservation=136.00MB
+|  |  mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
 |  |  tuple-ids=1,2 row-size=32B cardinality=10
 |  |
 |  |--04:UNNEST [c.c_orders o2]
@@ -2531,7 +2531,7 @@ Per-Host Resources: mem-estimate=344.00MB mem-reservation=400.00MB
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=66B cardinality=150000
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=1.30GB
+Per-Host Resource Reservation: Memory=138.12MB
 Per-Host Resource Estimates: Memory=944.00MB
 WARNING: The following tables are missing relevant table and/or column statistics.
 tpch_nested_parquet.customer
@@ -2546,10 +2546,10 @@ PLAN-ROOT SINK
 |  tuple-ids=6 row-size=58B cardinality=1500000
 |
 F01:PLAN FRAGMENT [HASH(c_name,v.o_orderkey,v.o_orderstatus)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=256.00MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=256.00MB mem-reservation=68.00MB
 11:AGGREGATE [FINALIZE]
 |  group by: c_name, v.o_orderkey, v.o_orderstatus
-|  mem-estimate=128.00MB mem-reservation=264.00MB
+|  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=6 row-size=58B cardinality=1500000
 |
 10:EXCHANGE [HASH(c_name,v.o_orderkey,v.o_orderstatus)]
@@ -2557,10 +2557,10 @@ Per-Host Resources: mem-estimate=256.00MB mem-reservation=528.00MB
 |  tuple-ids=6 row-size=58B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=688.00MB mem-reservation=800.00MB
+Per-Host Resources: mem-estimate=688.00MB mem-reservation=70.12MB
 09:AGGREGATE [STREAMING]
 |  group by: c_name, o1.o_orderkey, o2.o_orderstatus
-|  mem-estimate=128.00MB mem-reservation=0B
+|  mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=6 row-size=58B cardinality=1500000
 |
 01:SUBPLAN
@@ -2583,13 +2583,13 @@ Per-Host Resources: mem-estimate=688.00MB mem-reservation=800.00MB
 |  |
 |  06:AGGREGATE [FINALIZE]
 |  |  group by: o1.o_orderkey, o2.o_orderstatus
-|  |  mem-estimate=128.00MB mem-reservation=264.00MB
+|  |  mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  |  tuple-ids=3 row-size=24B cardinality=10
 |  |
 |  05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: o1.o_orderkey = o2.o_orderkey
 |  |  fk/pk conjuncts: assumed fk/pk
-|  |  mem-estimate=0B mem-reservation=136.00MB
+|  |  mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
 |  |  tuple-ids=1,2 row-size=32B cardinality=10
 |  |
 |  |--04:UNNEST [c.c_orders o2]
@@ -2619,13 +2619,13 @@ from tpch_nested_parquet.customer c,
     row_number() over (order by o_orderpriority) rnum_priority
    from c.c_orders) v;
 ---- PLAN
-Per-Host Resource Reservation: Memory=192.00MB
-Per-Host Resource Estimates: Memory=136.00MB
+Per-Host Resource Reservation: Memory=48.00MB
+Per-Host Resource Estimates: Memory=94.00MB
 WARNING: The following tables are missing relevant table and/or column statistics.
 tpch_nested_parquet.customer
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=136.00MB mem-reservation=192.00MB
+|  Per-Host Resources: mem-estimate=94.00MB mem-reservation=48.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -2646,36 +2646,36 @@ PLAN-ROOT SINK
 |  |  functions: row_number()
 |  |  order by: o_orderpriority ASC
 |  |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  |  mem-estimate=0B mem-reservation=16.00MB
+|  |  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  |  tuple-ids=12,7 row-size=164B cardinality=10
 |  |
 |  08:SORT
 |  |  order by: o_orderpriority ASC
-|  |  mem-estimate=16.00MB mem-reservation=48.00MB
+|  |  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  |  tuple-ids=12 row-size=156B cardinality=10
 |  |
 |  07:ANALYTIC
 |  |  functions: row_number()
 |  |  order by: o_orderdate ASC
 |  |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  |  mem-estimate=0B mem-reservation=16.00MB
+|  |  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  |  tuple-ids=10,6 row-size=156B cardinality=10
 |  |
 |  06:SORT
 |  |  order by: o_orderdate ASC
-|  |  mem-estimate=16.00MB mem-reservation=48.00MB
+|  |  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  |  tuple-ids=10 row-size=148B cardinality=10
 |  |
 |  05:ANALYTIC
 |  |  functions: row_number()
 |  |  order by: o_totalprice ASC
 |  |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  |  mem-estimate=0B mem-reservation=16.00MB
+|  |  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  |  tuple-ids=8,5 row-size=148B cardinality=10
 |  |
 |  04:SORT
 |  |  order by: o_totalprice ASC
-|  |  mem-estimate=16.00MB mem-reservation=48.00MB
+|  |  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  |  tuple-ids=8 row-size=140B cardinality=10
 |  |
 |  03:UNNEST [c.c_orders]
@@ -2691,8 +2691,8 @@ PLAN-ROOT SINK
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=254B cardinality=150000
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=192.00MB
-Per-Host Resource Estimates: Memory=136.00MB
+Per-Host Resource Reservation: Memory=48.00MB
+Per-Host Resource Estimates: Memory=94.00MB
 WARNING: The following tables are missing relevant table and/or column statistics.
 tpch_nested_parquet.customer
 
@@ -2706,7 +2706,7 @@ PLAN-ROOT SINK
 |  tuple-ids=12,7,0 row-size=418B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=136.00MB mem-reservation=192.00MB
+Per-Host Resources: mem-estimate=94.00MB mem-reservation=48.00MB
 01:SUBPLAN
 |  mem-estimate=0B mem-reservation=0B
 |  tuple-ids=12,7,0 row-size=418B cardinality=1500000
@@ -2724,36 +2724,36 @@ Per-Host Resources: mem-estimate=136.00MB mem-reservation=192.00MB
 |  |  functions: row_number()
 |  |  order by: o_orderpriority ASC
 |  |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  |  mem-estimate=0B mem-reservation=16.00MB
+|  |  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  |  tuple-ids=12,7 row-size=164B cardinality=10
 |  |
 |  08:SORT
 |  |  order by: o_orderpriority ASC
-|  |  mem-estimate=16.00MB mem-reservation=48.00MB
+|  |  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  |  tuple-ids=12 row-size=156B cardinality=10
 |  |
 |  07:ANALYTIC
 |  |  functions: row_number()
 |  |  order by: o_orderdate ASC
 |  |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  |  mem-estimate=0B mem-reservation=16.00MB
+|  |  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  |  tuple-ids=10,6 row-size=156B cardinality=10
 |  |
 |  06:SORT
 |  |  order by: o_orderdate ASC
-|  |  mem-estimate=16.00MB mem-reservation=48.00MB
+|  |  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  |  tuple-ids=10 row-size=148B cardinality=10
 |  |
 |  05:ANALYTIC
 |  |  functions: row_number()
 |  |  order by: o_totalprice ASC
 |  |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  |  mem-estimate=0B mem-reservation=16.00MB
+|  |  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  |  tuple-ids=8,5 row-size=148B cardinality=10
 |  |
 |  04:SORT
 |  |  order by: o_totalprice ASC
-|  |  mem-estimate=16.00MB mem-reservation=48.00MB
+|  |  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  |  tuple-ids=8 row-size=140B cardinality=10
 |  |
 |  03:UNNEST [c.c_orders]
@@ -2769,8 +2769,8 @@ Per-Host Resources: mem-estimate=136.00MB mem-reservation=192.00MB
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=254B cardinality=150000
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=384.00MB
-Per-Host Resource Estimates: Memory=272.00MB
+Per-Host Resource Reservation: Memory=96.00MB
+Per-Host Resource Estimates: Memory=188.00MB
 WARNING: The following tables are missing relevant table and/or column statistics.
 tpch_nested_parquet.customer
 
@@ -2784,7 +2784,7 @@ PLAN-ROOT SINK
 |  tuple-ids=12,7,0 row-size=418B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=272.00MB mem-reservation=384.00MB
+Per-Host Resources: mem-estimate=188.00MB mem-reservation=96.00MB
 01:SUBPLAN
 |  mem-estimate=0B mem-reservation=0B
 |  tuple-ids=12,7,0 row-size=418B cardinality=1500000
@@ -2802,36 +2802,36 @@ Per-Host Resources: mem-estimate=272.00MB mem-reservation=384.00MB
 |  |  functions: row_number()
 |  |  order by: o_orderpriority ASC
 |  |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  |  mem-estimate=0B mem-reservation=16.00MB
+|  |  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  |  tuple-ids=12,7 row-size=164B cardinality=10
 |  |
 |  08:SORT
 |  |  order by: o_orderpriority ASC
-|  |  mem-estimate=16.00MB mem-reservation=48.00MB
+|  |  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  |  tuple-ids=12 row-size=156B cardinality=10
 |  |
 |  07:ANALYTIC
 |  |  functions: row_number()
 |  |  order by: o_orderdate ASC
 |  |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  |  mem-estimate=0B mem-reservation=16.00MB
+|  |  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  |  tuple-ids=10,6 row-size=156B cardinality=10
 |  |
 |  06:SORT
 |  |  order by: o_orderdate ASC
-|  |  mem-estimate=16.00MB mem-reservation=48.00MB
+|  |  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  |  tuple-ids=10 row-size=148B cardinality=10
 |  |
 |  05:ANALYTIC
 |  |  functions: row_number()
 |  |  order by: o_totalprice ASC
 |  |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  |  mem-estimate=0B mem-reservation=16.00MB
+|  |  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  |  tuple-ids=8,5 row-size=148B cardinality=10
 |  |
 |  04:SORT
 |  |  order by: o_totalprice ASC
-|  |  mem-estimate=16.00MB mem-reservation=48.00MB
+|  |  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  |  tuple-ids=8 row-size=140B cardinality=10
 |  |
 |  03:UNNEST [c.c_orders]
@@ -2861,11 +2861,11 @@ join (
   ) v2 on v2.k3 = t2.o_orderkey
 ) v1 on v1.k3 = t1.o_orderkey
 ---- PLAN
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
 Per-Host Resource Estimates: Memory=172.59MB
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=172.59MB mem-reservation=272.00MB
+|  Per-Host Resources: mem-estimate=172.59MB mem-reservation=68.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -2873,21 +2873,21 @@ PLAN-ROOT SINK
 |  hash predicates: t1.o_orderkey = t3.o_orderkey
 |  fk/pk conjuncts: t1.o_orderkey = t3.o_orderkey
 |  runtime filters: RF000 <- t3.o_orderkey
-|  mem-estimate=37.77MB mem-reservation=136.00MB
+|  mem-estimate=37.77MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
 |
 |--05:HASH JOIN [INNER JOIN]
 |  |  hash predicates: t2.o_orderkey = t3.o_orderkey
 |  |  fk/pk conjuncts: t2.o_orderkey = t3.o_orderkey
 |  |  runtime filters: RF001 <- t3.o_orderkey
-|  |  mem-estimate=25.18MB mem-reservation=136.00MB
+|  |  mem-estimate=25.18MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  |  tuple-ids=1,2,3 row-size=24B cardinality=1500000
 |  |
 |  |--04:HASH JOIN [INNER JOIN]
 |  |  |  hash predicates: t3.o_orderkey = t4.o_orderkey
 |  |  |  fk/pk conjuncts: t3.o_orderkey = t4.o_orderkey
 |  |  |  runtime filters: RF002 <- t4.o_orderkey
-|  |  |  mem-estimate=12.59MB mem-reservation=136.00MB
+|  |  |  mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  |  tuple-ids=2,3 row-size=16B cardinality=1500000
 |  |  |
 |  |  |--03:SCAN HDFS [tpch_parquet.orders t4]
@@ -2925,7 +2925,7 @@ PLAN-ROOT SINK
    mem-estimate=40.00MB mem-reservation=0B
    tuple-ids=0 row-size=191B cardinality=1500000
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=408.00MB
+Per-Host Resource Reservation: Memory=59.50MB
 Per-Host Resource Estimates: Memory=216.65MB
 
 F05:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -2938,12 +2938,12 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=2
-Per-Host Resources: mem-estimate=77.77MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=77.77MB mem-reservation=34.00MB
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash predicates: t1.o_orderkey = t3.o_orderkey
 |  fk/pk conjuncts: t1.o_orderkey = t3.o_orderkey
 |  runtime filters: RF000 <- t3.o_orderkey
-|  mem-estimate=37.77MB mem-reservation=136.00MB
+|  mem-estimate=37.77MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
 |
 |--10:EXCHANGE [BROADCAST]
@@ -2951,19 +2951,19 @@ Per-Host Resources: mem-estimate=77.77MB mem-reservation=136.00MB
 |  |  tuple-ids=1,2,3 row-size=24B cardinality=1500000
 |  |
 |  F04:PLAN FRAGMENT [HASH(t3.o_orderkey)] hosts=2 instances=2
-|  Per-Host Resources: mem-estimate=18.88MB mem-reservation=272.00MB
+|  Per-Host Resources: mem-estimate=18.88MB mem-reservation=25.50MB
 |  05:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  hash predicates: t2.o_orderkey = t3.o_orderkey
 |  |  fk/pk conjuncts: t2.o_orderkey = t3.o_orderkey
 |  |  runtime filters: RF001 <- t3.o_orderkey
-|  |  mem-estimate=12.59MB mem-reservation=136.00MB
+|  |  mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
 |  |  tuple-ids=1,2,3 row-size=24B cardinality=1500000
 |  |
 |  |--04:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  |  hash predicates: t3.o_orderkey = t4.o_orderkey
 |  |  |  fk/pk conjuncts: t3.o_orderkey = t4.o_orderkey
 |  |  |  runtime filters: RF002 <- t4.o_orderkey
-|  |  |  mem-estimate=6.29MB mem-reservation=136.00MB
+|  |  |  mem-estimate=6.29MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  |  |  tuple-ids=2,3 row-size=16B cardinality=1500000
 |  |  |
 |  |  |--08:EXCHANGE [HASH(t4.o_orderkey)]
@@ -3019,7 +3019,7 @@ Per-Host Resources: mem-estimate=77.77MB mem-reservation=136.00MB
    mem-estimate=40.00MB mem-reservation=0B
    tuple-ids=0 row-size=191B cardinality=1500000
 ---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=816.00MB
+Per-Host Resource Reservation: Memory=93.50MB
 Per-Host Resource Estimates: Memory=414.41MB
 
 F05:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -3032,13 +3032,13 @@ PLAN-ROOT SINK
 |  tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
 |
 F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
-Per-Host Resources: mem-estimate=155.53MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=155.53MB mem-reservation=68.00MB
 06:HASH JOIN [INNER JOIN, BROADCAST]
 |  hash-table-id=00
 |  hash predicates: t1.o_orderkey = t3.o_orderkey
 |  fk/pk conjuncts: t1.o_orderkey = t3.o_orderkey
 |  runtime filters: RF000 <- t3.o_orderkey
-|  mem-estimate=37.77MB mem-reservation=136.00MB
+|  mem-estimate=37.77MB mem-reservation=34.00MB spill-buffer=2.00MB
 |  tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
 |
 |--F06:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -3053,13 +3053,13 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=272.00MB
 |  |  tuple-ids=1,2,3 row-size=24B cardinality=1500000
 |  |
 |  F04:PLAN FRAGMENT [HASH(t3.o_orderkey)] hosts=2 instances=4
-|  Per-Host Resources: mem-estimate=18.88MB mem-reservation=544.00MB
+|  Per-Host Resources: mem-estimate=18.88MB mem-reservation=25.50MB
 |  05:HASH JOIN [INNER JOIN, PARTITIONED]
 |  |  hash-table-id=01
 |  |  hash predicates: t2.o_orderkey = t3.o_orderkey
 |  |  fk/pk conjuncts: t2.o_orderkey = t3.o_orderkey
 |  |  runtime filters: RF001 <- t3.o_orderkey
-|  |  mem-estimate=6.29MB mem-reservation=136.00MB
+|  |  mem-estimate=6.29MB mem-reservation=8.50MB spill-buffer=512.00KB
 |  |  tuple-ids=1,2,3 row-size=24B cardinality=1500000
 |  |
 |  |--F07:PLAN FRAGMENT [HASH(t3.o_orderkey)] hosts=2 instances=4
@@ -3074,7 +3074,7 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=272.00MB
 |  |  |  hash predicates: t3.o_orderkey = t4.o_orderkey
 |  |  |  fk/pk conjuncts: t3.o_orderkey = t4.o_orderkey
 |  |  |  runtime filters: RF002 <- t4.o_orderkey
-|  |  |  mem-estimate=3.15MB mem-reservation=136.00MB
+|  |  |  mem-estimate=3.15MB mem-reservation=4.25MB spill-buffer=256.00KB
 |  |  |  tuple-ids=2,3 row-size=16B cardinality=1500000
 |  |  |
 |  |  |--F08:PLAN FRAGMENT [HASH(t3.o_orderkey)] hosts=2 instances=4
@@ -3387,12 +3387,12 @@ sum(smallint_col) over (partition by tinyint_col order by smallint_col
                                                 rows between 1 following and 2 following)
                                                 from functional.alltypesagg
 ---- PLAN
-Per-Host Resource Reservation: Memory=72.00MB
-Per-Host Resource Estimates: Memory=24.00MB
+Per-Host Resource Reservation: Memory=18.00MB
+Per-Host Resource Estimates: Memory=18.00MB
 Codegen disabled by planner
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=24.00MB mem-reservation=72.00MB
+|  Per-Host Resources: mem-estimate=18.00MB mem-reservation=18.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -3401,7 +3401,7 @@ PLAN-ROOT SINK
 |  partition by: tinyint_col
 |  order by: smallint_col ASC
 |  window: ROWS BETWEEN 1 FOLLOWING AND 2 FOLLOWING
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=5,2,3,4 row-size=27B cardinality=11000
 |
 03:ANALYTIC
@@ -3409,7 +3409,7 @@ PLAN-ROOT SINK
 |  partition by: tinyint_col
 |  order by: smallint_col ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=5,2,3 row-size=19B cardinality=11000
 |
 02:ANALYTIC
@@ -3417,12 +3417,12 @@ PLAN-ROOT SINK
 |  partition by: tinyint_col
 |  order by: smallint_col ASC
 |  window: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=5,2 row-size=11B cardinality=11000
 |
 01:SORT
 |  order by: tinyint_col ASC NULLS FIRST, smallint_col ASC
-|  mem-estimate=8.00MB mem-reservation=24.00MB
+|  mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
 |  tuple-ids=5 row-size=3B cardinality=11000
 |
 00:SCAN HDFS [functional.alltypesagg]

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test b/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test
index 3e5fb05..66f8167 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test
@@ -2,14 +2,14 @@
 select * from functional.alltypes order by random()
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+|  Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 01:SORT
 |  order by: random() ASC
 |  materialized: random()
-|  mem-estimate=16.00MB mem-reservation=48.00MB
+|  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=105B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -24,14 +24,14 @@ PLAN-ROOT SINK
 select * from functional.alltypes order by abs(id) + abs(id)
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+|  Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 01:SORT
 |  order by: abs(id) + abs(id) ASC
 |  materialized: abs(id) + abs(id)
-|  mem-estimate=16.00MB mem-reservation=48.00MB
+|  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=105B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -46,13 +46,13 @@ PLAN-ROOT SINK
 select * from functional.alltypes order by tinyint_col + 1
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+|  Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 01:SORT
 |  order by: tinyint_col + 1 ASC
-|  mem-estimate=16.00MB mem-reservation=48.00MB
+|  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=97B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -68,14 +68,14 @@ select * from functional.alltypes
 order by dayofweek(timestamp_col), true, id + 1, string_col = date_string_col, id = tinyint_col
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+|  Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 01:SORT
 |  order by: dayofweek(timestamp_col) ASC, TRUE ASC, id + 1 ASC, string_col = date_string_col ASC, id = tinyint_col ASC
 |  materialized: dayofweek(timestamp_col), string_col = date_string_col
-|  mem-estimate=16.00MB mem-reservation=48.00MB
+|  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=102B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -91,7 +91,7 @@ select last_value(id) over (order by to_date(timestamp_col), bool_col is null)
 from functional.alltypes
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=64.00MB
+|  Per-Host Resources: mem-estimate=130.00MB mem-reservation=16.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -99,13 +99,13 @@ PLAN-ROOT SINK
 |  functions: last_value(id)
 |  order by: to_date(timestamp_col) ASC, bool_col IS NULL ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=3,2 row-size=41B cardinality=7300
 |
 01:SORT
 |  order by: to_date(timestamp_col) ASC, bool_col IS NULL ASC
 |  materialized: to_date(timestamp_col)
-|  mem-estimate=16.00MB mem-reservation=48.00MB
+|  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=3 row-size=37B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -143,14 +143,14 @@ PLAN-ROOT SINK
 select * from functional.alltypes order by TestFn(double_col)
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+|  Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 01:SORT
 |  order by: default.testfn(double_col) ASC
 |  materialized: default.testfn(double_col)
-|  mem-estimate=16.00MB mem-reservation=48.00MB
+|  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=101B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -165,14 +165,14 @@ PLAN-ROOT SINK
 select concat(date_string_col, string_col) c from functional.alltypes order by c
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+|  Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 01:SORT
 |  order by: concat(date_string_col, string_col) ASC
 |  materialized: concat(date_string_col, string_col)
-|  mem-estimate=16.00MB mem-reservation=48.00MB
+|  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=16B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]



[03/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/sorter.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/sorter.h b/be/src/runtime/sorter.h
index 80c5558..cafab72 100644
--- a/be/src/runtime/sorter.h
+++ b/be/src/runtime/sorter.h
@@ -20,7 +20,7 @@
 
 #include <deque>
 
-#include "runtime/buffered-block-mgr.h"
+#include "runtime/bufferpool/buffer-pool.h"
 #include "util/tuple-row-compare.h"
 
 namespace impala {
@@ -31,8 +31,7 @@ class RowBatch;
 
 /// Sorter contains the external sort implementation. Its purpose is to sort arbitrarily
 /// large input data sets with a fixed memory budget by spilling data to disk if
-/// necessary. BufferedBlockMgr is used to allocate and manage blocks of data to be
-/// sorted.
+/// necessary.
 //
 /// The client API for Sorter is as follows:
 /// AddBatch() is used to add input rows to be sorted. Multiple tuples in an input row are
@@ -52,20 +51,20 @@ class RowBatch;
 /// GetNext() is used to retrieve sorted rows. It can be called multiple times.
 /// AddBatch()/AddBatchNoSpill(), InputDone() and GetNext() must be called in that order.
 //
-/// Batches of input rows are collected into a sequence of pinned BufferedBlockMgr blocks
-/// called a run. The maximum size of a run is determined by the number of blocks that
+/// Batches of input rows are collected into a sequence of pinned BufferPool pages
+/// called a run. The maximum size of a run is determined by the number of pages that
 /// can be pinned by the Sorter. After the run is full, it is sorted in memory, unpinned
 /// and the next run is constructed. The variable-length column data (e.g. string slots)
-/// in the materialized sort tuples are stored in a separate sequence of blocks from the
-/// tuples themselves.  When the blocks containing tuples in a run are unpinned, the
+/// in the materialized sort tuples are stored in a separate sequence of pages from the
+/// tuples themselves.  When the pages containing tuples in a run are unpinned, the
 /// var-len slot pointers are converted to offsets from the start of the first var-len
-/// data block. When a block is read back, these offsets are converted back to pointers.
+/// data page. When a page is read back, these offsets are converted back to pointers.
 /// The in-memory sorter sorts the fixed-length tuples in-place. The output rows have the
 /// same schema as the materialized sort tuples.
 //
 /// After the input is consumed, the sorter is left with one or more sorted runs. If
 /// there are multiple runs, the runs are merged using SortedRunMerger. At least one
-/// block per run (two if there are var-length slots) must be pinned in memory during
+/// page per run (two if there are var-length slots) must be pinned in memory during
 /// a merge, so multiple merges may be necessary if the number of runs is too large.
 /// First a series of intermediate merges are performed, until the number of runs is
 /// small enough to do a single final merge that returns batches of sorted rows to the
@@ -73,7 +72,7 @@ class RowBatch;
 ///
 /// If there is a single sorted run (i.e. no merge required), only tuple rows are
 /// copied into the output batch supplied by GetNext(), and the data itself is left in
-/// pinned blocks held by the sorter.
+/// pinned pages held by the sorter.
 ///
 /// When merges are performed, one input batch is created to hold tuple rows for each
 /// input run, and one batch is created to hold deep copied rows (i.e. ptrs + data) from
@@ -84,7 +83,7 @@ class RowBatch;
 /// During a merge, one row batch is created for each input run, and one batch is created
 /// for the output of the merge (if is not the final merge). It is assumed that the memory
 /// for these batches have already been accounted for in the memory budget for the sort.
-/// That is, the memory for these batches does not come out of the block buffer manager.
+/// That is, the memory for these batches does not come out of the buffer pool.
 //
 /// TODO: Not necessary to actually copy var-len data - instead take ownership of the
 /// var-length data in the input batch. Copying can be deferred until a run is unpinned.
@@ -96,17 +95,23 @@ class Sorter {
   /// 'sort_tuple_exprs' are the slot exprs used to materialize the tuples to be
   /// sorted. 'compare_less_than' is a comparator for the sort tuples (returns true if
   /// lhs < rhs). 'merge_batch_size_' is the size of the batches created to provide rows
-  /// to the merger and retrieve rows from an intermediate merger. 'enable_spilling'
-  /// should be set to false to reduce the number of requested buffers if the caller will
-  /// use AddBatchNoSpill().
+  /// to the merger and retrieve rows from an intermediate merger. 'node_id' is the ID of
+  /// the exec node using the sorter for error reporting. 'enable_spilling' should be set
+  /// to false to reduce the number of requested buffers if the caller will use
+  /// AddBatchNoSpill().
+  ///
+  /// The Sorter assumes that it has exclusive use of the client's
+  /// reservations for sorting, and may increase the size of the client's reservation.
+  /// The caller is responsible for ensuring that the minimum reservation (returned from
+  /// ComputeMinReservation()) is available.
   Sorter(const TupleRowComparator& compare_less_than,
       const std::vector<ScalarExpr*>& sort_tuple_exprs, RowDescriptor* output_row_desc,
-      MemTracker* mem_tracker, RuntimeProfile* profile, RuntimeState* state,
-      bool enable_spilling = true);
-
+      MemTracker* mem_tracker, BufferPool::ClientHandle* client, int64_t page_len,
+      RuntimeProfile* profile, RuntimeState* state, int node_id,
+      bool enable_spilling);
   ~Sorter();
 
-  /// Initial set-up of the sorter for execution. Registers with the block mgr.
+  /// Initial set-up of the sorter for execution.
   /// The evaluators for 'sort_tuple_exprs_' will be created and stored in 'obj_pool'.
   /// All allocation from the evaluators will be from 'expr_mem_pool'.
   Status Prepare(ObjectPool* obj_pool, MemPool* expr_mem_pool) WARN_UNUSED_RESULT;
@@ -143,24 +148,29 @@ class Sorter {
   /// Close the Sorter and free resources.
   void Close(RuntimeState* state);
 
+  /// Compute the minimum amount of buffer memory in bytes required to execute a
+  /// sort with the current sorter.
+  int64_t ComputeMinReservation();
+
  private:
+  class Page;
   class Run;
   class TupleIterator;
   class TupleSorter;
 
   /// Create a SortedRunMerger from sorted runs in 'sorted_runs_' and assign it to
   /// 'merger_'. Attempts to set up merger with 'max_num_runs' runs but may set it
-  /// up with fewer if it cannot pin the initial blocks of all of the runs. Fails
+  /// up with fewer if it cannot pin the initial pages of all of the runs. Fails
   /// if it cannot merge at least two runs. The runs to be merged are removed from
   /// 'sorted_runs_'.  The Sorter sets the 'deep_copy_input' flag to true for the
-  /// merger, since the blocks containing input run data will be deleted as input
+  /// merger, since the pages containing input run data will be deleted as input
   /// runs are read.
   Status CreateMerger(int max_num_runs) WARN_UNUSED_RESULT;
 
   /// Repeatedly replaces multiple smaller runs in sorted_runs_ with a single larger
   /// merged run until there are few enough runs to be merged with a single merger.
   /// Returns when 'merger_' is set up to merge the final runs.
-  /// At least 1 (2 if var-len slots) block from each sorted run must be pinned for
+  /// At least 1 (2 if var-len slots) page from each sorted run must be pinned for
   /// a merge. If the number of sorted runs is too large, merge sets of smaller runs
   /// into large runs until a final merge can be performed. An intermediate row batch
   /// containing deep copied rows is used for the output of each intermediate merge.
@@ -177,6 +187,9 @@ class Sorter {
   /// Helper that cleans up all runs in the sorter.
   void CleanupAllRuns();
 
+  /// ID of the ExecNode that owns the sorter, used for error reporting.
+  const int node_id_;
+
   /// Runtime state instance used to check for cancellation. Not owned.
   RuntimeState* const state_;
 
@@ -184,11 +197,11 @@ class Sorter {
   const TupleRowComparator& compare_less_than_;
   boost::scoped_ptr<TupleSorter> in_mem_tuple_sorter_;
 
-  /// Block manager object used to allocate, pin and release runs. Not owned by Sorter.
-  BufferedBlockMgr* block_mgr_;
+  /// Client used to allocate pages from the buffer pool. Not owned.
+  BufferPool::ClientHandle* const buffer_pool_client_;
 
-  /// Handle to block mgr to make allocations from.
-  BufferedBlockMgr::Client* block_mgr_client_;
+  /// The length of page to use.
+  const int64_t page_len_;
 
   /// True if the tuples to be sorted have var-length slots.
   bool has_var_len_slots_;
@@ -211,7 +224,7 @@ class Sorter {
   /// BEGIN: Members that must be Reset()
 
   /// The current unsorted run that is being collected. Is sorted and added to
-  /// sorted_runs_ after it is full (i.e. number of blocks allocated == max available
+  /// sorted_runs_ after it is full (i.e. number of pages allocated == max available
   /// buffers) or after the input is complete. Owned and placed in obj_pool_.
   /// When it is added to sorted_runs_, it is set to NULL.
   Run* unsorted_run_;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/test-env.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/test-env.cc b/be/src/runtime/test-env.cc
index 37b4363..23dfa4c 100644
--- a/be/src/runtime/test-env.cc
+++ b/be/src/runtime/test-env.cc
@@ -20,7 +20,6 @@
 #include <limits>
 #include <memory>
 
-#include "runtime/buffered-block-mgr.h"
 #include "runtime/query-exec-mgr.h"
 #include "runtime/tmp-file-mgr.h"
 #include "runtime/query-state.h"
@@ -38,8 +37,8 @@ scoped_ptr<MetricGroup> TestEnv::static_metrics_;
 
 TestEnv::TestEnv()
   : have_tmp_file_mgr_args_(false),
-    buffer_pool_min_buffer_len_(-1),
-    buffer_pool_capacity_(-1) {}
+    buffer_pool_min_buffer_len_(64 * 1024),
+    buffer_pool_capacity_(0) {}
 
 Status TestEnv::Init() {
   if (static_metrics_ == NULL) {
@@ -59,9 +58,7 @@ Status TestEnv::Init() {
   } else {
     RETURN_IF_ERROR(tmp_file_mgr()->Init(metrics()));
   }
-  if (buffer_pool_min_buffer_len_ != -1 && buffer_pool_capacity_ != -1) {
-    exec_env_->InitBufferPool(buffer_pool_min_buffer_len_, buffer_pool_capacity_);
-  }
+  exec_env_->InitBufferPool(buffer_pool_min_buffer_len_, buffer_pool_capacity_);
   return Status::OK();
 }
 
@@ -88,6 +85,7 @@ void TestEnv::TearDownQueries() {
   for (RuntimeState* runtime_state : runtime_states_) runtime_state->ReleaseResources();
   runtime_states_.clear();
   for (QueryState* query_state : query_states_) {
+    query_state->ReleaseInitialReservationRefcount();
     exec_env_->query_exec_mgr()->ReleaseQueryState(query_state);
   }
   query_states_.clear();
@@ -137,17 +135,4 @@ Status TestEnv::CreateQueryState(
   *runtime_state = rs;
   return Status::OK();
 }
-
-Status TestEnv::CreateQueryStateWithBlockMgr(int64_t query_id, int max_buffers,
-    int block_size, const TQueryOptions* query_options, RuntimeState** runtime_state) {
-  RETURN_IF_ERROR(CreateQueryState(query_id, query_options, runtime_state));
-
-  shared_ptr<BufferedBlockMgr> mgr;
-  RETURN_IF_ERROR(BufferedBlockMgr::Create(*runtime_state,
-      (*runtime_state)->query_state()->query_mem_tracker(),
-      (*runtime_state)->runtime_profile(), tmp_file_mgr(),
-      CalculateMemLimit(max_buffers, block_size), block_size, &mgr));
-  (*runtime_state)->set_block_mgr(mgr);
-  return Status::OK();
-}
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/test-env.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/test-env.h b/be/src/runtime/test-env.h
index f314452..e721510 100644
--- a/be/src/runtime/test-env.h
+++ b/be/src/runtime/test-env.h
@@ -55,13 +55,8 @@ class TestEnv {
   Status CreateQueryState(
       int64_t query_id, const TQueryOptions* query_options, RuntimeState** runtime_state);
 
-  /// Same as CreateQueryState() but also creates a BufferedBlockMgr with the provided
-  /// parameters. If 'max_buffers' is -1, there is no limit, otherwise the limit is
-  /// max_buffers * block_size.
-  Status CreateQueryStateWithBlockMgr(int64_t query_id, int max_buffers, int block_size,
-      const TQueryOptions* query_options, RuntimeState** runtime_state);
-  /// Destroy all query states and associated RuntimeStates, BufferedBlockMgrs,
-  /// etc, that were created since the last TearDownQueries() call.
+  /// Destroy all query states and associated RuntimeStates, etc, that were created since
+  /// the last TearDownQueries() call.
   void TearDownQueries();
 
   /// Calculate memory limit accounting for overflow and negative values.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/tmp-file-mgr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr-test.cc b/be/src/runtime/tmp-file-mgr-test.cc
index c94ba1d..343ec93 100644
--- a/be/src/runtime/tmp-file-mgr-test.cc
+++ b/be/src/runtime/tmp-file-mgr-test.cc
@@ -145,6 +145,12 @@ class TmpFileMgrTest : public ::testing::Test {
     return bytes_allocated;
   }
 
+  /// Helpers to call WriteHandle methods.
+  void Cancel(TmpFileMgr::WriteHandle* handle) { handle->Cancel(); }
+  void WaitForWrite(TmpFileMgr::WriteHandle* handle) {
+    handle->WaitForWrite();
+  }
+
   // Write callback, which signals 'cb_cv_' and increments 'cb_counter_'.
   void SignalCallback(Status write_status) {
     {
@@ -481,8 +487,8 @@ TEST_F(TmpFileMgrTest, TestEncryptionDuringCancellation) {
   string file_path = handle->TmpFilePath();
 
   // Cancel the write - prior to the IMPALA-4820 fix decryption could race with the write.
-  handle->Cancel();
-  handle->WaitForWrite();
+  Cancel(handle.get());
+  WaitForWrite(handle.get());
   ASSERT_OK(file_group.RestoreData(move(handle), data_mem_range));
   WaitForCallbacks(1);
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/tmp-file-mgr.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr.h b/be/src/runtime/tmp-file-mgr.h
index c71c370..ba7210d 100644
--- a/be/src/runtime/tmp-file-mgr.h
+++ b/be/src/runtime/tmp-file-mgr.h
@@ -281,21 +281,9 @@ class TmpFileMgr {
       DCHECK(read_range_ == nullptr);
     }
 
-    /// Cancels any in-flight writes or reads. Reads are cancelled synchronously and
-    /// writes are cancelled asynchronously. After Cancel() is called, writes are not
-    /// retried. The write callback may be called with a CANCELLED status (unless
-    /// it succeeded or encountered a different error first).
-    /// TODO: IMPALA-3200: make this private once BufferedBlockMgr doesn't need it.
-    void Cancel();
-
     /// Cancel any in-flight read synchronously.
     void CancelRead();
 
-    /// Blocks until the write completes either successfully or unsuccessfully.
-    /// May return before the write callback has been called.
-    /// TODO: IMPALA-3200: make this private once BufferedBlockMgr doesn't need it.
-    void WaitForWrite();
-
     /// Path of temporary file backing the block. Intended for use in testing.
     /// Returns empty string if no backing file allocated.
     std::string TmpFilePath() const;
@@ -307,6 +295,7 @@ class TmpFileMgr {
 
    private:
     friend class FileGroup;
+    friend class TmpFileMgrTest;
 
     WriteHandle(RuntimeProfile::Counter* encryption_timer, WriteDoneCallback cb);
 
@@ -327,6 +316,16 @@ class TmpFileMgr {
     /// then calls 'cb_'.
     void WriteComplete(const Status& write_status);
 
+    /// Cancels any in-flight writes or reads. Reads are cancelled synchronously and
+    /// writes are cancelled asynchronously. After Cancel() is called, writes are not
+    /// retried. The write callback may be called with a CANCELLED status (unless
+    /// it succeeded or encountered a different error first).
+    void Cancel();
+
+    /// Blocks until the write completes either successfully or unsuccessfully.
+    /// May return before the write callback has been called.
+    void WaitForWrite();
+
     /// Encrypts the data in 'buffer' in-place and computes 'hash_'.
     Status EncryptAndHash(MemRange buffer) WARN_UNUSED_RESULT;
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/service/client-request-state.cc
----------------------------------------------------------------------
diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc
index 6be04f6..bf0f9b4 100644
--- a/be/src/service/client-request-state.cc
+++ b/be/src/service/client-request-state.cc
@@ -399,9 +399,9 @@ Status ClientRequestState::ExecQueryOrDmlRequest(
     ss << query_exec_request.per_host_mem_estimate;
     summary_profile_.AddInfoString(PER_HOST_MEM_KEY, ss.str());
   }
-  if (query_exec_request.__isset.per_host_min_reservation) {
+  if (query_exec_request.query_ctx.__isset.per_host_min_reservation) {
     stringstream ss;
-    ss << query_exec_request.per_host_min_reservation;
+    ss << query_exec_request.query_ctx.per_host_min_reservation;
     summary_profile_.AddInfoString(PER_HOST_MEMORY_RESERVATION_KEY, ss.str());
   }
   if (!query_exec_request.query_ctx.__isset.parent_query_id &&

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/service/query-options.cc
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 8dcd7af..c123902 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -261,10 +261,10 @@ Status impala::SetQueryOption(const string& key, const string& value,
       case TImpalaQueryOptions::QUERY_TIMEOUT_S:
         query_options->__set_query_timeout_s(atoi(value.c_str()));
         break;
-      case TImpalaQueryOptions::MAX_BLOCK_MGR_MEMORY: {
+      case TImpalaQueryOptions::BUFFER_POOL_LIMIT: {
         int64_t mem;
-        RETURN_IF_ERROR(ParseMemValue(value, "block mgr memory limit", &mem));
-        query_options->__set_max_block_mgr_memory(mem);
+        RETURN_IF_ERROR(ParseMemValue(value, "buffer pool limit", &mem));
+        query_options->__set_buffer_pool_limit(mem);
         break;
       }
       case TImpalaQueryOptions::APPX_COUNT_DISTINCT: {
@@ -505,6 +505,28 @@ Status impala::SetQueryOption(const string& key, const string& value,
         query_options->__set_disable_codegen_rows_threshold(val);
         break;
       }
+      case TImpalaQueryOptions::DEFAULT_SPILLABLE_BUFFER_SIZE: {
+        int64_t buffer_size_bytes;
+        RETURN_IF_ERROR(
+            ParseMemValue(value, "Spillable buffer size", &buffer_size_bytes));
+        if (!BitUtil::IsPowerOf2(buffer_size_bytes)) {
+          return Status(
+              Substitute("Buffer size must be a power of two: $0", buffer_size_bytes));
+        }
+        query_options->__set_default_spillable_buffer_size(buffer_size_bytes);
+        break;
+      }
+      case TImpalaQueryOptions::MIN_SPILLABLE_BUFFER_SIZE: {
+        int64_t buffer_size_bytes;
+        RETURN_IF_ERROR(
+            ParseMemValue(value, "Spillable buffer size", &buffer_size_bytes));
+        if (!BitUtil::IsPowerOf2(buffer_size_bytes)) {
+          return Status(
+              Substitute("Buffer size must be a power of two: $0", buffer_size_bytes));
+        }
+        query_options->__set_min_spillable_buffer_size(buffer_size_bytes);
+        break;
+      }
       default:
         // We hit this DCHECK(false) if we forgot to add the corresponding entry here
         // when we add a new query option.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/service/query-options.h
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index 603c783..8d6af02 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -35,7 +35,7 @@ class TQueryOptions;
 // the DCHECK.
 #define QUERY_OPTS_TABLE\
   DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\
-      TImpalaQueryOptions::DISABLE_CODEGEN_ROWS_THRESHOLD + 1);\
+      TImpalaQueryOptions::MIN_SPILLABLE_BUFFER_SIZE + 1);\
   QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\
   QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR)\
   QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\
@@ -62,7 +62,7 @@ class TQueryOptions;
   QUERY_OPT_FN(v_cpu_cores, V_CPU_CORES)\
   QUERY_OPT_FN(rm_initial_mem, RM_INITIAL_MEM)\
   QUERY_OPT_FN(query_timeout_s, QUERY_TIMEOUT_S)\
-  QUERY_OPT_FN(max_block_mgr_memory, MAX_BLOCK_MGR_MEMORY)\
+  QUERY_OPT_FN(buffer_pool_limit, BUFFER_POOL_LIMIT)\
   QUERY_OPT_FN(appx_count_distinct, APPX_COUNT_DISTINCT)\
   QUERY_OPT_FN(disable_unsafe_spills, DISABLE_UNSAFE_SPILLS)\
   QUERY_OPT_FN(seq_compression_mode, SEQ_COMPRESSION_MODE)\
@@ -93,6 +93,8 @@ class TQueryOptions;
   QUERY_OPT_FN(parquet_read_statistics, PARQUET_READ_STATISTICS)\
   QUERY_OPT_FN(default_join_distribution_mode, DEFAULT_JOIN_DISTRIBUTION_MODE)\
   QUERY_OPT_FN(disable_codegen_rows_threshold, DISABLE_CODEGEN_ROWS_THRESHOLD)\
+  QUERY_OPT_FN(default_spillable_buffer_size, DEFAULT_SPILLABLE_BUFFER_SIZE)\
+  QUERY_OPT_FN(min_spillable_buffer_size, MIN_SPILLABLE_BUFFER_SIZE)\
   ;
 
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/util/bloom-filter.h
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter.h b/be/src/util/bloom-filter.h
index 5ebd9b5..913b331 100644
--- a/be/src/util/bloom-filter.h
+++ b/be/src/util/bloom-filter.h
@@ -28,7 +28,7 @@
 #include "common/compiler-util.h"
 #include "gen-cpp/ImpalaInternalService_types.h"
 #include "gutil/macros.h"
-#include "runtime/buffered-block-mgr.h"
+#include "util/cpu-info.h"
 #include "util/hash-util.h"
 
 namespace impala {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/util/static-asserts.cc
----------------------------------------------------------------------
diff --git a/be/src/util/static-asserts.cc b/be/src/util/static-asserts.cc
index cf12e36..7662906 100644
--- a/be/src/util/static-asserts.cc
+++ b/be/src/util/static-asserts.cc
@@ -18,7 +18,6 @@
 #include <boost/static_assert.hpp>
 
 #include "common/hdfs.h"
-#include "runtime/buffered-tuple-stream.h"
 #include "runtime/string-value.h"
 #include "runtime/timestamp-value.h"
 #include "udf/udf.h"
@@ -37,7 +36,6 @@ class UnusedClass {
   BOOST_STATIC_ASSERT(sizeof(boost::gregorian::date) == 4);
   BOOST_STATIC_ASSERT(sizeof(hdfsFS) == sizeof(void*));
   BOOST_STATIC_ASSERT(sizeof(hdfsFile) == sizeof(void*));
-  BOOST_STATIC_ASSERT(sizeof(BufferedTupleStream::RowIdx) == sizeof(void*));
 
   // If the memory layout of any of these types changes, it will be necessary to change
   // LlvmCodeGen::GetUdfValType(), and we may also run into calling convention problems

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/Frontend.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/Frontend.thrift b/common/thrift/Frontend.thrift
index 79da0d6..3a88915 100644
--- a/common/thrift/Frontend.thrift
+++ b/common/thrift/Frontend.thrift
@@ -389,23 +389,11 @@ struct TQueryExecRequest {
   // Estimated per-host peak memory consumption in bytes. Used for resource management.
   8: optional i64 per_host_mem_estimate
 
-  // Minimum query-wide buffer reservation required per host in bytes. This is the peak
-  // minimum reservation that may be required by the concurrently-executing operators at
-  // any point in query execution. It may be less than the initial reservation total
-  // claims (below) if execution of some operators never overlaps, which allows reuse of
-  // reservations.
-  9: optional i64 per_host_min_reservation;
-
-  // Total of the initial buffer reservations that we expect to be claimed per host.
-  // I.e. the sum over all operators in all fragment instances that execute on that host.
-  // Measured in bytes.
-  10: optional i64 per_host_initial_reservation_total_claims;
-
   // List of replica hosts.  Used by the host_idx field of TScanRangeLocation.
-  11: required list<Types.TNetworkAddress> host_list
+  9: required list<Types.TNetworkAddress> host_list
 
   // Column lineage graph
-  12: optional LineageGraph.TLineageGraph lineage_graph
+  10: optional LineageGraph.TLineageGraph lineage_graph
 }
 
 enum TCatalogOpType {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/ImpalaInternalService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift
index 4aefe55..b477299 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -130,7 +130,7 @@ struct TQueryOptions {
   26: optional i32 query_timeout_s = 0
 
   // test hook to cap max memory for spilling operators (to force them to spill).
-  27: optional i64 max_block_mgr_memory
+  27: optional i64 buffer_pool_limit
 
   // If true, transforms all count(distinct) aggregations into NDV()
   28: optional bool appx_count_distinct = 0
@@ -255,6 +255,14 @@ struct TQueryOptions {
   // If the number of rows processed per node is below the threshold codegen will be
   // automatically disabled by the planner.
   57: optional i32 disable_codegen_rows_threshold = 50000
+
+  // The default spillable buffer size in bytes, which may be overridden by the planner.
+  // Defaults to 2MB.
+  58: optional i64 default_spillable_buffer_size = 2097152;
+
+  // The minimum spillable buffer to use. The planner will not choose a size smaller than
+  // this. Defaults to 64KB.
+  59: optional i64 min_spillable_buffer_size = 65536;
 }
 
 // Impala currently has two types of sessions: Beeswax and HiveServer2
@@ -375,6 +383,18 @@ struct TQueryCtx {
   // String containing a timestamp (in UTC) set as the query submission time. It
   // represents the same point in time as now_string
   17: required string utc_timestamp_string
+
+  // Minimum query-wide buffer reservation required per host in bytes. This is the peak
+  // minimum reservation that may be required by the concurrently-executing operators at
+  // any point in query execution. It may be less than the initial reservation total
+  // claims (below) if execution of some operators never overlaps, which allows reuse of
+  // reservations.
+  18: optional i64 per_host_min_reservation;
+
+  // Total of the initial buffer reservations that we expect to be claimed per host.
+  // I.e. the sum over all operators in all fragment instances that execute on that host.
+  // Measured in bytes.
+  19: optional i64 per_host_initial_reservation_total_claims;
 }
 
 // Specification of one output destination of a plan fragment

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/ImpalaService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift
index ec82bf1..ced884b 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -159,7 +159,7 @@ enum TImpalaQueryOptions {
   QUERY_TIMEOUT_S,
 
   // Test hook for spill to disk operators
-  MAX_BLOCK_MGR_MEMORY,
+  BUFFER_POOL_LIMIT,
 
   // Transforms all count(distinct) aggregations into NDV()
   APPX_COUNT_DISTINCT,
@@ -279,6 +279,12 @@ enum TImpalaQueryOptions {
   // If the number of rows processed per node is below the threshold and disable_codegen
   // is unset, codegen will be automatically be disabled by the planner.
   DISABLE_CODEGEN_ROWS_THRESHOLD,
+
+  // The default spillable buffer size, in bytes.
+  DEFAULT_SPILLABLE_BUFFER_SIZE,
+
+  // The minimum spillable buffer size, in bytes.
+  MIN_SPILLABLE_BUFFER_SIZE,
 }
 
 // The summary of a DML statement.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/PlanNodes.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/PlanNodes.thrift b/common/thrift/PlanNodes.thrift
index c1ff302..468ca44 100644
--- a/common/thrift/PlanNodes.thrift
+++ b/common/thrift/PlanNodes.thrift
@@ -481,6 +481,21 @@ struct TUnnestNode {
   1: required Exprs.TExpr collection_expr
 }
 
+// This contains all of the information computed by the plan as part of the resource
+// profile that is needed by the backend to execute.
+struct TBackendResourceProfile {
+  // The minimum reservation for this plan node in bytes.
+  1: required i64 min_reservation
+
+  // The maximum reservation for this plan node in bytes. MAX_INT64 means effectively
+  // unlimited.
+  2: required i64 max_reservation
+
+  // The spillable buffer size in bytes to use for this node, chosen by the planner.
+  // Set iff the node uses spillable buffers.
+  3: optional i64 spillable_buffer_size
+}
+
 // This is essentially a union of all messages corresponding to subclasses
 // of PlanNode.
 struct TPlanNode {
@@ -526,6 +541,9 @@ struct TPlanNode {
 
   // Runtime filters assigned to this plan node
   24: optional list<TRuntimeFilterDesc> runtime_filters
+
+  // Resource profile for this plan node.
+  25: required TBackendResourceProfile resource_profile
 }
 
 // A flattened representation of a tree of PlanNodes, obtained by depth-first

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 80e054e..ccd713c 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -240,11 +240,11 @@ error_codes = (
 
   ("PARTITIONED_HASH_JOIN_REPARTITION_FAILS", 76, "Cannot perform hash join at node with "
    "id $0. Repartitioning did not reduce the size of a spilled partition. Repartitioning "
-   "level $1. Number of rows $2."),
+   "level $1. Number of rows $2:\\n$3\\n$4"),
 
   ("PARTITIONED_AGG_REPARTITION_FAILS", 77,  "Cannot perform aggregation at node with "
    "id $0. Repartitioning did not reduce the size of a spilled partition. Repartitioning "
-   "level $1. Number of rows $2."),
+   "level $1. Number of rows $2:\\n$3\\n$4"),
 
   ("AVRO_TRUNCATED_BLOCK", 78, "File '$0' is corrupt: truncated data block at offset $1"),
 
@@ -322,10 +322,14 @@ error_codes = (
 
   # TODO: IMPALA-3200: make sure that this references the correct query option.
   ("MAX_ROW_SIZE", 104, "Row of size $0 could not be materialized in plan node with "
-    "id $1. Limit is $2, which can be increased with query option max_row_size"),
+    "id $1. Increase the <TBD> query option (currently $2) to process larger rows."),
 
   ("IR_VERIFY_FAILED", 105,
    "Failed to verify generated IR function $0, see log for more details."),
+
+  ("MINIMUM_RESERVATION_UNAVAILABLE", 106, "Failed to get minimum memory reservation of "
+     "$0 on daemon $1:$2 for query $3 because it would exceed an applicable query, "
+     "request pool or process memory limit. Memory usage:\\n$4"),
 )
 
 import sys

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java b/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java
index 2041090..3c33bf1 100644
--- a/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java
+++ b/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java
@@ -17,8 +17,6 @@
 
 package org.apache.impala.common;
 
-import org.apache.impala.service.BackendConfig;
-
 /**
  * Contains runtime-specific parameters such as the number of CPU cores. Currently only
  * used in Plan cost estimation. The static RuntimeEnv members can be set so that tests
@@ -33,9 +31,6 @@ public class RuntimeEnv {
   // PlanNode.computeResourceProfile(). Currently the backend only support a single
   // spillable buffer size, so this is equal to PlanNode.DEFAULT_SPILLABLE_BUFFER_BYTES,
   // except in planner tests.
-  // TODO: IMPALA-3200: this get from query option
-  private long minSpillableBufferBytes_;
-
   // Indicates whether this is an environment for testing.
   private boolean isTestEnv_;
 
@@ -48,15 +43,10 @@ public class RuntimeEnv {
    */
   public void reset() {
     numCores_ = Runtime.getRuntime().availableProcessors();
-    minSpillableBufferBytes_ = BackendConfig.INSTANCE.getReadSize();
   }
 
   public int getNumCores() { return numCores_; }
   public void setNumCores(int numCores) { this.numCores_ = numCores; }
-  public long getMinSpillableBufferBytes() { return minSpillableBufferBytes_; }
-  public void setMinSpillableBufferBytes(long minSpillableBufferBytes) {
-    minSpillableBufferBytes_ = minSpillableBufferBytes;
-  }
   public void setTestEnv(boolean v) { isTestEnv_ = v; }
   public boolean isTestEnv() { return isTestEnv_; }
   public boolean isKuduSupported() {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/AggregationNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/AggregationNode.java b/fe/src/main/java/org/apache/impala/planner/AggregationNode.java
index 004c84e..c938f76 100644
--- a/fe/src/main/java/org/apache/impala/planner/AggregationNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/AggregationNode.java
@@ -30,7 +30,6 @@ import org.apache.impala.analysis.Expr;
 import org.apache.impala.analysis.FunctionCallExpr;
 import org.apache.impala.analysis.SlotId;
 import org.apache.impala.common.InternalException;
-import org.apache.impala.common.RuntimeEnv;
 import org.apache.impala.thrift.TAggregationNode;
 import org.apache.impala.thrift.TExplainLevel;
 import org.apache.impala.thrift.TExpr;
@@ -302,24 +301,24 @@ public class AggregationNode extends PlanNode {
 
     // Must be kept in sync with PartitionedAggregationNode::MinRequiredBuffers() in be.
     long perInstanceMinBuffers;
+    long bufferSize = queryOptions.getDefault_spillable_buffer_size();
     if (aggInfo_.getGroupingExprs().isEmpty() || useStreamingPreagg_) {
       perInstanceMinBuffers = 0;
     } else {
       final int PARTITION_FANOUT = 16;
-      long minBuffers = 2 * PARTITION_FANOUT + 1 + (aggInfo_.needsSerialize() ? 1 : 0);
-      long bufferSize = getDefaultSpillableBufferBytes();
+      long minBuffers = PARTITION_FANOUT + 1 + (aggInfo_.needsSerialize() ? 1 : 0);
       if (perInstanceDataBytes != -1) {
         long bytesPerBuffer = perInstanceDataBytes / PARTITION_FANOUT;
         // Scale down the buffer size if we think there will be excess free space with the
         // default buffer size, e.g. with small dimension tables.
         bufferSize = Math.min(bufferSize, Math.max(
-            RuntimeEnv.INSTANCE.getMinSpillableBufferBytes(),
+            queryOptions.getMin_spillable_buffer_size(),
             BitUtil.roundUpToPowerOf2(bytesPerBuffer)));
       }
       perInstanceMinBuffers = bufferSize * minBuffers;
     }
 
-    nodeResourceProfile_ =
-        new ResourceProfile(perInstanceMemEstimate, perInstanceMinBuffers);
+    nodeResourceProfile_ = ResourceProfile.spillableWithMinReservation(
+        perInstanceMemEstimate, perInstanceMinBuffers, bufferSize);
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java b/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java
index d4bafcf..0322d88 100644
--- a/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java
@@ -248,8 +248,11 @@ public class AnalyticEvalNode extends PlanNode {
     // TODO: come up with estimate based on window
     long perInstanceMemEstimate = 0;
 
+    // Analytic always uses the default spillable buffer size.
+    long bufferSize = queryOptions.getDefault_spillable_buffer_size();
     // Must be kept in sync with MIN_REQUIRED_BUFFERS in AnalyticEvalNode in be.
-    long perInstanceMinBufferBytes = 2 * getDefaultSpillableBufferBytes();
-    nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, perInstanceMinBufferBytes);
+    long perInstanceMinBufferBytes = 2 * bufferSize;
+    nodeResourceProfile_ = ResourceProfile.spillableWithMinReservation(
+        perInstanceMemEstimate, perInstanceMinBufferBytes, bufferSize);
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
index 879d9d8..cea9b53 100644
--- a/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
@@ -332,7 +332,7 @@ public class DataSourceScanNode extends ScanNode {
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     // TODO: What's a good estimate of memory consumption?
-    nodeResourceProfile_ = new ResourceProfile(1024L * 1024L * 1024L, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(1024L * 1024L * 1024L);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java b/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java
index d1369f5..af4f9a6 100644
--- a/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java
@@ -51,7 +51,7 @@ public class DataStreamSink extends DataSink {
 
   @Override
   public void computeResourceProfile(TQueryOptions queryOptions) {
-    resourceProfile_ = new ResourceProfile(0, 0);
+    resourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java b/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java
index 0d0acc9..3fb8bae 100644
--- a/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java
@@ -62,7 +62,7 @@ public class EmptySetNode extends PlanNode {
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     // TODO: add an estimate
-    nodeResourceProfile_ = new ResourceProfile(0, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java b/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java
index 478a054..87d2fd2 100644
--- a/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java
@@ -184,7 +184,7 @@ public class ExchangeNode extends PlanNode {
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     // TODO: add an estimate
-    nodeResourceProfile_ =  new ResourceProfile(0, 0);
+    nodeResourceProfile_ =  ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
index bbecbf1..d56aa98 100644
--- a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
@@ -497,7 +497,7 @@ public class HBaseScanNode extends ScanNode {
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     // TODO: What's a good estimate of memory consumption?
-    nodeResourceProfile_ =  new ResourceProfile(1024L * 1024L * 1024L, 0);
+    nodeResourceProfile_ =  ResourceProfile.noReservation(1024L * 1024L * 1024L);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java b/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java
index 947665e..28939ed 100644
--- a/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java
@@ -44,7 +44,7 @@ public class HBaseTableSink extends TableSink {
 
   @Override
   public void computeResourceProfile(TQueryOptions queryOptions) {
-    resourceProfile_ = new ResourceProfile(0, 0);
+    resourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java b/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
index e828125..5ff17c5 100644
--- a/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
@@ -28,7 +28,6 @@ import org.apache.impala.catalog.Type;
 import org.apache.impala.common.AnalysisException;
 import org.apache.impala.common.ImpalaException;
 import org.apache.impala.common.InternalException;
-import org.apache.impala.common.RuntimeEnv;
 import org.apache.impala.thrift.TEqJoinCondition;
 import org.apache.impala.thrift.TExplainLevel;
 import org.apache.impala.thrift.THashJoinNode;
@@ -223,17 +222,18 @@ public class HashJoinNode extends JoinNode {
     long minBuffers = PARTITION_FANOUT + 1
         + (joinOp_ == JoinOperator.NULL_AWARE_LEFT_ANTI_JOIN ? 3 : 0);
 
-    long bufferSize = getDefaultSpillableBufferBytes();
+    long bufferSize = queryOptions.getDefault_spillable_buffer_size();
     if (perInstanceDataBytes != -1) {
       long bytesPerBuffer = perInstanceDataBytes / PARTITION_FANOUT;
       // Scale down the buffer size if we think there will be excess free space with the
       // default buffer size, e.g. if the right side is a small dimension table.
       bufferSize = Math.min(bufferSize, Math.max(
-          RuntimeEnv.INSTANCE.getMinSpillableBufferBytes(),
+          queryOptions.getMin_spillable_buffer_size(),
           BitUtil.roundUpToPowerOf2(bytesPerBuffer)));
     }
 
     long perInstanceMinBufferBytes = bufferSize * minBuffers;
-    nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, perInstanceMinBufferBytes);
+    nodeResourceProfile_ = ResourceProfile.spillableWithMinReservation(
+        perInstanceMemEstimate, perInstanceMinBufferBytes, bufferSize);
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 0ba5bc6..bf183be 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -1021,7 +1021,7 @@ public class HdfsScanNode extends ScanNode {
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     Preconditions.checkNotNull(scanRanges_, "Cost estimation requires scan ranges.");
     if (scanRanges_.isEmpty()) {
-      nodeResourceProfile_ = new ResourceProfile(0, 0);
+      nodeResourceProfile_ = ResourceProfile.noReservation(0);
       return;
     }
     Preconditions.checkState(0 < numNodes_ && numNodes_ <= scanRanges_.size());
@@ -1075,7 +1075,7 @@ public class HdfsScanNode extends ScanNode {
           PrintUtils.printBytes(perHostUpperBound)));
       perInstanceMemEstimate = perHostUpperBound;
     }
-    nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(perInstanceMemEstimate);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java b/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java
index fed4ffd..46709c0 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java
@@ -99,7 +99,7 @@ public class HdfsTableSink extends TableSink {
           PlanNode.checkedMultiply(numPartitionsPerInstance, perPartitionMemReq);
       perInstanceMemEstimate = Math.min(perInstanceInputBytes, perInstanceMemReq);
     }
-    resourceProfile_ = new ResourceProfile(perInstanceMemEstimate, 0);
+    resourceProfile_ = ResourceProfile.noReservation(perInstanceMemEstimate);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java b/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java
index 69cc133..14acb26 100644
--- a/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java
@@ -87,6 +87,6 @@ public class JoinBuildSink extends DataSink {
   @Override
   public void computeResourceProfile(TQueryOptions queryOptions) {
     // The memory consumption is counted against the join PlanNode.
-    resourceProfile_ = new ResourceProfile(0, 0);
+    resourceProfile_ = ResourceProfile.noReservation(0);
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
index 57403e4..37a4e5c 100644
--- a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
@@ -270,7 +270,7 @@ public class KuduScanNode extends ScanNode {
 
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
-    nodeResourceProfile_ = new ResourceProfile(0, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java b/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java
index b7dcdd8..f75b170 100644
--- a/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java
@@ -60,7 +60,7 @@ public class KuduTableSink extends TableSink {
   @Override
   public void computeResourceProfile(TQueryOptions queryOptions) {
     // TODO: add a memory estimate
-    resourceProfile_ = new ResourceProfile(0, 0);
+    resourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java b/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java
index 0ec8e4f..16a3caf 100644
--- a/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java
@@ -20,9 +20,6 @@ package org.apache.impala.planner;
 import java.util.Collections;
 import java.util.List;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import org.apache.impala.analysis.Analyzer;
 import org.apache.impala.analysis.BinaryPredicate;
 import org.apache.impala.analysis.Expr;
@@ -86,7 +83,7 @@ public class NestedLoopJoinNode extends JoinNode {
       perInstanceMemEstimate =
           (long) Math.ceil(getChild(1).cardinality_ * getChild(1).avgRowSize_);
     }
-    nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(perInstanceMemEstimate);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/PlanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/PlanNode.java b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
index 9723c4a..2557f98 100644
--- a/fe/src/main/java/org/apache/impala/planner/PlanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
@@ -33,7 +33,7 @@ import org.apache.impala.common.ImpalaException;
 import org.apache.impala.common.PrintUtils;
 import org.apache.impala.common.TreeNode;
 import org.apache.impala.planner.RuntimeFilterGenerator.RuntimeFilter;
-import org.apache.impala.service.BackendConfig;
+import org.apache.impala.thrift.TBackendResourceProfile;
 import org.apache.impala.thrift.TExecStats;
 import org.apache.impala.thrift.TExplainLevel;
 import org.apache.impala.thrift.TPlan;
@@ -408,6 +408,8 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
       msg.addToRuntime_filters(filter.toThrift());
     }
     msg.setDisable_codegen(disableCodegen_);
+    Preconditions.checkState(nodeResourceProfile_.isValid());
+    msg.resource_profile = nodeResourceProfile_.toThrift();
     toThrift(msg);
     container.addToNodes(msg);
     // For the purpose of the BE consider ExchangeNodes to have no children.
@@ -677,16 +679,6 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
   }
 
   /**
-   * The default size of buffer used in spilling nodes. Used in
-   * computeNodeResourceProfile().
-   */
-  protected final static long getDefaultSpillableBufferBytes() {
-    // BufferedBlockMgr uses --read_size to determine buffer size.
-    // TODO: IMPALA-3200: get from query option
-    return BackendConfig.INSTANCE.getReadSize();
-  }
-
-  /**
    * The input cardinality is the sum of output cardinalities of its children.
    * For scan nodes the input cardinality is the expected number of rows scanned.
    */

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java b/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java
index fba9149..07eb58b 100644
--- a/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java
@@ -37,7 +37,7 @@ public class PlanRootSink extends DataSink {
   @Override
   public void computeResourceProfile(TQueryOptions queryOptions) {
     // TODO: add a memory estimate
-    resourceProfile_ = new ResourceProfile(0, 0);
+    resourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   protected TDataSink toThrift() {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/Planner.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/Planner.java b/fe/src/main/java/org/apache/impala/planner/Planner.java
index 4cfd57e..ed6e8df 100644
--- a/fe/src/main/java/org/apache/impala/planner/Planner.java
+++ b/fe/src/main/java/org/apache/impala/planner/Planner.java
@@ -63,7 +63,7 @@ public class Planner {
   public static final long MIN_PER_HOST_MEM_ESTIMATE_BYTES = 10 * 1024 * 1024;
 
   public static final ResourceProfile MIN_PER_HOST_RESOURCES =
-      new ResourceProfile(MIN_PER_HOST_MEM_ESTIMATE_BYTES, 0);
+      ResourceProfile.withMinReservation(MIN_PER_HOST_MEM_ESTIMATE_BYTES, 0);
 
   private final PlannerContext ctx_;
 
@@ -262,9 +262,9 @@ public class Planner {
       TQueryExecRequest request, TExplainLevel explainLevel) {
     StringBuilder str = new StringBuilder();
     boolean hasHeader = false;
-    if (request.isSetPer_host_min_reservation()) {
+    if (request.query_ctx.isSetPer_host_min_reservation()) {
       str.append(String.format("Per-Host Resource Reservation: Memory=%s\n",
-              PrintUtils.printBytes(request.getPer_host_min_reservation()))) ;
+          PrintUtils.printBytes(request.query_ctx.getPer_host_min_reservation())));
       hasHeader = true;
     }
     if (request.isSetPer_host_mem_estimate()) {
@@ -344,7 +344,7 @@ public class Planner {
    * per-host resource values in 'request'.
    */
   public void computeResourceReqs(List<PlanFragment> planRoots,
-      TQueryExecRequest request) {
+      TQueryCtx queryCtx, TQueryExecRequest request) {
     Preconditions.checkState(!planRoots.isEmpty());
     Preconditions.checkNotNull(request);
     TQueryOptions queryOptions = ctx_.getRootAnalyzer().getQueryOptions();
@@ -389,8 +389,8 @@ public class Planner {
     perHostPeakResources = MIN_PER_HOST_RESOURCES.max(perHostPeakResources);
 
     request.setPer_host_mem_estimate(perHostPeakResources.getMemEstimateBytes());
-    request.setPer_host_min_reservation(perHostPeakResources.getMinReservationBytes());
-    request.setPer_host_initial_reservation_total_claims(perHostInitialReservationTotal);
+    queryCtx.setPer_host_min_reservation(perHostPeakResources.getMinReservationBytes());
+    queryCtx.setPer_host_initial_reservation_total_claims(perHostInitialReservationTotal);
     if (LOG.isTraceEnabled()) {
       LOG.trace("Per-host min buffer : " + perHostPeakResources.getMinReservationBytes());
       LOG.trace(

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java b/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java
index 18cde7e..3c13812 100644
--- a/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java
+++ b/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java
@@ -18,6 +18,7 @@
 package org.apache.impala.planner;
 
 import org.apache.impala.common.PrintUtils;
+import org.apache.impala.thrift.TBackendResourceProfile;
 import org.apache.impala.util.MathUtil;
 
 /**
@@ -35,25 +36,56 @@ public class ResourceProfile {
   private final long memEstimateBytes_;
 
   // Minimum buffer reservation required to execute in bytes.
+  // The valid range is [0, maxReservationBytes_].
   private final long minReservationBytes_;
 
-  private ResourceProfile(boolean isValid, long memEstimateBytes, long minReservationBytes) {
+  // Maximum buffer reservation allowed for this plan node.
+  // The valid range is [minReservationBytes_, Long.MAX_VALUE].
+  private final long maxReservationBytes_;
+
+  // The spillable buffer size to use in a plan node. Only valid for resource profiles
+  // for spilling PlanNodes. Operations like sum(), max(), etc., produce profiles without
+  // valid spillableBufferBytes_ values. -1 means invalid.
+  private final long spillableBufferBytes_;
+
+  private ResourceProfile(boolean isValid, long memEstimateBytes,
+      long minReservationBytes, long maxReservationBytes, long spillableBufferBytes) {
     isValid_ = isValid;
     memEstimateBytes_ = memEstimateBytes;
     minReservationBytes_ = minReservationBytes;
+    maxReservationBytes_ = maxReservationBytes;
+    spillableBufferBytes_ = spillableBufferBytes;
+  }
+
+  // Create a resource profile with zero min or max reservation.
+  public static ResourceProfile noReservation(long memEstimateBytes) {
+    return new ResourceProfile(true, memEstimateBytes, 0, 0, -1);
+  }
+
+  // Create a resource profile with a minimum reservation (but no maximum).
+  public static ResourceProfile withMinReservation(long memEstimateBytes,
+      long minReservationBytes) {
+    return new ResourceProfile(
+        true, memEstimateBytes, minReservationBytes, Long.MAX_VALUE, -1);
   }
 
-  public ResourceProfile(long memEstimateBytes, long minReservationBytes) {
-    this(true, memEstimateBytes, minReservationBytes);
+  // Create a resource profile with a minimum reservation (but no maximum) and a
+  // spillable buffer size.
+  public static ResourceProfile spillableWithMinReservation(long memEstimateBytes,
+      long minReservationBytes, long spillableBufferBytes) {
+    return new ResourceProfile(true, memEstimateBytes, minReservationBytes,
+        Long.MAX_VALUE, spillableBufferBytes);
   }
 
   public static ResourceProfile invalid() {
-    return new ResourceProfile(false, -1, -1);
+    return new ResourceProfile(false, -1, -1, -1, -1);
   }
 
   public boolean isValid() { return isValid_; }
   public long getMemEstimateBytes() { return memEstimateBytes_; }
   public long getMinReservationBytes() { return minReservationBytes_; }
+  public long getMaxReservationBytes() { return maxReservationBytes_; }
+  public long getSpillableBufferBytes() { return spillableBufferBytes_; }
 
   // Return a string with the resource profile information suitable for display in an
   // explain plan in a format like: "resource1=value resource2=value"
@@ -63,6 +95,12 @@ public class ResourceProfile {
     output.append(isValid_ ? PrintUtils.printBytes(memEstimateBytes_) : "invalid");
     output.append(" mem-reservation=");
     output.append(isValid_ ? PrintUtils.printBytes(minReservationBytes_) : "invalid");
+    // TODO: output maxReservation_ here if the planner becomes more sophisticated in
+    // choosing it (beyond 0/unlimited).
+    if (isValid_ && spillableBufferBytes_ != -1) {
+      output.append(" spill-buffer=");
+      output.append(PrintUtils.printBytes(spillableBufferBytes_));
+    }
     return output.toString();
   }
 
@@ -70,25 +108,39 @@ public class ResourceProfile {
   public ResourceProfile max(ResourceProfile other) {
     if (!isValid()) return other;
     if (!other.isValid()) return this;
-    return new ResourceProfile(
+    return new ResourceProfile(true,
         Math.max(getMemEstimateBytes(), other.getMemEstimateBytes()),
-        Math.max(getMinReservationBytes(), other.getMinReservationBytes()));
+        Math.max(getMinReservationBytes(), other.getMinReservationBytes()),
+        Math.max(getMaxReservationBytes(), other.getMaxReservationBytes()), -1);
   }
 
   // Returns a profile with the sum of each value in 'this' and 'other'.
   public ResourceProfile sum(ResourceProfile other) {
     if (!isValid()) return other;
     if (!other.isValid()) return this;
-    return new ResourceProfile(
+    return new ResourceProfile(true,
         MathUtil.saturatingAdd(getMemEstimateBytes(), other.getMemEstimateBytes()),
-        MathUtil.saturatingAdd(getMinReservationBytes(), other.getMinReservationBytes()));
+        MathUtil.saturatingAdd(getMinReservationBytes(),other.getMinReservationBytes()),
+        MathUtil.saturatingAdd(getMaxReservationBytes(), other.getMaxReservationBytes()),
+        -1);
   }
 
   // Returns a profile with all values multiplied by 'factor'.
   public ResourceProfile multiply(int factor) {
     if (!isValid()) return this;
-    return new ResourceProfile(
+    return new ResourceProfile(true,
         MathUtil.saturatingMultiply(memEstimateBytes_, factor),
-        MathUtil.saturatingMultiply(minReservationBytes_, factor));
+        MathUtil.saturatingMultiply(minReservationBytes_, factor),
+        MathUtil.saturatingMultiply(maxReservationBytes_, factor), -1);
+  }
+
+  public TBackendResourceProfile toThrift() {
+    TBackendResourceProfile result = new TBackendResourceProfile();
+    result.setMin_reservation(minReservationBytes_);
+    result.setMax_reservation(maxReservationBytes_);
+    if (spillableBufferBytes_ != -1) {
+      result.setSpillable_buffer_size(spillableBufferBytes_);
+    }
+    return result;
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/SelectNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/SelectNode.java b/fe/src/main/java/org/apache/impala/planner/SelectNode.java
index 97dfa5b..3ffc975 100644
--- a/fe/src/main/java/org/apache/impala/planner/SelectNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/SelectNode.java
@@ -84,7 +84,7 @@ public class SelectNode extends PlanNode {
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     // TODO: add an estimate
-    nodeResourceProfile_ = new ResourceProfile(0, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java b/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
index bed1c9a..bdf3a01 100644
--- a/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
@@ -68,7 +68,7 @@ public class SingularRowSrcNode extends PlanNode {
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     // TODO: add an estimate
-    nodeResourceProfile_ = new ResourceProfile(0, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/SortNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/SortNode.java b/fe/src/main/java/org/apache/impala/planner/SortNode.java
index aee8fda..75e8034 100644
--- a/fe/src/main/java/org/apache/impala/planner/SortNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/SortNode.java
@@ -29,7 +29,6 @@ import org.apache.impala.analysis.SlotDescriptor;
 import org.apache.impala.analysis.SlotRef;
 import org.apache.impala.analysis.SortInfo;
 import org.apache.impala.common.InternalException;
-import org.apache.impala.service.BackendConfig;
 import org.apache.impala.thrift.TExplainLevel;
 import org.apache.impala.thrift.TPlanNode;
 import org.apache.impala.thrift.TPlanNodeType;
@@ -255,7 +254,7 @@ public class SortNode extends PlanNode {
     if (type_ == TSortType.TOPN) {
       long perInstanceMemEstimate =
               (long) Math.ceil((cardinality_ + offset_) * avgRowSize_);
-      nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, 0);
+      nodeResourceProfile_ = ResourceProfile.noReservation(perInstanceMemEstimate);
       return;
     }
 
@@ -265,44 +264,40 @@ public class SortNode extends PlanNode {
     // size sqrt(N) blocks, and we could merge sqrt(N) such runs with sqrt(N) blocks
     // of memory.
     double fullInputSize = getChild(0).cardinality_ * avgRowSize_;
-    boolean hasVarLenSlots = false;
+    boolean usesVarLenBlocks = false;
     for (SlotDescriptor slotDesc: info_.getSortTupleDescriptor().getSlots()) {
       if (slotDesc.isMaterialized() && !slotDesc.getType().isFixedLengthType()) {
-        hasVarLenSlots = true;
+        usesVarLenBlocks = true;
         break;
       }
     }
 
-    // The block size used by the sorter is the same as the configured I/O read size.
-    long blockSize = BackendConfig.INSTANCE.getReadSize();
-    // The external sorter writes fixed-len and var-len data in separate sequences of
-    // blocks on disk and reads from both sequences when merging. This effectively
-    // doubles the block size when there are var-len columns present.
-    if (hasVarLenSlots) blockSize *= 2;
+    // Sort always uses the default spillable buffer size.
+    long bufferSize = queryOptions.getDefault_spillable_buffer_size();
 
+    // The external sorter writes fixed-len and var-len data in separate sequences of
+    // pages on disk and reads from both sequences when merging. This effectively
+    // doubles the number of pages required when there are var-len columns present.
+    // Must be kept in sync with ComputeMinReservation() in Sorter in be.
+    int pageMultiplier = usesVarLenBlocks ? 2 : 1;
+    long perInstanceMemEstimate;
+    long perInstanceMinReservation;
     if (type_ == TSortType.PARTIAL) {
       // The memory limit cannot be less than the size of the required blocks.
-      long mem_limit =
-          PARTIAL_SORT_MEM_LIMIT > blockSize ? PARTIAL_SORT_MEM_LIMIT : blockSize;
+      long mem_limit = Math.max(PARTIAL_SORT_MEM_LIMIT, bufferSize * pageMultiplier);
       // 'fullInputSize' will be negative if stats are missing, just use the limit.
-      long perInstanceMemEstimate = fullInputSize < 0 ?
+      perInstanceMemEstimate = fullInputSize < 0 ?
           mem_limit :
           Math.min((long) Math.ceil(fullInputSize), mem_limit);
-      nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, blockSize);
+      perInstanceMinReservation = bufferSize * pageMultiplier;
     } else {
-      Preconditions.checkState(type_ == TSortType.TOTAL);
-      double numInputBlocks = Math.ceil(fullInputSize / blockSize);
-      long perInstanceMemEstimate =
-          blockSize * (long) Math.ceil(Math.sqrt(numInputBlocks));
-
-      // Must be kept in sync with min_buffers_required in Sorter in be.
-      long perInstanceMinReservation = 3 * getDefaultSpillableBufferBytes();
-      if (info_.getSortTupleDescriptor().hasVarLenSlots()) {
-        perInstanceMinReservation *= 2;
-      }
-      nodeResourceProfile_ =
-          new ResourceProfile(perInstanceMemEstimate, perInstanceMinReservation);
+      double numInputBlocks = Math.ceil(fullInputSize / (bufferSize * pageMultiplier));
+      perInstanceMemEstimate =
+          bufferSize * (long) Math.ceil(Math.sqrt(numInputBlocks));
+      perInstanceMinReservation = 3 * bufferSize * pageMultiplier;
     }
+    nodeResourceProfile_ = ResourceProfile.spillableWithMinReservation(
+        perInstanceMemEstimate, perInstanceMinReservation, bufferSize);
   }
 
   private static String getDisplayName(TSortType type) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/SubplanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/SubplanNode.java b/fe/src/main/java/org/apache/impala/planner/SubplanNode.java
index c09efe5..e41290e 100644
--- a/fe/src/main/java/org/apache/impala/planner/SubplanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/SubplanNode.java
@@ -95,7 +95,7 @@ public class SubplanNode extends PlanNode {
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     // TODO: add an estimate
-    nodeResourceProfile_ = new ResourceProfile(0, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/UnionNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/UnionNode.java b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
index 44e2967..302f62d 100644
--- a/fe/src/main/java/org/apache/impala/planner/UnionNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
@@ -131,7 +131,7 @@ public class UnionNode extends PlanNode {
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     // TODO: add an estimate
-    nodeResourceProfile_ = new ResourceProfile(0, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/UnnestNode.java b/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
index 695ec24..7e0a87e 100644
--- a/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
@@ -76,7 +76,7 @@ public class UnnestNode extends PlanNode {
   @Override
   public void computeNodeResourceProfile(TQueryOptions queryOptions) {
     // TODO: add an estimate
-    nodeResourceProfile_ = new ResourceProfile(0, 0);
+    nodeResourceProfile_ = ResourceProfile.noReservation(0);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/service/Frontend.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/Frontend.java b/fe/src/main/java/org/apache/impala/service/Frontend.java
index 2c71a9b..60e84b4 100644
--- a/fe/src/main/java/org/apache/impala/service/Frontend.java
+++ b/fe/src/main/java/org/apache/impala/service/Frontend.java
@@ -1005,7 +1005,7 @@ public class Frontend {
     }
 
     // Compute resource requirements of the final plans.
-    planner.computeResourceReqs(planRoots, result);
+    planner.computeResourceReqs(planRoots, queryCtx, result);
 
     // create per-plan exec info;
     // also assemble list of names of tables with missing or corrupt stats for

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
index 8289ee8..b0f1e2e 100644
--- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
@@ -425,8 +425,6 @@ public class PlannerTest extends PlannerTestBase {
     TQueryOptions options = defaultQueryOptions();
     options.setExplain_level(TExplainLevel.EXTENDED);
     options.setNum_scanner_threads(1); // Required so that output doesn't vary by machine
-    // TODO: IMPALA-3200 - this should become a query option.
-    RuntimeEnv.INSTANCE.setMinSpillableBufferBytes(64 * 1024);
     runPlannerTestFile("spillable-buffer-sizing", options, false);
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
index ed4f684..f4ae6c3 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
@@ -102,7 +102,7 @@ having 1024 * 1024 * count(*) % 2 = 0
   and (sm between 5 and 10)
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=138.00MB mem-reservation=264.00MB
+|  Per-Host Resources: mem-estimate=138.00MB mem-reservation=1.06MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -110,7 +110,7 @@ PLAN-ROOT SINK
 |  output: sum(2 + id), count(*)
 |  group by: timestamp_col = TIMESTAMP '2016-11-15 00:00:00'
 |  having: sum(2 + id) <= 10, sum(2 + id) > 1, sum(2 + id) >= 5, 1048576 * count(*) % 2 = 0
-|  mem-estimate=10.00MB mem-reservation=264.00MB
+|  mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=1 row-size=17B cardinality=0
 |
 00:SCAN HDFS [functional.alltypes]
@@ -129,7 +129,7 @@ left outer join functional.alltypes b
 where round(1.11 + 2.22 + 3.33 + 4.44, 1) < cast(b.double_col as decimal(3, 2))
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=256.02MB mem-reservation=136.00MB
+|  Per-Host Resources: mem-estimate=256.02MB mem-reservation=1.06MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -138,7 +138,7 @@ PLAN-ROOT SINK
 |  fk/pk conjuncts: assumed fk/pk
 |  other join predicates: a.int_col <= b.bigint_col + 97, a.int_col >= 0 + b.bigint_col
 |  other predicates: CAST(b.double_col AS DECIMAL(3,2)) > 11.1
-|  mem-estimate=15.68KB mem-reservation=136.00MB
+|  mem-estimate=15.68KB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=0,1N row-size=28B cardinality=7300
 |
 |--01:SCAN HDFS [functional.alltypes b]
@@ -203,7 +203,7 @@ group by timestamp_col = cast('2015-11-15' as timestamp) + interval 1 year
 having 1024 * 1024 * count(*) % 2 = 0
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=138.00MB mem-reservation=528.00MB
+|  Per-Host Resources: mem-estimate=138.00MB mem-reservation=2.12MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -211,13 +211,13 @@ PLAN-ROOT SINK
 |  output: sum(2 + id), count:merge(*)
 |  group by: timestamp_col = TIMESTAMP '2016-11-15 00:00:00'
 |  having: 1048576 * count(*) % 2 = 0
-|  mem-estimate=10.00MB mem-reservation=264.00MB
+|  mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=2 row-size=17B cardinality=0
 |
 01:AGGREGATE
 |  output: count(*)
 |  group by: timestamp_col = TIMESTAMP '2016-11-15 00:00:00', 2 + id
-|  mem-estimate=10.00MB mem-reservation=264.00MB
+|  mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=1 row-size=17B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -234,20 +234,20 @@ from functional.alltypes
 having 1024 * 1024 * count(*) % 2 = 0
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=138.00MB mem-reservation=264.00MB
+|  Per-Host Resources: mem-estimate=138.00MB mem-reservation=1.06MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 02:AGGREGATE [FINALIZE]
 |  output: sum(2 + id), count:merge(*)
 |  having: 1048576 * zeroifnull(count(*)) % 2 = 0
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=2 row-size=16B cardinality=0
 |
 01:AGGREGATE
 |  output: count(*)
 |  group by: 2 + id
-|  mem-estimate=10.00MB mem-reservation=264.00MB
+|  mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
 |  tuple-ids=1 row-size=16B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -265,7 +265,7 @@ select first_value(1 + 1 + int_col - (1 - 1)) over
 from functional.alltypes
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=144.00MB mem-reservation=64.00MB
+|  Per-Host Resources: mem-estimate=130.00MB mem-reservation=16.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
@@ -274,13 +274,13 @@ PLAN-ROOT SINK
 |  partition by: concat('ab', string_col)
 |  order by: greatest(20, bigint_col) ASC
 |  window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-|  mem-estimate=0B mem-reservation=16.00MB
+|  mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
 |  tuple-ids=3,2 row-size=61B cardinality=7300
 |
 01:SORT
 |  order by: concat('ab', string_col) ASC NULLS FIRST, greatest(20, bigint_col) ASC
 |  materialized: concat('ab', string_col), greatest(20, bigint_col)
-|  mem-estimate=16.00MB mem-reservation=48.00MB
+|  mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
 |  tuple-ids=3 row-size=53B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -296,13 +296,13 @@ select int_col from functional.alltypes
 order by id * abs((factorial(5) / power(2, 4)))
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=136.00MB mem-reservation=24.00MB
+|  Per-Host Resources: mem-estimate=130.00MB mem-reservation=6.00MB
 PLAN-ROOT SINK
 |  mem-estimate=0B mem-reservation=0B
 |
 01:SORT
 |  order by: id * 7.5 ASC
-|  mem-estimate=8.00MB mem-reservation=24.00MB
+|  mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
 |  tuple-ids=1 row-size=8B cardinality=7300
 |
 00:SCAN HDFS [functional.alltypes]
@@ -347,7 +347,7 @@ PLAN-ROOT SINK
 |
 01:AGGREGATE [FINALIZE]
 |  output: sum(id + 10 + 20 + 30)
-|  mem-estimate=10.00MB mem-reservation=0B
+|  mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
 |  tuple-ids=4 row-size=8B cardinality=1
 |
 00:SCAN HDFS [functional.alltypes]

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test b/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test
index e64691a..d367424 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test
@@ -78,7 +78,7 @@ select count(*)
 from functional.alltypes t1
 join functional.alltypestiny t2 on t1.id = t2.id
 ---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=1.06MB
 Per-Host Resource Estimates: Memory=180.00MB
 Codegen disabled by planner
 


[07/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-block-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-block-mgr.cc b/be/src/runtime/buffered-block-mgr.cc
deleted file mode 100644
index e4737c2..0000000
--- a/be/src/runtime/buffered-block-mgr.cc
+++ /dev/null
@@ -1,1254 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/mem-pool.h"
-#include "runtime/mem-tracker.h"
-#include "runtime/runtime-state.h"
-#include "runtime/tmp-file-mgr.h"
-#include "util/debug-util.h"
-#include "util/disk-info.h"
-#include "util/filesystem-util.h"
-#include "util/impalad-metrics.h"
-#include "util/runtime-profile-counters.h"
-#include "util/uid-util.h"
-
-#include <gutil/strings/substitute.h>
-
-#include "common/names.h"
-
-using namespace strings; // for Substitute
-
-namespace impala {
-
-BufferedBlockMgr::BlockMgrsMap BufferedBlockMgr::query_to_block_mgrs_;
-SpinLock BufferedBlockMgr::static_block_mgrs_lock_;
-
-
-struct BufferedBlockMgr::Client {
-  Client(const string& debug_info, BufferedBlockMgr* mgr, int num_reserved_buffers,
-      bool tolerates_oversubscription, MemTracker* tracker,
-         RuntimeState* state)
-      : debug_info_(debug_info),
-        mgr_(mgr),
-        state_(state),
-        tracker_(tracker),
-        query_tracker_(mgr_->mem_tracker_->parent()),
-        num_reserved_buffers_(num_reserved_buffers),
-        tolerates_oversubscription_(tolerates_oversubscription),
-        num_tmp_reserved_buffers_(0),
-        num_pinned_buffers_(0),
-        logged_large_allocation_warning_(false) {
-    DCHECK(tracker != NULL);
-  }
-
-  /// A string that will be printed to identify the client, e.g. which exec node it
-  /// belongs to.
-  string debug_info_;
-
-  /// Unowned.
-  BufferedBlockMgr* mgr_;
-
-  /// Unowned.
-  RuntimeState* state_;
-
-  /// Tracker for this client. Unowned.
-  /// When the client gets a buffer, we update the consumption on this tracker. However,
-  /// we don't want to transfer the buffer from the block mgr to the client (i.e. release
-  /// from the block mgr), since the block mgr is where the block mem usage limit is
-  /// enforced. Even when we give a buffer to a client, the buffer is still owned and
-  /// counts against the block mgr tracker (i.e. there is a fixed pool of buffers
-  /// regardless of if they are in the block mgr or the clients).
-  MemTracker* tracker_;
-
-  /// This is the common ancestor between the block mgr tracker and the client tracker.
-  /// When memory is transferred to the client, we want it to stop at this tracker.
-  MemTracker* query_tracker_;
-
-  /// Number of buffers reserved by this client.
-  int num_reserved_buffers_;
-
-  /// If false, return MEM_LIMIT_EXCEEDED when a reserved buffer cannot be allocated.
-  /// If true, return Status::OK() as with a non-reserved buffer.
-  bool tolerates_oversubscription_;
-
-  /// Number of buffers temporarily reserved.
-  int num_tmp_reserved_buffers_;
-
-  /// Number of buffers pinned by this client.
-  int num_pinned_buffers_;
-
-  /// Whether a warning about a large allocation has been made for this client. Used
-  /// to avoid producing excessive log messages.
-  bool logged_large_allocation_warning_;
-
-  void PinBuffer(BufferDescriptor* buffer) {
-    DCHECK(buffer != NULL);
-    if (buffer->len == mgr_->max_block_size()) {
-      ++num_pinned_buffers_;
-      tracker_->ConsumeLocal(buffer->len, query_tracker_);
-    }
-  }
-
-  void UnpinBuffer(BufferDescriptor* buffer) {
-    DCHECK(buffer != NULL);
-    if (buffer->len == mgr_->max_block_size()) {
-      DCHECK_GT(num_pinned_buffers_, 0);
-      --num_pinned_buffers_;
-      tracker_->ReleaseLocal(buffer->len, query_tracker_);
-    }
-  }
-
-  string DebugString() const {
-    stringstream ss;
-    ss << "Client " << this << endl
-       << " " << debug_info_ << endl
-       << "  num_reserved_buffers=" << num_reserved_buffers_ << endl
-       << "  num_tmp_reserved_buffers=" << num_tmp_reserved_buffers_ << endl
-       << "  num_pinned_buffers=" << num_pinned_buffers_;
-    return ss.str();
-  }
-};
-
-// BufferedBlockMgr::Block methods.
-BufferedBlockMgr::Block::Block(BufferedBlockMgr* block_mgr)
-  : buffer_desc_(NULL),
-    block_mgr_(block_mgr),
-    client_(NULL),
-    valid_data_len_(0),
-    num_rows_(0) {}
-
-Status BufferedBlockMgr::Block::Pin(bool* pinned, Block* release_block, bool unpin) {
-  return block_mgr_->PinBlock(this, pinned, release_block, unpin);
-}
-
-Status BufferedBlockMgr::Block::Unpin() {
-  return block_mgr_->UnpinBlock(this);
-}
-
-void BufferedBlockMgr::Block::Delete() {
-  block_mgr_->DeleteBlock(this);
-}
-
-void BufferedBlockMgr::Block::Init() {
-  // No locks are taken because the block is new or has previously been deleted.
-  is_pinned_ = false;
-  in_write_ = false;
-  is_deleted_ = false;
-  valid_data_len_ = 0;
-  client_ = NULL;
-  num_rows_ = 0;
-}
-
-bool BufferedBlockMgr::Block::Validate() const {
-  if (is_deleted_ && (is_pinned_ || (!in_write_ && buffer_desc_ != NULL))) {
-    LOG(ERROR) << "Deleted block in use - " << DebugString();
-    return false;
-  }
-
-  if (buffer_desc_ == NULL && (is_pinned_ || in_write_)) {
-    LOG(ERROR) << "Block without buffer in use - " << DebugString();
-    return false;
-  }
-
-  if (buffer_desc_ == NULL && block_mgr_->unpinned_blocks_.Contains(this)) {
-    LOG(ERROR) << "Unpersisted block without buffer - " << DebugString();
-    return false;
-  }
-
-  if (buffer_desc_ != NULL && (buffer_desc_->block != this)) {
-    LOG(ERROR) << "Block buffer inconsistency - " << DebugString();
-    return false;
-  }
-
-  return true;
-}
-
-string BufferedBlockMgr::Block::TmpFilePath() const {
-  if (write_handle_ == NULL) return "";
-  return write_handle_->TmpFilePath();
-}
-
-string BufferedBlockMgr::Block::DebugString() const {
-  stringstream ss;
-  ss << "Block: " << this << endl
-     << "  Buffer Desc: " << buffer_desc_ << endl
-     << "  Data Len: " << valid_data_len_ << endl
-     << "  Num Rows: " << num_rows_ << endl;
-  if (is_pinned_) ss << "  Buffer Len: " << buffer_len() << endl;
-  ss << "  Deleted: " << is_deleted_ << endl
-     << "  Pinned: " << is_pinned_ << endl
-     << "  Write Issued: " << in_write_ << endl
-     << "  Client Local: " << client_local_ << endl;
-  if (write_handle_ != NULL) {
-    ss << "  Write handle: " << write_handle_->DebugString() << endl;
-  }
-  if (client_ != NULL) ss << "  Client: " << client_->DebugString();
-  return ss.str();
-}
-
-BufferedBlockMgr::BufferedBlockMgr(RuntimeState* state, TmpFileMgr* tmp_file_mgr,
-    int64_t block_size, int64_t scratch_limit)
-  : max_block_size_(block_size),
-    // Keep two writes in flight per scratch disk so the disks can stay busy.
-    block_write_threshold_(tmp_file_mgr->NumActiveTmpDevices() * 2),
-    disable_spill_(state->query_ctx().disable_spilling || block_write_threshold_ == 0
-        || scratch_limit == 0),
-    query_id_(state->query_id()),
-    initialized_(false),
-    unfullfilled_reserved_buffers_(0),
-    total_pinned_buffers_(0),
-    non_local_outstanding_writes_(0),
-    tmp_file_group_(NULL),
-    is_cancelled_(false),
-    writes_issued_(0),
-    debug_write_delay_ms_(0) {}
-
-Status BufferedBlockMgr::Create(RuntimeState* state, MemTracker* parent,
-    RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr, int64_t mem_limit,
-    int64_t block_size, shared_ptr<BufferedBlockMgr>* block_mgr) {
-  DCHECK(parent != NULL);
-  int64_t scratch_limit = state->query_options().scratch_limit;
-  block_mgr->reset();
-  {
-    lock_guard<SpinLock> lock(static_block_mgrs_lock_);
-    BlockMgrsMap::iterator it = query_to_block_mgrs_.find(state->query_id());
-    if (it != query_to_block_mgrs_.end()) *block_mgr = it->second.lock();
-    if (*block_mgr == NULL) {
-      // weak_ptr::lock returns NULL if the weak_ptr is expired. This means
-      // all shared_ptr references have gone to 0 and it is in the process of
-      // being deleted. This can happen if the last shared reference is released
-      // but before the weak ptr is removed from the map.
-      block_mgr->reset(
-          new BufferedBlockMgr(state, tmp_file_mgr, block_size, scratch_limit));
-      query_to_block_mgrs_[state->query_id()] = *block_mgr;
-    }
-  }
-  (*block_mgr)
-      ->Init(state->io_mgr(), tmp_file_mgr, profile, parent, mem_limit, scratch_limit);
-  return Status::OK();
-}
-
-int64_t BufferedBlockMgr::available_buffers(Client* client) const {
-  int64_t unused_reserved = client->num_reserved_buffers_ +
-      client->num_tmp_reserved_buffers_ - client->num_pinned_buffers_;
-  return max<int64_t>(0, remaining_unreserved_buffers()) +
-      max<int64_t>(0, unused_reserved);
-}
-
-int64_t BufferedBlockMgr::remaining_unreserved_buffers() const {
-  int64_t num_buffers = free_io_buffers_.size() +
-      unpinned_blocks_.size() + non_local_outstanding_writes_;
-  num_buffers += mem_tracker_->SpareCapacity() / max_block_size();
-  num_buffers -= unfullfilled_reserved_buffers_;
-  return num_buffers;
-}
-
-Status BufferedBlockMgr::RegisterClient(const string& debug_info,
-    int num_reserved_buffers, bool tolerates_oversubscription, MemTracker* tracker,
-    RuntimeState* state, Client** client) {
-  DCHECK_GE(num_reserved_buffers, 0);
-  Client* aClient = new Client(debug_info, this, num_reserved_buffers,
-      tolerates_oversubscription, tracker, state);
-  lock_guard<mutex> lock(lock_);
-  *client = obj_pool_.Add(aClient);
-  unfullfilled_reserved_buffers_ += num_reserved_buffers;
-  return Status::OK();
-}
-
-void BufferedBlockMgr::ClearReservations(Client* client) {
-  lock_guard<mutex> lock(lock_);
-  if (client->num_pinned_buffers_ < client->num_reserved_buffers_) {
-    unfullfilled_reserved_buffers_ -=
-        client->num_reserved_buffers_ - client->num_pinned_buffers_;
-  }
-  client->num_reserved_buffers_ = 0;
-
-  unfullfilled_reserved_buffers_ -= client->num_tmp_reserved_buffers_;
-  client->num_tmp_reserved_buffers_ = 0;
-}
-
-bool BufferedBlockMgr::TryAcquireTmpReservation(Client* client, int num_buffers) {
-  lock_guard<mutex> lock(lock_);
-  DCHECK_EQ(client->num_tmp_reserved_buffers_, 0);
-  if (client->num_pinned_buffers_ < client->num_reserved_buffers_) {
-    // If client has unused reserved buffers, we use those first.
-    num_buffers -= client->num_reserved_buffers_ - client->num_pinned_buffers_;
-  }
-  if (num_buffers < 0) return true;
-  if (available_buffers(client) < num_buffers) return false;
-
-  client->num_tmp_reserved_buffers_ = num_buffers;
-  unfullfilled_reserved_buffers_ += num_buffers;
-  return true;
-}
-
-bool BufferedBlockMgr::ConsumeMemory(Client* client, int64_t size) {
-  int64_t buffers_needed = BitUtil::Ceil(size, max_block_size());
-  if (UNLIKELY(!BitUtil::IsNonNegative32Bit(buffers_needed))) {
-    VLOG_QUERY << "Trying to consume " << size << " which is out of range.";
-    return false;
-  }
-  DCHECK_GT(buffers_needed, 0) << "Trying to consume 0 memory";
-
-  unique_lock<mutex> lock(lock_);
-  if (size < max_block_size() && mem_tracker_->TryConsume(size)) {
-    // For small allocations (less than a block size), just let the allocation through.
-    client->tracker_->ConsumeLocal(size, client->query_tracker_);
-    return true;
-  }
-
-  if (max<int64_t>(0, remaining_unreserved_buffers()) +
-      client->num_tmp_reserved_buffers_ < buffers_needed) {
-    return false;
-  }
-
-  if (mem_tracker_->TryConsume(size)) {
-    // There was still unallocated memory, don't need to recycle allocated blocks.
-    client->tracker_->ConsumeLocal(size, client->query_tracker_);
-    return true;
-  }
-
-  // Bump up client->num_tmp_reserved_buffers_ to satisfy this request. We don't want
-  // another client to grab the buffer.
-  int additional_tmp_reservations = 0;
-  if (client->num_tmp_reserved_buffers_ < buffers_needed) {
-    additional_tmp_reservations = buffers_needed - client->num_tmp_reserved_buffers_;
-    client->num_tmp_reserved_buffers_ += additional_tmp_reservations;
-    unfullfilled_reserved_buffers_ += additional_tmp_reservations;
-  }
-
-  // Loop until we have freed enough memory.
-  // We free all the memory at the end. We don't want another component to steal the
-  // memory.
-  int buffers_acquired = 0;
-  do {
-    BufferDescriptor* buffer_desc = NULL;
-    Status s = FindBuffer(lock, &buffer_desc); // This waits on the lock.
-    if (buffer_desc == NULL) break;
-    DCHECK(s.ok());
-    all_io_buffers_.erase(buffer_desc->all_buffers_it);
-    if (buffer_desc->block != NULL) buffer_desc->block->buffer_desc_ = NULL;
-    delete[] buffer_desc->buffer;
-    ++buffers_acquired;
-  } while (buffers_acquired != buffers_needed);
-
-  Status status = Status::OK();
-  if (buffers_acquired == buffers_needed) status = WriteUnpinnedBlocks();
-  // If we either couldn't acquire enough buffers or WriteUnpinnedBlocks() failed, undo
-  // the reservation.
-  if (buffers_acquired != buffers_needed || !status.ok()) {
-    if (!status.ok() && !status.IsCancelled()) {
-      VLOG_QUERY << "Query: " << query_id_ << " write unpinned buffers failed.";
-      client->state_->LogError(status.msg());
-    }
-    client->num_tmp_reserved_buffers_ -= additional_tmp_reservations;
-    unfullfilled_reserved_buffers_ -= additional_tmp_reservations;
-    mem_tracker_->Release(buffers_acquired * max_block_size());
-    return false;
-  }
-
-  client->num_tmp_reserved_buffers_ -= buffers_acquired;
-  unfullfilled_reserved_buffers_ -= buffers_acquired;
-
-  DCHECK_GE(buffers_acquired * max_block_size(), size);
-  mem_tracker_->Release(buffers_acquired * max_block_size());
-  if (!mem_tracker_->TryConsume(size)) return false;
-  client->tracker_->ConsumeLocal(size, client->query_tracker_);
-  DCHECK(Validate()) << endl << DebugInternal();
-  return true;
-}
-
-void BufferedBlockMgr::ReleaseMemory(Client* client, int64_t size) {
-  mem_tracker_->Release(size);
-  client->tracker_->ReleaseLocal(size, client->query_tracker_);
-}
-
-void BufferedBlockMgr::Cancel() {
-  {
-    lock_guard<mutex> lock(lock_);
-    if (is_cancelled_) return;
-    is_cancelled_ = true;
-  }
-}
-
-bool BufferedBlockMgr::IsCancelled() {
-  lock_guard<mutex> lock(lock_);
-  return is_cancelled_;
-}
-
-Status BufferedBlockMgr::MemLimitTooLowError(Client* client, int node_id) {
-  VLOG_QUERY << "Query: " << query_id_ << ". Node=" << node_id
-             << " ran out of memory: " << endl
-             << DebugInternal() << endl << client->DebugString();
-  int64_t min_memory = client->num_reserved_buffers_ * max_block_size();
-  string msg = Substitute(
-      "The memory limit is set too low to initialize spilling operator (id=$0). The "
-      "minimum required memory to spill this operator is $1.",
-      node_id, PrettyPrinter::Print(min_memory, TUnit::BYTES));
-  return client->tracker_->MemLimitExceeded(client->state_, msg);
-}
-
-Status BufferedBlockMgr::GetNewBlock(Client* client, Block* unpin_block, Block** block,
-    int64_t len) {
-  DCHECK_LE(len, max_block_size_) << "Cannot request block bigger than max_len";
-  DCHECK_NE(len, 0) << "Cannot request block of zero size";
-  *block = NULL;
-  Block* new_block = NULL;
-  Status status;
-
-  {
-    lock_guard<mutex> lock(lock_);
-    if (is_cancelled_) return Status::CANCELLED;
-    new_block = GetUnusedBlock(client);
-    DCHECK(new_block->Validate()) << endl << new_block->DebugString();
-    DCHECK_EQ(new_block->client_, client);
-    DCHECK_NE(new_block, unpin_block);
-
-    if (len > 0 && len < max_block_size_) {
-      DCHECK(unpin_block == NULL);
-      if (client->tracker_->TryConsume(len)) {
-        uint8_t* buffer = new uint8_t[len];
-        // Descriptors for non-I/O sized buffers are deleted when the block is deleted.
-        new_block->buffer_desc_ = new BufferDescriptor(buffer, len);
-        new_block->buffer_desc_->block = new_block;
-        new_block->is_pinned_ = true;
-        client->PinBuffer(new_block->buffer_desc_);
-        ++total_pinned_buffers_;
-        *block = new_block;
-        return Status::OK();
-      } else {
-        status = Status::OK();
-        goto no_buffer_avail;
-      }
-    }
-  }
-
-  bool in_mem;
-  status = FindBufferForBlock(new_block, &in_mem);
-  if (!status.ok()) goto no_buffer_avail;
-  DCHECK(!in_mem) << "A new block cannot start in mem.";
-  DCHECK(!new_block->is_pinned() || new_block->buffer_desc_ != NULL)
-      << new_block->DebugString();
-
-  if (!new_block->is_pinned()) {
-    if (unpin_block == NULL) {
-      // We couldn't get a new block and no unpin block was provided. Can't return
-      // a block.
-      status = Status::OK();
-      goto no_buffer_avail;
-    } else {
-      // We need to transfer the buffer from unpin_block to new_block.
-      status = TransferBuffer(new_block, unpin_block, true);
-      if (!status.ok()) goto no_buffer_avail;
-    }
-  } else if (unpin_block != NULL) {
-    // Got a new block without needing to transfer. Just unpin this block.
-    status = unpin_block->Unpin();
-    if (!status.ok()) goto no_buffer_avail;
-  }
-
-  DCHECK(new_block->is_pinned());
-  *block = new_block;
-  return Status::OK();
-
-no_buffer_avail:
-  DCHECK(new_block != NULL);
-  DeleteBlock(new_block);
-  return status;
-}
-
-Status BufferedBlockMgr::TransferBuffer(Block* dst, Block* src, bool unpin) {
-  Status status = Status::OK();
-  DCHECK(dst != NULL);
-  DCHECK(src != NULL);
-  unique_lock<mutex> lock(lock_);
-
-  DCHECK(src->is_pinned_);
-  DCHECK(!dst->is_pinned_);
-  DCHECK(dst->buffer_desc_ == NULL);
-  DCHECK_EQ(src->buffer_desc_->len, max_block_size_);
-
-  // Ensure that there aren't any writes in flight for 'src'.
-  WaitForWrite(lock, src);
-  src->is_pinned_ = false;
-
-  if (unpin) {
-    // First write out the src block so we can grab its buffer.
-    src->client_local_ = true;
-    status = WriteUnpinnedBlock(src);
-    if (!status.ok()) {
-      // The transfer failed, return the buffer to src.
-      src->is_pinned_ = true;
-      return status;
-    }
-    // Wait for the write to complete.
-    WaitForWrite(lock, src);
-    if (is_cancelled_) {
-      // We can't be sure the write succeeded, so return the buffer to src.
-      src->is_pinned_ = true;
-      return Status::CANCELLED;
-    }
-    DCHECK(!src->in_write_);
-  }
-  // Assign the buffer to the new block.
-  dst->buffer_desc_ = src->buffer_desc_;
-  dst->buffer_desc_->block = dst;
-  src->buffer_desc_ = NULL;
-  dst->is_pinned_ = true;
-  if (!unpin) DeleteBlockLocked(lock, src);
-  return Status::OK();
-}
-
-BufferedBlockMgr::~BufferedBlockMgr() {
-  shared_ptr<BufferedBlockMgr> other_mgr_ptr;
-  {
-    lock_guard<SpinLock> lock(static_block_mgrs_lock_);
-    BlockMgrsMap::iterator it = query_to_block_mgrs_.find(query_id_);
-    // IMPALA-2286: Another fragment may have called Create() for this query_id_ and
-    // saw that this BufferedBlockMgr is being destructed.  That fragement will
-    // overwrite the map entry for query_id_, pointing it to a different
-    // BufferedBlockMgr object.  We should let that object's destructor remove the
-    // entry.  On the other hand, if the second BufferedBlockMgr is destructed before
-    // this thread acquires the lock, then we'll remove the entry (because we can't
-    // distinguish between the two expired pointers), and when the other
-    // ~BufferedBlockMgr() call occurs, it won't find an entry for this query_id_.
-    if (it != query_to_block_mgrs_.end()) {
-      other_mgr_ptr = it->second.lock();
-      if (other_mgr_ptr.get() == NULL) {
-        // The BufferBlockMgr object referenced by this entry is being deconstructed.
-        query_to_block_mgrs_.erase(it);
-      } else {
-        // The map references another (still valid) BufferedBlockMgr.
-        DCHECK_NE(other_mgr_ptr.get(), this);
-      }
-    }
-  }
-  // IMPALA-4274: releasing the reference count can recursively call ~BufferedBlockMgr().
-  // Do not do that with 'static_block_mgrs_lock_' held.
-  other_mgr_ptr.reset();
-
-  // Delete tmp files and cancel any in-flight writes.
-  tmp_file_group_->Close();
-
-  // If there are any outstanding writes and we are here it means that when the
-  // WriteComplete() callback gets executed it is going to access invalid memory.
-  // See IMPALA-1890.
-  DCHECK_EQ(non_local_outstanding_writes_, 0) << endl << DebugInternal();
-
-  // Validate that clients deleted all of their blocks. Since all writes have
-  // completed at this point, any deleted blocks should be in unused_blocks_.
-  for (auto it = all_blocks_.begin(); it != all_blocks_.end(); ++it) {
-    Block* block = *it;
-    DCHECK(block->Validate()) << block->DebugString();
-    DCHECK(unused_blocks_.Contains(block)) << block->DebugString();
-  }
-
-  // Free memory resources.
-  for (BufferDescriptor* buffer: all_io_buffers_) {
-    mem_tracker_->Release(buffer->len);
-    delete[] buffer->buffer;
-  }
-  DCHECK_EQ(mem_tracker_->consumption(), 0);
-  mem_tracker_->UnregisterFromParent();
-  mem_tracker_.reset();
-}
-
-int64_t BufferedBlockMgr::bytes_allocated() const {
-  return mem_tracker_->consumption();
-}
-
-int BufferedBlockMgr::num_pinned_buffers(Client* client) const {
-  return client->num_pinned_buffers_;
-}
-
-int BufferedBlockMgr::num_reserved_buffers_remaining(Client* client) const {
-  return max<int>(client->num_reserved_buffers_ - client->num_pinned_buffers_, 0);
-}
-
-MemTracker* BufferedBlockMgr::get_tracker(Client* client) const {
-  return client->tracker_;
-}
-
-int64_t BufferedBlockMgr::GetNumWritesOutstanding() {
-  // Acquire lock to avoid returning mid-way through WriteComplete() when the
-  // state may be inconsistent.
-  lock_guard<mutex> lock(lock_);
-  return profile()->GetCounter("BlockWritesOutstanding")->value();
-}
-
-Status BufferedBlockMgr::DeleteOrUnpinBlock(Block* block, bool unpin) {
-  if (block == NULL) {
-    return IsCancelled() ? Status::CANCELLED : Status::OK();
-  }
-  if (unpin) {
-    return block->Unpin();
-  } else {
-    block->Delete();
-    return IsCancelled() ? Status::CANCELLED : Status::OK();
-  }
-}
-
-Status BufferedBlockMgr::PinBlock(Block* block, bool* pinned, Block* release_block,
-    bool unpin) {
-  DCHECK(block != NULL);
-  DCHECK(!block->is_deleted_);
-  Status status;
-  *pinned = false;
-  if (block->is_pinned_) {
-    *pinned = true;
-    return DeleteOrUnpinBlock(release_block, unpin);
-  }
-
-  bool in_mem = false;
-  status = FindBufferForBlock(block, &in_mem);
-  if (!status.ok()) goto error;
-  *pinned = block->is_pinned_;
-
-  if (in_mem) {
-    // The block's buffer is still in memory with the original data.
-    status = CancelWrite(block);
-    if (!status.ok()) goto error;
-    return DeleteOrUnpinBlock(release_block, unpin);
-  }
-
-  if (!block->is_pinned_) {
-    if (release_block == NULL) return Status::OK();
-
-    if (block->buffer_desc_ != NULL) {
-      // The block's buffer is still in memory but we couldn't get an additional buffer
-      // because it would eat into another client's reservation. However, we can use
-      // release_block's reservation, so reclaim the buffer.
-      {
-        lock_guard<mutex> lock(lock_);
-        if (free_io_buffers_.Contains(block->buffer_desc_)) {
-          DCHECK(!block->is_pinned_ && !block->in_write_ &&
-                 !unpinned_blocks_.Contains(block)) << endl << block->DebugString();
-          free_io_buffers_.Remove(block->buffer_desc_);
-        } else if (unpinned_blocks_.Contains(block)) {
-          unpinned_blocks_.Remove(block);
-        } else {
-          DCHECK(block->in_write_);
-        }
-        block->is_pinned_ = true;
-        *pinned = true;
-        block->client_->PinBuffer(block->buffer_desc_);
-        ++total_pinned_buffers_;
-        status = WriteUnpinnedBlocks();
-        if (!status.ok()) goto error;
-      }
-      status = CancelWrite(block);
-      if (!status.ok()) goto error;
-      return DeleteOrUnpinBlock(release_block, unpin);
-    }
-    // FindBufferForBlock() wasn't able to find a buffer so transfer the one from
-    // 'release_block'.
-    status = TransferBuffer(block, release_block, unpin);
-    if (!status.ok()) goto error;
-    DCHECK(!release_block->is_pinned_);
-    release_block = NULL; // Handled by transfer.
-    DCHECK(block->is_pinned_);
-    *pinned = true;
-  }
-
-  DCHECK(block->write_handle_ != NULL) << block->DebugString() << endl << release_block;
-
-  // The block is on disk - read it back into memory.
-  if (block->valid_data_len() > 0) {
-    status = tmp_file_group_->Read(block->write_handle_.get(), block->valid_data());
-    if (!status.ok()) goto error;
-  }
-  tmp_file_group_->DestroyWriteHandle(move(block->write_handle_));
-  return DeleteOrUnpinBlock(release_block, unpin);
-
-error:
-  DCHECK(!status.ok());
-  // Make sure to delete the block if we hit an error before calling DeleteOrUnpin().
-  if (release_block != NULL && !unpin) DeleteBlock(release_block);
-  return status;
-}
-
-Status BufferedBlockMgr::CancelWrite(Block* block) {
-  {
-    unique_lock<mutex> lock(lock_);
-    DCHECK(block->buffer_desc_ != NULL);
-    // If there is an in-flight write, wait for it to finish. This is sub-optimal
-    // compared to just cancelling the write, but reduces the number of possible
-    // code paths in this legacy code.
-    WaitForWrite(lock, block);
-    if (is_cancelled_) return Status::CANCELLED;
-  }
-  if (block->write_handle_ != NULL) {
-    // Make sure the write is not in-flight.
-    block->write_handle_->Cancel();
-    block->write_handle_->WaitForWrite();
-    // Restore the in-memory data without reading from disk (e.g. decrypt it).
-    RETURN_IF_ERROR(
-        tmp_file_group_->RestoreData(move(block->write_handle_), block->valid_data()));
-  }
-  return Status::OK();
-}
-
-Status BufferedBlockMgr::UnpinBlock(Block* block) {
-  DCHECK(!block->is_deleted_) << "Unpin for deleted block.";
-
-  lock_guard<mutex> unpinned_lock(lock_);
-  if (is_cancelled_) return Status::CANCELLED;
-  DCHECK(block->Validate()) << endl << block->DebugString();
-  if (!block->is_pinned_) return Status::OK();
-  DCHECK_EQ(block->buffer_desc_->len, max_block_size_) << "Can only unpin io blocks.";
-  DCHECK(Validate()) << endl << DebugInternal();
-  // Add 'block' to the list of unpinned blocks and set is_pinned_ to false.
-  // Cache its position in the list for later removal.
-  block->is_pinned_ = false;
-  DCHECK(!unpinned_blocks_.Contains(block)) << " Unpin for block in unpinned list";
-  if (!block->in_write_) unpinned_blocks_.Enqueue(block);
-  block->client_->UnpinBuffer(block->buffer_desc_);
-  if (block->client_->num_pinned_buffers_ < block->client_->num_reserved_buffers_) {
-    ++unfullfilled_reserved_buffers_;
-  }
-  --total_pinned_buffers_;
-  RETURN_IF_ERROR(WriteUnpinnedBlocks());
-  DCHECK(Validate()) << endl << DebugInternal();
-  DCHECK(block->Validate()) << endl << block->DebugString();
-  return Status::OK();
-}
-
-Status BufferedBlockMgr::WriteUnpinnedBlocks() {
-  if (disable_spill_) return Status::OK();
-
-  // Assumes block manager lock is already taken.
-  while (non_local_outstanding_writes_ + free_io_buffers_.size() < block_write_threshold_
-      && !unpinned_blocks_.empty()) {
-    // Pop a block from the back of the list (LIFO).
-    Block* write_block = unpinned_blocks_.PopBack();
-    write_block->client_local_ = false;
-    RETURN_IF_ERROR(WriteUnpinnedBlock(write_block));
-    ++non_local_outstanding_writes_;
-  }
-  DCHECK(Validate()) << endl << DebugInternal();
-  return Status::OK();
-}
-
-Status BufferedBlockMgr::WriteUnpinnedBlock(Block* block) {
-  // Assumes block manager lock is already taken.
-  DCHECK(!block->is_pinned_) << block->DebugString();
-  DCHECK(!block->in_write_) << block->DebugString();
-  DCHECK(block->write_handle_ == NULL) << block->DebugString();
-  DCHECK_EQ(block->buffer_desc_->len, max_block_size_);
-
-  // The block is on disk - read it back into memory.
-  RETURN_IF_ERROR(tmp_file_group_->Write(block->valid_data(),
-      [this, block](const Status& write_status) { WriteComplete(block, write_status); },
-      &block->write_handle_));
-
-  block->in_write_ = true;
-  DCHECK(block->Validate()) << endl << block->DebugString();
-  outstanding_writes_counter_->Add(1);
-  ++writes_issued_;
-  if (writes_issued_ == 1) {
-    if (ImpaladMetrics::NUM_QUERIES_SPILLED != NULL) {
-      ImpaladMetrics::NUM_QUERIES_SPILLED->Increment(1);
-    }
-  }
-  return Status::OK();
-}
-
-void BufferedBlockMgr::WaitForWrite(unique_lock<mutex>& lock, Block* block) {
-  DCHECK(!block->is_deleted_);
-  while (block->in_write_ && !is_cancelled_) {
-    block->write_complete_cv_.wait(lock);
-  }
-}
-
-void BufferedBlockMgr::WriteComplete(Block* block, const Status& write_status) {
-#ifndef NDEBUG
-  if (debug_write_delay_ms_ > 0) {
-    usleep(static_cast<int64_t>(debug_write_delay_ms_) * 1000);
-  }
-#endif
-  Status status = Status::OK();
-  lock_guard<mutex> lock(lock_);
-  DCHECK(Validate()) << endl << DebugInternal();
-  DCHECK(is_cancelled_ || block->in_write_) << "WriteComplete() for block not in write."
-                                            << endl
-                                            << block->DebugString();
-  DCHECK(block->buffer_desc_ != NULL);
-
-  outstanding_writes_counter_->Add(-1);
-  if (!block->client_local_) {
-    DCHECK_GT(non_local_outstanding_writes_, 0) << block->DebugString();
-    --non_local_outstanding_writes_;
-  }
-  block->in_write_ = false;
-
-  // ReturnUnusedBlock() will clear the block, so save required state in local vars.
-  // state is not valid if the block was deleted because the state may be torn down
-  // after the state's fragment has deleted all of its blocks.
-  RuntimeState* state = block->is_deleted_ ? NULL : block->client_->state_;
-
-  // If the block was re-pinned when it was in the IOMgr queue, don't free it.
-  if (block->is_pinned_) {
-    // The number of outstanding writes has decreased but the number of free buffers
-    // hasn't.
-    DCHECK(!block->is_deleted_);
-    DCHECK(!block->client_local_)
-        << "Client should be waiting. No one should have pinned this block.";
-    if (write_status.ok() && !is_cancelled_ && !state->is_cancelled()) {
-      status = WriteUnpinnedBlocks();
-    }
-  } else if (block->client_local_) {
-    DCHECK(!block->is_deleted_)
-        << "Client should be waiting. No one should have deleted this block.";
-  } else {
-    DCHECK_EQ(block->buffer_desc_->len, max_block_size_)
-        << "Only io sized buffers should spill";
-    free_io_buffers_.Enqueue(block->buffer_desc_);
-  }
-
-  if (!write_status.ok() || !status.ok() || is_cancelled_) {
-    VLOG_FILE << "Query: " << query_id_ << ". Write did not complete successfully: "
-                                           "write_status="
-              << write_status.GetDetail() << ", status=" << status.GetDetail()
-              << ". is_cancelled_=" << is_cancelled_;
-    // If the instance is already cancelled, don't confuse things with these errors.
-    if (!write_status.ok() && !write_status.IsCancelled()) {
-      // Report but do not attempt to recover from write error.
-      VLOG_QUERY << "Query: " << query_id_ << " write complete callback with error.";
-
-      if (state != NULL) state->LogError(write_status.msg());
-    }
-    if (!status.ok() && !status.IsCancelled()) {
-      VLOG_QUERY << "Query: " << query_id_ << " error while writing unpinned blocks.";
-      if (state != NULL) state->LogError(status.msg());
-    }
-    // Set cancelled. Threads waiting for a write will be woken up in the normal way when
-    // one of the writes they are waiting for completes.
-    is_cancelled_ = true;
-  }
-
-  // Notify any threads that may have been expecting to get block's buffer based on
-  // the value of 'non_local_outstanding_writes_'. Wake them all up. If we added
-  // a buffer to 'free_io_buffers_', one thread will get a buffer. All the others
-  // will re-evaluate whether they should continue waiting and if another write needs
-  // to be initiated.
-  if (!block->client_local_) buffer_available_cv_.notify_all();
-  if (block->is_deleted_) {
-    // Finish the DeleteBlock() work.
-    tmp_file_group_->DestroyWriteHandle(move(block->write_handle_));
-    block->buffer_desc_->block = NULL;
-    block->buffer_desc_ = NULL;
-    ReturnUnusedBlock(block);
-    block = NULL;
-  } else {
-    // Wake up the thread waiting on this block (if any).
-    block->write_complete_cv_.notify_one();
-  }
-
-  DCHECK(Validate()) << endl << DebugInternal();
-}
-
-void BufferedBlockMgr::DeleteBlock(Block* block) {
-  unique_lock<mutex> lock(lock_);
-  DeleteBlockLocked(lock, block);
-}
-
-void BufferedBlockMgr::DeleteBlockLocked(const unique_lock<mutex>& lock, Block* block) {
-  DCHECK(lock.mutex() == &lock_ && lock.owns_lock());
-  DCHECK(block->Validate()) << endl << DebugInternal();
-  DCHECK(!block->is_deleted_);
-  block->is_deleted_ = true;
-
-  if (block->is_pinned_) {
-    if (block->is_max_size()) --total_pinned_buffers_;
-    block->is_pinned_ = false;
-    block->client_->UnpinBuffer(block->buffer_desc_);
-    if (block->client_->num_pinned_buffers_ < block->client_->num_reserved_buffers_) {
-      ++unfullfilled_reserved_buffers_;
-    }
-  } else if (unpinned_blocks_.Contains(block)) {
-    // Remove block from unpinned list.
-    unpinned_blocks_.Remove(block);
-  }
-
-  if (block->in_write_) {
-    DCHECK(block->buffer_desc_ != NULL && block->buffer_desc_->len == max_block_size_)
-        << "Should never be writing a small buffer";
-    // If a write is still pending, cancel it and return. Cleanup will be done in
-    // WriteComplete(). Cancelling the write ensures that it won't try to log to the
-    // RuntimeState (which may be torn down before the block manager).
-    DCHECK(block->Validate()) << endl << block->DebugString();
-    return;
-  }
-
-  if (block->buffer_desc_ != NULL) {
-    if (block->buffer_desc_->len != max_block_size_) {
-      // Just delete the block for now.
-      delete[] block->buffer_desc_->buffer;
-      block->client_->tracker_->Release(block->buffer_desc_->len);
-      delete block->buffer_desc_;
-      block->buffer_desc_ = NULL;
-    } else {
-      if (!free_io_buffers_.Contains(block->buffer_desc_)) {
-        free_io_buffers_.Enqueue(block->buffer_desc_);
-        // Wake up one of the waiting threads, which will grab the buffer.
-        buffer_available_cv_.notify_one();
-      }
-      block->buffer_desc_->block = NULL;
-      block->buffer_desc_ = NULL;
-    }
-  }
-
-  // Discard any on-disk data. The write is finished so this won't call back into
-  // BufferedBlockMgr.
-  if (block->write_handle_ != NULL) {
-    tmp_file_group_->DestroyWriteHandle(move(block->write_handle_));
-  }
-  ReturnUnusedBlock(block);
-  DCHECK(block->Validate()) << endl << block->DebugString();
-  DCHECK(Validate()) << endl << DebugInternal();
-}
-
-void BufferedBlockMgr::ReturnUnusedBlock(Block* block) {
-  DCHECK(block->is_deleted_) << block->DebugString();
-  DCHECK(!block->is_pinned_) << block->DebugString();;
-  DCHECK(block->buffer_desc_ == NULL);
-  block->Init();
-  unused_blocks_.Enqueue(block);
-}
-
-Status BufferedBlockMgr::FindBufferForBlock(Block* block, bool* in_mem) {
-  DCHECK(block != NULL);
-  Client* client = block->client_;
-  DCHECK(client != NULL);
-  DCHECK(!block->is_pinned_ && !block->is_deleted_)
-      << "Pinned or deleted block " << endl << block->DebugString();
-  *in_mem = false;
-
-  unique_lock<mutex> l(lock_);
-  if (is_cancelled_) return Status::CANCELLED;
-
-  // First check if there is enough reserved memory to satisfy this request.
-  bool is_reserved_request = false;
-  if (client->num_pinned_buffers_ < client->num_reserved_buffers_) {
-    is_reserved_request = true;
-  } else if (client->num_tmp_reserved_buffers_ > 0) {
-    is_reserved_request = true;
-    --client->num_tmp_reserved_buffers_;
-  }
-
-  DCHECK(Validate()) << endl << DebugInternal();
-  if (is_reserved_request) --unfullfilled_reserved_buffers_;
-
-  if (!is_reserved_request && remaining_unreserved_buffers() < 1) {
-    // The client already has its quota and there are no unreserved blocks left.
-    // Note that even if this passes, it is still possible for the path below to
-    // see OOM because another query consumed memory from the process tracker. This
-    // only happens if the buffer has not already been allocated by the block mgr.
-    // This check should ensure that the memory cannot be consumed by another client
-    // of the block mgr.
-    return Status::OK();
-  }
-
-  if (block->buffer_desc_ != NULL) {
-    // The block is in memory. It may be in 3 states:
-    //  1. In the unpinned list. The buffer will not be in the free list.
-    //  2. in_write_ == true. The buffer will not be in the free list.
-    //  3. The buffer is free, but hasn't yet been reassigned to a different block.
-    DCHECK_EQ(block->buffer_desc_->len, max_block_size())
-        << "Non-I/O blocks are always pinned";
-    DCHECK(unpinned_blocks_.Contains(block) ||
-           block->in_write_ ||
-           free_io_buffers_.Contains(block->buffer_desc_));
-    if (unpinned_blocks_.Contains(block)) {
-      unpinned_blocks_.Remove(block);
-      DCHECK(!free_io_buffers_.Contains(block->buffer_desc_));
-    } else if (block->in_write_) {
-      DCHECK(block->in_write_ && !free_io_buffers_.Contains(block->buffer_desc_));
-    } else {
-      free_io_buffers_.Remove(block->buffer_desc_);
-    }
-    buffered_pin_counter_->Add(1);
-    *in_mem = true;
-  } else {
-    BufferDescriptor* buffer_desc = NULL;
-    RETURN_IF_ERROR(FindBuffer(l, &buffer_desc));
-
-    if (buffer_desc == NULL) {
-      // There are no free buffers or blocks we can evict. We need to fail this request.
-      // If this is an optional request, return OK. If it is required, return OOM.
-      if (!is_reserved_request || client->tolerates_oversubscription_) return Status::OK();
-
-      if (VLOG_QUERY_IS_ON) {
-        stringstream ss;
-        ss << "Query id=" << query_id_ << " was unable to get minimum required buffers."
-           << endl << DebugInternal() << endl << client->DebugString();
-        VLOG_QUERY << ss.str();
-      }
-      return client->tracker_->MemLimitExceeded(client->state_,
-          "Query did not have enough memory to get the minimum required buffers in the "
-          "block manager.");
-    }
-
-    DCHECK(buffer_desc != NULL);
-    DCHECK_EQ(buffer_desc->len, max_block_size()) << "Non-I/O buffer";
-    if (buffer_desc->block != NULL) {
-      // This buffer was assigned to a block but now we are reusing it. Reset the
-      // previous block->buffer link.
-      DCHECK(buffer_desc->block->Validate()) << endl << buffer_desc->block->DebugString();
-      buffer_desc->block->buffer_desc_ = NULL;
-    }
-    buffer_desc->block = block;
-    block->buffer_desc_ = buffer_desc;
-  }
-  DCHECK(block->buffer_desc_ != NULL);
-  DCHECK(block->buffer_desc_->len < max_block_size() || !block->is_pinned_)
-      << "Trying to pin already pinned block. "
-      << block->buffer_desc_->len << " " << block->is_pinned_;
-  block->is_pinned_ = true;
-  client->PinBuffer(block->buffer_desc_);
-  ++total_pinned_buffers_;
-
-  DCHECK(block->Validate()) << endl << block->DebugString();
-  // The number of free buffers has decreased. Write unpinned blocks if the number
-  // of free buffers is less than the threshold.
-  RETURN_IF_ERROR(WriteUnpinnedBlocks());
-  DCHECK(Validate()) << endl << DebugInternal();
-  return Status::OK();
-}
-
-// We need to find a new buffer. We prefer getting this buffer in this order:
-//  1. Allocate a new block if the number of free blocks is less than the write threshold
-//     or if we are running without spilling, until we run out of memory.
-//  2. Pick a buffer from the free list.
-//  3. Wait and evict an unpinned buffer.
-Status BufferedBlockMgr::FindBuffer(unique_lock<mutex>& lock,
-    BufferDescriptor** buffer_desc) {
-  DCHECK(lock.mutex() == &lock_ && lock.owns_lock());
-  *buffer_desc = NULL;
-
-  // First, try to allocate a new buffer.
-  DCHECK(block_write_threshold_ > 0 || disable_spill_);
-  if ((free_io_buffers_.size() < block_write_threshold_ || disable_spill_) &&
-      mem_tracker_->TryConsume(max_block_size_)) {
-    uint8_t* new_buffer = new uint8_t[max_block_size_];
-    *buffer_desc = obj_pool_.Add(new BufferDescriptor(new_buffer, max_block_size_));
-    (*buffer_desc)->all_buffers_it = all_io_buffers_.insert(
-        all_io_buffers_.end(), *buffer_desc);
-    return Status::OK();
-  }
-
-  // Second, try to pick a buffer from the free list.
-  if (free_io_buffers_.empty()) {
-    // There are no free buffers. If spills are disabled or there no unpinned blocks we
-    // can write, return. We can't get a buffer.
-    if (disable_spill_) {
-      if (block_write_threshold_ == 0) {
-        return Status("Spilling has been disabled due to no usable scratch space. "
-            "Please specify a usable scratch space location via the --scratch_dirs "
-            "impalad flag.");
-      } else {
-        return Status("Spilling has been disabled for plans that do not have stats and "
-            "are not hinted to prevent potentially bad plans from using too many cluster "
-            "resources. Please run COMPUTE STATS on these tables, hint the plan or "
-            "disable this behavior via the DISABLE_UNSAFE_SPILLS query option.");
-      }
-    }
-
-    // Third, this block needs to use a buffer that was unpinned from another block.
-    // Get a free buffer from the front of the queue and assign it to the block.
-    do {
-      if (unpinned_blocks_.empty() && non_local_outstanding_writes_ == 0) {
-        return Status::OK();
-      }
-      SCOPED_TIMER(buffer_wait_timer_);
-      // Try to evict unpinned blocks before waiting.
-      RETURN_IF_ERROR(WriteUnpinnedBlocks());
-      DCHECK_GT(non_local_outstanding_writes_, 0) << endl << DebugInternal();
-      buffer_available_cv_.wait(lock);
-      if (is_cancelled_) return Status::CANCELLED;
-    } while (free_io_buffers_.empty());
-  }
-  *buffer_desc = free_io_buffers_.Dequeue();
-  return Status::OK();
-}
-
-BufferedBlockMgr::Block* BufferedBlockMgr::GetUnusedBlock(Client* client) {
-  DCHECK(client != NULL);
-  Block* new_block = NULL;
-  if (unused_blocks_.empty()) {
-    new_block = obj_pool_.Add(new Block(this));
-    all_blocks_.push_back(new_block);
-    new_block->Init();
-    created_block_counter_->Add(1);
-  } else {
-    new_block = unused_blocks_.Dequeue();
-    recycled_blocks_counter_->Add(1);
-  }
-  DCHECK(new_block != NULL);
-  new_block->client_ = client;
-  return new_block;
-}
-
-bool BufferedBlockMgr::Validate() const {
-  int num_free_io_buffers = 0;
-
-  if (total_pinned_buffers_ < 0) {
-    LOG(ERROR) << "total_pinned_buffers_ < 0: " << total_pinned_buffers_;
-    return false;
-  }
-
-  for (BufferDescriptor* buffer: all_io_buffers_) {
-    bool is_free = free_io_buffers_.Contains(buffer);
-    num_free_io_buffers += is_free;
-
-    if (*buffer->all_buffers_it != buffer) {
-      LOG(ERROR) << "All buffers list is corrupt. Buffer iterator is not valid.";
-      return false;
-    }
-
-    if (buffer->block == NULL && !is_free) {
-      LOG(ERROR) << "Buffer with no block not in free list." << endl << DebugInternal();
-      return false;
-    }
-
-    if (buffer->len != max_block_size_) {
-      LOG(ERROR) << "Non-io sized buffers should not end up on free list.";
-      return false;
-    }
-
-    if (buffer->block != NULL) {
-      if (buffer->block->buffer_desc_ != buffer) {
-        LOG(ERROR) << "buffer<->block pointers inconsistent. Buffer: " << buffer
-          << endl << buffer->block->DebugString();
-        return false;
-      }
-
-      if (!buffer->block->Validate()) {
-        LOG(ERROR) << "buffer->block inconsistent."
-          << endl << buffer->block->DebugString();
-        return false;
-      }
-
-      if (is_free && (buffer->block->is_pinned_ || buffer->block->in_write_ ||
-            unpinned_blocks_.Contains(buffer->block))) {
-        LOG(ERROR) << "Block with buffer in free list and"
-          << " is_pinned_ = " << buffer->block->is_pinned_
-          << " in_write_ = " << buffer->block->in_write_
-          << " Unpinned_blocks_.Contains = "
-          << unpinned_blocks_.Contains(buffer->block)
-          << endl << buffer->block->DebugString();
-        return false;
-      }
-    }
-  }
-
-  if (free_io_buffers_.size() != num_free_io_buffers) {
-    LOG(ERROR) << "free_buffer_list_ inconsistency."
-      << " num_free_io_buffers = " << num_free_io_buffers
-      << " free_io_buffers_.size() = " << free_io_buffers_.size()
-      << endl << DebugInternal();
-    return false;
-  }
-
-  Block* block = unpinned_blocks_.head();
-  while (block != NULL) {
-    if (!block->Validate()) {
-      LOG(ERROR) << "Block inconsistent in unpinned list."
-        << endl << block->DebugString();
-      return false;
-    }
-
-    if (block->in_write_ || free_io_buffers_.Contains(block->buffer_desc_)) {
-      LOG(ERROR) << "Block in unpinned list with"
-        << " in_write_ = " << block->in_write_
-        << " free_io_buffers_.Contains = "
-        << free_io_buffers_.Contains(block->buffer_desc_)
-        << endl << block->DebugString();
-      return false;
-    }
-    block = block->Next();
-  }
-
-  // Check if we're writing blocks when the number of free buffers is less than
-  // the write threshold. We don't write blocks after cancellation.
-  if (!is_cancelled_ && !unpinned_blocks_.empty() && !disable_spill_ &&
-      (free_io_buffers_.size() + non_local_outstanding_writes_ <
-       block_write_threshold_)) {
-    // TODO: this isn't correct when WriteUnpinnedBlocks() fails during the call to
-    // WriteUnpinnedBlock() so just log the condition but don't return false. Figure
-    // out a way to re-enable this change?
-    LOG(ERROR) << "Missed writing unpinned blocks";
-  }
-  return true;
-}
-
-string BufferedBlockMgr::DebugString(Client* client) {
-  stringstream ss;
-  unique_lock<mutex> l(lock_);
-  ss <<  DebugInternal();
-  if (client != NULL) ss << endl << client->DebugString();
-  return ss.str();
-}
-
-string BufferedBlockMgr::DebugInternal() const {
-  stringstream ss;
-  ss << "Buffered block mgr " << this << endl
-     << "  Num writes outstanding: " << outstanding_writes_counter_->value() << endl
-     << "  Num free io buffers: " << free_io_buffers_.size() << endl
-     << "  Num unpinned blocks: " << unpinned_blocks_.size() << endl
-     << "  Num available buffers: " << remaining_unreserved_buffers() << endl
-     << "  Total pinned buffers: " << total_pinned_buffers_ << endl
-     << "  Unfullfilled reserved buffers: " << unfullfilled_reserved_buffers_ << endl
-     << "  Remaining memory: " << mem_tracker_->SpareCapacity()
-     << " (#blocks=" << (mem_tracker_->SpareCapacity() / max_block_size_) << ")" << endl
-     << "  Block write threshold: " << block_write_threshold_;
-  if (tmp_file_group_ != NULL) ss << tmp_file_group_->DebugString();
-  return ss.str();
-}
-
-void BufferedBlockMgr::Init(DiskIoMgr* io_mgr, TmpFileMgr* tmp_file_mgr,
-    RuntimeProfile* parent_profile, MemTracker* parent_tracker, int64_t mem_limit,
-    int64_t scratch_limit) {
-  unique_lock<mutex> l(lock_);
-  if (initialized_) return;
-
-  profile_.reset(new RuntimeProfile(&obj_pool_, "BlockMgr"));
-  parent_profile->AddChild(profile_.get());
-
-  tmp_file_group_.reset(new TmpFileMgr::FileGroup(
-      tmp_file_mgr, io_mgr, profile_.get(), query_id_, scratch_limit));
-
-  mem_limit_counter_ = ADD_COUNTER(profile_.get(), "MemoryLimit", TUnit::BYTES);
-  mem_limit_counter_->Set(mem_limit);
-  block_size_counter_ = ADD_COUNTER(profile_.get(), "MaxBlockSize", TUnit::BYTES);
-  block_size_counter_->Set(max_block_size_);
-  created_block_counter_ = ADD_COUNTER(profile_.get(), "BlocksCreated", TUnit::UNIT);
-  recycled_blocks_counter_ = ADD_COUNTER(profile_.get(), "BlocksRecycled", TUnit::UNIT);
-  outstanding_writes_counter_ =
-      ADD_COUNTER(profile_.get(), "BlockWritesOutstanding", TUnit::UNIT);
-  buffered_pin_counter_ = ADD_COUNTER(profile_.get(), "BufferedPins", TUnit::UNIT);
-  buffer_wait_timer_ = ADD_TIMER(profile_.get(), "TotalBufferWaitTime");
-
-  // Create a new mem_tracker and allocate buffers.
-  mem_tracker_.reset(
-      new MemTracker(profile(), mem_limit, "Block Manager", parent_tracker));
-
-  initialized_ = true;
-}
-
-} // namespace impala

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-block-mgr.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-block-mgr.h b/be/src/runtime/buffered-block-mgr.h
deleted file mode 100644
index ab05329..0000000
--- a/be/src/runtime/buffered-block-mgr.h
+++ /dev/null
@@ -1,606 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_BUFFERED_BLOCK_MGR
-#define IMPALA_RUNTIME_BUFFERED_BLOCK_MGR
-
-#include "runtime/disk-io-mgr.h"
-#include "runtime/tmp-file-mgr.h"
-#include "util/mem-range.h"
-
-#include <boost/unordered_map.hpp>
-
-namespace impala {
-
-class RuntimeState;
-
-/// The BufferedBlockMgr is used to allocate and manage blocks of data using a fixed memory
-/// budget. Available memory is split into a pool of fixed-size memory buffers. When a
-/// client allocates or requests a block, the block is assigned a buffer from this pool and
-/// is 'pinned' in memory. Clients can also unpin a block, allowing the manager to reassign
-/// its buffer to a different block.
-//
-/// The BufferedBlockMgr typically allocates blocks in IO buffer size to get maximal IO
-/// efficiency when spilling. Clients can also request smaller buffers that cannot spill
-/// (note that it would be possible to spill small buffers, but we currently do not allow
-/// it). This is useful to present the same block API and mem tracking for clients (one can
-/// use the block mgr API to mem track non-spillable (smaller) buffers). Clients that do
-/// partitioning (e.g. PHJ and PAGG) will start with these smaller buffer sizes to reduce
-/// the minimum buffering requirements and grow to max sized buffers as the input grows.
-/// For simplicity, these small buffers are not recycled (there's also not really a need
-/// since they are allocated all at once on query startup). These buffers are not counted
-/// against the reservation.
-//
-/// The BufferedBlockMgr reserves one buffer per disk ('block_write_threshold_') for
-/// itself. When the number of free buffers falls below 'block_write_threshold', unpinned
-/// blocks are flushed in Last-In-First-Out order. (It is assumed that unpinned blocks are
-/// re-read in FIFO order). The TmpFileMgr is used to obtain file handles to write to
-/// within the tmp directories configured for Impala.
-//
-/// It is expected to have one BufferedBlockMgr per query. All allocations that can grow
-/// proportional to the input size and that might need to spill to disk should allocate
-/// from the same BufferedBlockMgr.
-//
-/// A client must pin a block in memory to read/write its contents and unpin it when it is
-/// no longer in active use. The BufferedBlockMgr guarantees that:
-///  a) The memory buffer assigned to a block is not removed or released while it is pinned.
-///  b) The contents of an unpinned block will be available on a subsequent call to pin.
-//
-/// The Client supports the following operations:
-///  GetNewBlock(): Returns a new pinned block.
-///  Close(): Frees all memory and disk space. Called when a query is closed or cancelled.
-///   Close() is idempotent.
-//
-/// A Block supports the following operations:
-///  Pin(): Pins a block to a buffer in memory, and reads its contents from disk if
-///   necessary. If there are no free buffers, waits for a buffer to become available.
-///   Invoked before the contents of a block are read or written. The block
-///   will be maintained in memory until Unpin() is called.
-///  Unpin(): Invoked to indicate the block is not in active use. The block is added to a
-///   list of unpinned blocks. Unpinned blocks are only written when the number of free
-///   blocks falls below the 'block_write_threshold'.
-///  Delete(): Invoked to deallocate a block. The buffer associated with the block is
-///   immediately released and its on-disk location (if any) reused. All blocks must be
-///   deleted before the block manager is torn down.
-///
-/// The block manager is thread-safe with the following caveat: A single block cannot be
-/// used simultaneously by multiple clients in any capacity.
-/// However, the block manager client is not thread-safe. That is, the block manager
-/// allows multiple single-threaded block manager clients.
-///
-/// TODO: replace with BufferPool.
-class BufferedBlockMgr {
- private:
-  struct BufferDescriptor;
-
- public:
-  /// A client of the BufferedBlockMgr. There is a single BufferedBlockMgr per plan
-  /// fragment and all operators that need blocks from it should use a separate client.
-  /// Each client has the option to reserve a number of blocks that it can claim later.
-  /// The remaining memory that is not reserved by any clients is free for all and
-  /// available to all clients.
-  /// This is an opaque handle.
-  struct Client;
-
-  /// A fixed-size block of data that may be be persisted to disk. The state of the block
-  /// is maintained by the block manager and is described by 3 bools:
-  /// is_pinned_ = True if the block is pinned. The block has a non-null buffer_desc_,
-  ///   buffer_desc_ cannot be in the free buffer list and the block cannot be in
-  ///   unused_blocks_ or unpinned_blocks_. Newly allocated blocks are pinned.
-  /// in_write_ = True if a write has been issued but not completed for this block.
-  ///   The block cannot be in the unpinned_blocks_ and must have a non-null buffer_desc_
-  ///   that's not in the free buffer list. It may be pinned or unpinned.
-  /// is_deleted_ = True if Delete() has been called on a block. After this, no API call
-  ///   is valid on the block.
-  //
-  /// Pin() and Unpin() can be invoked on a block any number of times before Delete().
-  /// When a pinned block is unpinned for the first time, it is added to the
-  /// unpinned_blocks_ list and its buffer is removed from the free list.
-  /// If it is pinned or deleted at any time while it is on the unpinned list, it is
-  /// simply removed from that list. When it is dequeued from that list and enqueued
-  /// for writing, in_write_ is set to true. The block may be pinned, unpinned or deleted
-  /// while in_write_ is true. After the write has completed, the block's buffer will be
-  /// returned to the free buffer list if it is no longer pinned, and the block itself
-  /// will be put on the unused blocks list if Delete() was called.
-  //
-  /// A block MUST have a non-null buffer_desc_ if
-  ///  a) is_pinned_ is true (i.e. the client is using it), or
-  ///  b) in_write_ is true, (i.e. IO mgr is using it), or
-  ///  c) It is on the unpinned list (buffer has not been persisted.)
-  //
-  /// In addition to the block manager API, Block exposes Allocate(), ReturnAllocation()
-  /// and BytesRemaining() to allocate and free memory within a block, and buffer() and
-  /// valid_data_len() to read/write the contents of a block. These are not thread-safe.
-  class Block : public InternalQueue<Block>::Node {
-   public:
-    /// Pins a block in memory--assigns a free buffer to a block and reads it from disk if
-    /// necessary. If there are no free blocks and no unpinned blocks, '*pinned' is set to
-    /// false and the block is not pinned. If 'release_block' is non-NULL, if there is
-    /// memory pressure, this block will be pinned using the buffer from 'release_block'.
-    /// If 'unpin' is true, 'release_block' will be unpinned (regardless of whether or not
-    /// the buffer was used for this block). If 'unpin' is false, 'release_block' is
-    /// deleted. 'release_block' must be pinned. If an error occurs and 'unpin' was false,
-    /// 'release_block' is always deleted. If 'unpin' was true and an error occurs,
-    /// 'release_block' may be left pinned or unpinned.
-    Status Pin(bool* pinned, Block* release_block = NULL, bool unpin = true);
-
-    /// Unpins a block by adding it to the list of unpinned blocks maintained by the block
-    /// manager. An unpinned block must be flushed before its buffer is released or
-    /// assigned to a different block. Is non-blocking.
-    Status Unpin();
-
-    /// Delete a block. Its buffer is released and on-disk location can be over-written.
-    /// Non-blocking.
-    void Delete();
-
-    void AddRow() { ++num_rows_; }
-    int num_rows() const { return num_rows_; }
-
-    /// Allocates the specified number of bytes from this block.
-    template <typename T> T* Allocate(int size) {
-      DCHECK_GE(BytesRemaining(), size);
-      uint8_t* current_location = buffer_desc_->buffer + valid_data_len_;
-      valid_data_len_ += size;
-      return reinterpret_cast<T*>(current_location);
-    }
-
-    /// Return the number of remaining bytes that can be allocated in this block.
-    int BytesRemaining() const {
-      DCHECK(buffer_desc_ != NULL);
-      return buffer_desc_->len - valid_data_len_;
-    }
-
-    /// Return size bytes from the most recent allocation.
-    void ReturnAllocation(int size) {
-      DCHECK_GE(valid_data_len_, size);
-      valid_data_len_ -= size;
-    }
-
-    /// Pointer to start of the block data in memory. Only guaranteed to be valid if the
-    /// block is pinned.
-    uint8_t* buffer() const {
-      DCHECK(buffer_desc_ != NULL);
-      return buffer_desc_->buffer;
-    }
-
-    /// Returns a reference to the valid data in the block's buffer. Only guaranteed to
-    /// be valid if the block is pinned.
-    MemRange valid_data() const {
-      DCHECK(buffer_desc_ != NULL);
-      return MemRange(buffer_desc_->buffer, valid_data_len_);
-    }
-
-    /// Return the number of bytes allocated in this block.
-    int64_t valid_data_len() const { return valid_data_len_; }
-
-    /// Returns the length of the underlying buffer. Only callable if the block is
-    /// pinned.
-    int64_t buffer_len() const {
-      DCHECK(is_pinned());
-      return buffer_desc_->len;
-    }
-
-    /// Returns true if this block is the max block size. Only callable if the block
-    /// is pinned.
-    bool is_max_size() const {
-      DCHECK(is_pinned());
-      return buffer_desc_->len == block_mgr_->max_block_size();
-    }
-
-    bool is_pinned() const { return is_pinned_; }
-
-    /// Path of temporary file backing the block. Intended for use in testing.
-    /// Returns empty string if no backing file allocated.
-    std::string TmpFilePath() const;
-
-    /// Debug helper method to print the state of a block.
-    std::string DebugString() const;
-
-   private:
-    friend class BufferedBlockMgr;
-
-    Block(BufferedBlockMgr* block_mgr);
-
-    /// Initialize the state of a block and set the number of bytes allocated to 0.
-    void Init();
-
-    /// Debug helper method to validate the state of a block. block_mgr_ lock must already
-    /// be taken.
-    bool Validate() const;
-
-    /// Pointer to the buffer associated with the block. NULL if the block is not in
-    /// memory and cannot be changed while the block is pinned or being written.
-    BufferDescriptor* buffer_desc_;
-
-    /// Parent block manager object. Responsible for maintaining the state of the block.
-    BufferedBlockMgr* block_mgr_;
-
-    /// The client that owns this block.
-    Client* client_;
-
-    /// Non-NULL when the block data is written to scratch or is in the process of being
-    /// written.
-    std::unique_ptr<TmpFileMgr::WriteHandle> write_handle_;
-
-    /// Length of valid (i.e. allocated) data within the block.
-    int64_t valid_data_len_;
-
-    /// Number of rows in this block.
-    int num_rows_;
-
-    /// Block state variables. The block's buffer can be freed only if is_pinned_ and
-    /// in_write_ are both false.
-
-    /// is_pinned_ is true while the block is pinned by a client.
-    bool is_pinned_;
-
-    /// in_write_ is set to true when the block is enqueued for writing via DiskIoMgr,
-    /// and set to false when the write is complete.
-    bool in_write_;
-
-    /// True if the block is deleted by the client.
-    bool is_deleted_;
-
-    /// Condition variable to wait for the write to this block to finish. If 'in_write_'
-    /// is true, notify_one() will eventually be called on this condition variable. Only
-    /// on thread should wait on this cv at a time.
-    boost::condition_variable write_complete_cv_;
-
-    /// If true, this block is being written out so the underlying buffer can be
-    /// transferred to another block from the same client. We don't want this buffer
-    /// getting picked up by another client.
-    bool client_local_;
-  }; // class Block
-
-  /// Create a block manager with the specified mem_limit. If a block mgr with the
-  /// same query id has already been created, that block mgr is returned.
-  /// - mem_limit: maximum memory that will be used by the block mgr.
-  /// - buffer_size: maximum size of each buffer.
-  static Status Create(RuntimeState* state, MemTracker* parent,
-      RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr, int64_t mem_limit,
-      int64_t buffer_size, std::shared_ptr<BufferedBlockMgr>* block_mgr);
-
-  ~BufferedBlockMgr();
-
-  /// Registers a client with 'num_reserved_buffers'. The returned client is owned
-  /// by the BufferedBlockMgr and has the same lifetime as it.
-  /// We allow oversubscribing the reserved buffers. It is likely that the
-  /// 'num_reserved_buffers' will be very pessimistic for small queries and we don't want
-  /// to
-  /// fail all of them with mem limit exceeded.
-  /// The min reserved buffers is often independent of data size and we still want
-  /// to run small queries with very small limits.
-  /// Buffers used by this client are reflected in tracker.
-  /// 'tolerates_oversubscription' determines how oversubscription is handled. If true,
-  /// failure to allocate a reserved buffer is not an error. If false, failure to
-  /// allocate a reserved buffer is a MEM_LIMIT_EXCEEDED error.
-  /// 'debug_info' is a string that will be printed in debug messages and errors to
-  /// identify the client.
-  Status RegisterClient(const std::string& debug_info, int num_reserved_buffers,
-      bool tolerates_oversubscription, MemTracker* tracker, RuntimeState* state,
-      Client** client);
-
-  /// Clears all reservations for this client.
-  void ClearReservations(Client* client);
-
-  /// Tries to acquire a one-time reservation of num_buffers. The semantics are:
-  ///  - If this call fails, the next 'num_buffers' calls to Pin()/GetNewBlock() might
-  ///    not have enough memory.
-  ///  - If this call succeeds, the next 'num_buffers' call to Pin()/GetNewBlock() will
-  ///    be guaranteed to get the block. Once these blocks have been pinned, the
-  ///    reservation from this call has no more effect.
-  /// Blocks coming from the tmp reservation also count towards the regular reservation.
-  /// This is useful to Pin() a number of blocks and guarantee all or nothing behavior.
-  bool TryAcquireTmpReservation(Client* client, int num_buffers);
-
-  /// Return a new pinned block. If there is no memory for this block, *block will be set
-  /// to NULL.
-  /// If len > 0, GetNewBlock() will return a block with a buffer of size len. len
-  /// must be less than max_block_size and this block cannot be unpinned.
-  /// This function will try to allocate new memory for the block up to the limit.
-  /// Otherwise it will (conceptually) write out an unpinned block and use that memory.
-  /// The caller can pass a non-NULL 'unpin_block' to transfer memory from 'unpin_block'
-  /// to the new block. If 'unpin_block' is non-NULL, the new block can never fail to
-  /// get a buffer. The semantics of this are:
-  ///   - If 'unpin_block' is non-NULL, it must be pinned.
-  ///   - If the call succeeds, 'unpin_block' is unpinned.
-  ///   - If there is no memory pressure, block will get a newly allocated buffer.
-  ///   - If there is memory pressure, block will get the buffer from 'unpin_block'.
-  Status GetNewBlock(Client* client, Block* unpin_block, Block** block, int64_t len = -1);
-
-  /// Test helper to cancel the block mgr. All subsequent calls that return a Status fail
-  /// with Status::CANCELLED. Idempotent.
-  void Cancel();
-
-  /// Returns true if the block manager was cancelled.
-  bool IsCancelled();
-
-  /// Dumps block mgr state. Grabs lock. If client is not NULL, also dumps its state.
-  std::string DebugString(Client* client = NULL);
-
-  /// Consumes 'size' bytes from the buffered block mgr. This is used by callers that want
-  /// the memory to come from the block mgr pool (and therefore trigger spilling) but need
-  /// the allocation to be more flexible than blocks. Buffer space reserved with
-  /// TryAcquireTmpReservation may be used to fulfill the request if available. If the
-  /// request is unsuccessful, that temporary buffer space is not consumed.
-  /// Returns false if there was not enough memory.
-  ///
-  /// This is used only for the Buckets structure in the hash table, which cannot be
-  /// segmented into blocks.
-  bool ConsumeMemory(Client* client, int64_t size);
-
-  /// All successful allocates bytes from ConsumeMemory() must have a corresponding
-  /// ReleaseMemory() call.
-  void ReleaseMemory(Client* client, int64_t size);
-
-  /// Returns a MEM_LIMIT_EXCEEDED error which includes the minimum memory required by
-  /// this 'client' that acts on behalf of the node with id 'node_id'. 'node_id' is used
-  /// only for error reporting.
-  Status MemLimitTooLowError(Client* client, int node_id);
-
-  int num_pinned_buffers(Client* client) const;
-  int num_reserved_buffers_remaining(Client* client) const;
-  MemTracker* mem_tracker() const { return mem_tracker_.get(); }
-  MemTracker* get_tracker(Client* client) const;
-  int64_t max_block_size() const { return max_block_size_; }
-  int64_t bytes_allocated() const;
-  RuntimeProfile* profile() { return profile_.get(); }
-  int writes_issued() const { return writes_issued_; }
-
-  void set_debug_write_delay_ms(int val) { debug_write_delay_ms_ = val; }
-
- private:
-  friend class BufferedBlockMgrTest;
-  friend struct Client;
-
-  /// Descriptor for a single memory buffer in the pool.
-  struct BufferDescriptor : public InternalQueue<BufferDescriptor>::Node {
-    /// Start of the buffer.
-    uint8_t* buffer;
-
-    /// Length of the buffer.
-    int64_t len;
-
-    /// Block that this buffer is assigned to. May be NULL.
-    Block* block;
-
-    /// Iterator into all_io_buffers_ for this buffer.
-    std::list<BufferDescriptor*>::iterator all_buffers_it;
-
-    BufferDescriptor(uint8_t* buf, int64_t len) : buffer(buf), len(len), block(NULL) {}
-  };
-
-  BufferedBlockMgr(RuntimeState* state, TmpFileMgr* tmp_file_mgr, int64_t block_size,
-      int64_t scratch_limit);
-
-  /// Initializes the block mgr. Idempotent and thread-safe.
-  void Init(DiskIoMgr* io_mgr, TmpFileMgr* tmp_file_mgr, RuntimeProfile* profile,
-      MemTracker* parent_tracker, int64_t mem_limit, int64_t scratch_limit);
-
-  /// PinBlock(), UnpinBlock(), DeleteBlock() perform the actual work of Block::Pin(),
-  /// Unpin() and Delete(). DeleteBlock() must be called without the lock_ taken and
-  /// DeleteBlockLocked() must be called with the lock_ taken.
-  Status PinBlock(Block* block, bool* pinned, Block* src, bool unpin);
-  Status UnpinBlock(Block* block);
-  void DeleteBlock(Block* block);
-  void DeleteBlockLocked(const boost::unique_lock<boost::mutex>& lock, Block* block);
-
-  /// If there is an in-flight write, cancel the write and restore the contents of the
-  /// block's buffer. If no write has been started for 'block', does nothing. 'block'
-  /// must have an associated buffer. Returns an error status if an error is encountered
-  /// while cancelling the write or CANCELLED if the block mgr is cancelled.
-  Status CancelWrite(Block* block);
-
-  /// If the 'block' is NULL, checks if cancelled and returns. Otherwise, depending on
-  /// 'unpin' calls either  DeleteBlock() or UnpinBlock(), which both first check for
-  /// cancellation. It should be called without the lock_ acquired.
-  Status DeleteOrUnpinBlock(Block* block, bool unpin);
-
-  /// Transfers the buffer from 'src' to 'dst'. 'src' must be pinned. If a write is
-  /// already in flight for 'src', this may block until that write completes.
-  /// If unpin == false, 'src' is simply deleted.
-  /// If unpin == true, 'src' is unpinned and it may block until the write of 'src' is
-  /// completed.
-  /// The caller should not hold 'lock_'.
-  Status TransferBuffer(Block* dst, Block* src, bool unpin);
-
-  /// The number of buffers available for client. That is, if all other clients were
-  /// stopped, the number of buffers this client could get.
-  int64_t available_buffers(Client* client) const;
-
-  /// Returns the total number of unreserved buffers. This is the sum of unpinned,
-  /// free and buffers we can still allocate minus the total number of reserved buffers
-  /// that are not pinned.
-  /// Note this can be negative if the buffers are oversubscribed.
-  /// Must be called with lock_ taken.
-  int64_t remaining_unreserved_buffers() const;
-
-  /// Finds a buffer for a block and pins it. If the block's buffer has not been evicted,
-  /// it removes the block from the unpinned list and sets *in_mem = true.
-  /// If the block is not in memory, it will call FindBuffer() that may block.
-  /// If we can't get a buffer (e.g. no more memory, nothing in the unpinned and free
-  /// lists) this function returns with the block unpinned.
-  /// Uses the lock_, the caller should not have already acquired the lock_.
-  Status FindBufferForBlock(Block* block, bool* in_mem);
-
-  /// Returns a new buffer that can be used. *buffer is set to NULL if there was no
-  /// memory.
-  /// Otherwise, this function gets a new buffer by:
-  ///   1. Allocating a new buffer if possible
-  ///   2. Using a buffer from the free list (which is populated by moving blocks from
-  ///      the unpinned list by writing them out).
-  /// Must be called with the lock_ already taken. This function can block.
-  Status FindBuffer(boost::unique_lock<boost::mutex>& lock, BufferDescriptor** buffer);
-
-  /// Writes unpinned blocks via DiskIoMgr until one of the following is true:
-  ///   1. The number of outstanding writes >= (block_write_threshold_ - num free buffers)
-  ///   2. There are no more unpinned blocks
-  /// Must be called with the lock_ already taken. Is not blocking.
-  Status WriteUnpinnedBlocks();
-
-  /// Issues the write for this block to the DiskIoMgr.
-  Status WriteUnpinnedBlock(Block* block);
-
-  /// Wait until either there is no in-flight write for 'block' or the block mgr is
-  /// cancelled. 'lock_' must be held with 'lock'.
-  void WaitForWrite(boost::unique_lock<boost::mutex>& lock, Block* block);
-
-  /// Callback used by DiskIoMgr to indicate a block write has completed.  write_status
-  /// is the status of the write. is_cancelled_ is set to true if write_status is not
-  /// Status::OK or a re-issue of the write fails. Returns the block's buffer to the
-  /// free buffers list if it is no longer pinned. Returns the block itself to the free
-  /// blocks list if it has been deleted.
-  void WriteComplete(Block* block, const Status& write_status);
-
-  /// Returns a deleted block to the list of free blocks. Assumes the block's buffer has
-  /// already been returned to the free buffers list. Non-blocking.
-  /// Thread-safe and does not need the lock_ acquired.
-  void ReturnUnusedBlock(Block* block);
-
-  /// Checks unused_blocks_ for an unused block object, else allocates a new one.
-  /// Non-blocking and needs no lock_.
-  Block* GetUnusedBlock(Client* client);
-
-  // Test helper to get the number of block writes currently outstanding.
-  int64_t GetNumWritesOutstanding();
-
-  /// Used to debug the state of the block manager. Lock must already be taken.
-  bool Validate() const;
-  std::string DebugInternal() const;
-
-  /// Size of the largest/default block in bytes.
-  const int64_t max_block_size_;
-
-  /// Unpinned blocks are written when the number of free buffers is below this threshold.
-  /// Equal to two times the number of disks.
-  const int block_write_threshold_;
-
-  /// If true, spilling is disabled. The client calls will fail if there is not enough
-  /// memory.
-  const bool disable_spill_;
-
-  const TUniqueId query_id_;
-
-  ObjectPool obj_pool_;
-
-  /// Track buffers allocated by the block manager.
-  boost::scoped_ptr<MemTracker> mem_tracker_;
-
-  /// This lock protects the block and buffer lists below, except for unused_blocks_.
-  /// It also protects the various counters and changes to block state. Additionally, it
-  /// is used for the blocking condvars: buffer_available_cv_ and
-  /// block->write_complete_cv_.
-  boost::mutex lock_;
-
-  /// If true, Init() has been called.
-  bool initialized_;
-
-  /// The total number of reserved buffers across all clients that are not pinned.
-  int unfullfilled_reserved_buffers_;
-
-  /// The total number of pinned buffers across all clients.
-  int total_pinned_buffers_;
-
-  /// Number of outstanding writes (Writes issued but not completed).
-  /// This does not include client-local writes.
-  int non_local_outstanding_writes_;
-
-  /// Signal availability of free buffers. Also signalled when a write completes for a
-  /// pinned block, in case another thread was expecting to obtain its buffer. If
-  /// 'non_local_outstanding_writes_' > 0, notify_all() will eventually be called on
-  /// this condition variable. To avoid free buffers accumulating while threads wait
-  /// on the cv, a woken thread must grab an available buffer (unless is_cancelled_ is
-  /// true at that time).
-  boost::condition_variable buffer_available_cv_;
-
-  /// All used or unused blocks allocated by the BufferedBlockMgr.
-  vector<Block*> all_blocks_;
-
-  /// List of blocks is_pinned_ = false AND are not on DiskIoMgr's write queue.
-  /// Blocks are added to and removed from the back of the list. (i.e. in LIFO order).
-  /// Blocks in this list must have is_pinned_ = false, in_write_ = false,
-  /// is_deleted_ = false.
-  InternalQueue<Block> unpinned_blocks_;
-
-  /// List of blocks that have been deleted and are no longer in use.
-  /// Can be reused in GetNewBlock(). Blocks in this list must be in the Init'ed state,
-  /// i.e. buffer_desc_ = NULL, is_pinned_ = false, in_write_ = false,
-  /// is_deleted_ = false, valid_data_len = 0.
-  InternalQueue<Block> unused_blocks_;
-
-  /// List of buffers that can be assigned to a block in Pin() or GetNewBlock().
-  /// These buffers either have no block associated with them or are associated with an
-  /// an unpinned block that has been persisted. That is, either block = NULL or
-  /// (!block->is_pinned_  && !block->in_write_  && !unpinned_blocks_.Contains(block)).
-  /// All of these buffers are io sized.
-  InternalQueue<BufferDescriptor> free_io_buffers_;
-
-  /// All allocated io-sized buffers.
-  std::list<BufferDescriptor*> all_io_buffers_;
-
-  /// Group of temporary physical files, (one per tmp device) to which
-  /// blocks may be written. Blocks are round-robined across these files.
-  boost::scoped_ptr<TmpFileMgr::FileGroup> tmp_file_group_;
-
-  /// If true, a disk write failed and all API calls return.
-  /// Status::CANCELLED. Set to true if there was an error writing a block, or if
-  /// WriteComplete() needed to reissue the write and that failed.
-  bool is_cancelled_;
-
-  /// Counters and timers to track behavior.
-  boost::scoped_ptr<RuntimeProfile> profile_;
-
-  /// These have a fixed value for the lifetime of the manager and show memory usage.
-  RuntimeProfile::Counter* mem_limit_counter_;
-  RuntimeProfile::Counter* block_size_counter_;
-
-  /// Total number of blocks created.
-  RuntimeProfile::Counter* created_block_counter_;
-
-  /// Number of deleted blocks reused.
-  RuntimeProfile::Counter* recycled_blocks_counter_;
-
-  /// Number of Pin() calls that did not require a disk read.
-  RuntimeProfile::Counter* buffered_pin_counter_;
-
-  /// Time spent waiting for a free buffer.
-  RuntimeProfile::Counter* buffer_wait_timer_;
-
-  /// Number of writes outstanding (issued but not completed).
-  RuntimeProfile::Counter* outstanding_writes_counter_;
-
-  /// Number of writes issued.
-  int writes_issued_;
-
-  /// Protects query_to_block_mgrs_.
-  static SpinLock static_block_mgrs_lock_;
-
-  /// All per-query BufferedBlockMgr objects that are in use.  For memory management, this
-  /// map contains only weak ptrs. BufferedBlockMgrs that are handed out are shared ptrs.
-  /// When all the shared ptrs are no longer referenced, the BufferedBlockMgr
-  /// d'tor will be called at which point the weak ptr will be removed from the map.
-  typedef boost::unordered_map<TUniqueId, std::weak_ptr<BufferedBlockMgr>> BlockMgrsMap;
-  static BlockMgrsMap query_to_block_mgrs_;
-
-  /// Debug option to delay write completion.
-  int debug_write_delay_ms_;
-
-}; // class BufferedBlockMgr
-
-} // namespace impala.
-
-#endif


[06/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-tuple-stream-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream-test.cc b/be/src/runtime/buffered-tuple-stream-test.cc
deleted file mode 100644
index 0904833..0000000
--- a/be/src/runtime/buffered-tuple-stream-test.cc
+++ /dev/null
@@ -1,1264 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <boost/scoped_ptr.hpp>
-#include <boost/bind.hpp>
-#include <boost/filesystem.hpp>
-
-#include <set>
-#include <string>
-#include <limits> // for std::numeric_limits<int>::max()
-
-#include "testutil/gtest-util.h"
-#include "codegen/llvm-codegen.h"
-#include "gutil/gscoped_ptr.h"
-#include "runtime/buffered-tuple-stream.inline.h"
-#include "runtime/collection-value.h"
-#include "runtime/collection-value-builder.h"
-#include "runtime/raw-value.h"
-#include "runtime/row-batch.h"
-#include "runtime/string-value.inline.h"
-#include "runtime/test-env.h"
-#include "runtime/tmp-file-mgr.h"
-#include "service/fe-support.h"
-#include "testutil/desc-tbl-builder.h"
-#include "util/test-info.h"
-
-#include "gen-cpp/Types_types.h"
-#include "gen-cpp/ImpalaInternalService_types.h"
-
-#include "common/names.h"
-
-using kudu::FreeDeleter;
-
-static const int BATCH_SIZE = 250;
-static const int IO_BLOCK_SIZE = 8 * 1024 * 1024;
-static const uint32_t PRIME = 479001599;
-
-namespace impala {
-
-static const StringValue STRINGS[] = {
-  StringValue("ABC"),
-  StringValue("HELLO"),
-  StringValue("123456789"),
-  StringValue("FOOBAR"),
-  StringValue("ONE"),
-  StringValue("THREE"),
-  StringValue("abcdefghijklmno"),
-  StringValue("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
-  StringValue("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
-};
-
-static const int NUM_STRINGS = sizeof(STRINGS) / sizeof(StringValue);
-
-class SimpleTupleStreamTest : public testing::Test {
- protected:
-  virtual void SetUp() {
-    test_env_.reset(new TestEnv());
-    ASSERT_OK(test_env_->Init());
-
-    CreateDescriptors();
-
-    mem_pool_.reset(new MemPool(&tracker_));
-  }
-
-  virtual void CreateDescriptors() {
-    vector<bool> nullable_tuples(1, false);
-    vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
-    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_desc_ = pool_.Add(new RowDescriptor(
-        *int_builder.Build(), tuple_ids, nullable_tuples));
-
-    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_desc_ = pool_.Add(new RowDescriptor(
-        *string_builder.Build(), tuple_ids, nullable_tuples));
-  }
-
-  virtual void TearDown() {
-    runtime_state_ = NULL;
-    client_ = NULL;
-    pool_.Clear();
-    mem_pool_->FreeAll();
-    test_env_.reset();
-  }
-
-  /// Setup a block manager with the provided settings and client with no reservation,
-  /// tracked by tracker_.
-  void InitBlockMgr(int64_t limit, int block_size) {
-    ASSERT_OK(test_env_->CreateQueryStateWithBlockMgr(
-        0, limit, block_size, nullptr, &runtime_state_));
-    MemTracker* client_tracker =
-        pool_.Add(new MemTracker(-1, "client", runtime_state_->instance_mem_tracker()));
-    ASSERT_OK(runtime_state_->block_mgr()->RegisterClient(
-        "", 0, false, client_tracker, runtime_state_, &client_));
-  }
-
-  /// Generate the ith element of a sequence of int values.
-  int GenIntValue(int i) {
-    // Multiply by large prime to get varied bit patterns.
-    return i * PRIME;
-  }
-
-  /// Generate the ith element of a sequence of bool values.
-  bool GenBoolValue(int i) {
-    // Use a middle bit of the int value.
-    return ((GenIntValue(i) >> 8) & 0x1) != 0;
-  }
-
-  /// Count the total number of slots per row based on the given row descriptor.
-  int CountSlotsPerRow(const RowDescriptor& row_desc) {
-    int slots_per_row = 0;
-    for (int i = 0; i < row_desc.tuple_descriptors().size(); ++i) {
-      TupleDescriptor* tuple_desc = row_desc.tuple_descriptors()[i];
-      slots_per_row += tuple_desc->slots().size();
-    }
-    return slots_per_row;
-  }
-
-  /// Allocate a row batch with 'num_rows' of rows with layout described by 'row_desc'.
-  /// 'offset' is used to account for rows occupied by any previous row batches. This is
-  /// needed to match the values generated in VerifyResults(). If 'gen_null' is true,
-  /// some tuples will be set to NULL.
-  virtual RowBatch* CreateBatch(
-      const RowDescriptor* row_desc, int offset, int num_rows, bool gen_null) {
-    RowBatch* batch = pool_.Add(new RowBatch(row_desc, num_rows, &tracker_));
-    int num_tuples = row_desc->tuple_descriptors().size();
-
-    int idx = offset * CountSlotsPerRow(*row_desc);
-    for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
-      TupleRow* row = batch->GetRow(row_idx);
-      for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
-        TupleDescriptor* tuple_desc = row_desc->tuple_descriptors()[tuple_idx];
-        Tuple* tuple = Tuple::Create(tuple_desc->byte_size(), batch->tuple_data_pool());
-        bool is_null = gen_null && !GenBoolValue(idx);
-        for (int slot_idx = 0; slot_idx < tuple_desc->slots().size(); ++slot_idx, ++idx) {
-          SlotDescriptor* slot_desc = tuple_desc->slots()[slot_idx];
-          void* slot = tuple->GetSlot(slot_desc->tuple_offset());
-          switch (slot_desc->type().type) {
-            case TYPE_INT:
-              *reinterpret_cast<int*>(slot) = GenIntValue(idx);
-              break;
-            case TYPE_STRING:
-              *reinterpret_cast<StringValue*>(slot) = STRINGS[idx % NUM_STRINGS];
-              break;
-            default:
-              // The memory has been zero'ed out already by Tuple::Create().
-              break;
-          }
-        }
-        if (is_null) {
-          row->SetTuple(tuple_idx, NULL);
-        } else {
-          row->SetTuple(tuple_idx, tuple);
-        }
-      }
-      batch->CommitLastRow();
-    }
-    return batch;
-  }
-
-  virtual RowBatch* CreateIntBatch(int offset, int num_rows, bool gen_null) {
-    return CreateBatch(int_desc_, offset, num_rows, gen_null);
-  }
-
-  virtual RowBatch* CreateStringBatch(int offset, int num_rows, bool gen_null) {
-    return CreateBatch(string_desc_, offset, num_rows, gen_null);
-  }
-
-  void AppendValue(uint8_t* ptr, vector<int>* results) {
-    if (ptr == NULL) {
-      // For the tests indicate null-ability using the max int value
-      results->push_back(std::numeric_limits<int>::max());
-    } else {
-      results->push_back(*reinterpret_cast<int*>(ptr));
-    }
-  }
-
-  void AppendValue(uint8_t* ptr, vector<StringValue>* results) {
-    if (ptr == NULL) {
-      results->push_back(StringValue());
-    } else {
-      StringValue sv = *reinterpret_cast<StringValue*>(ptr);
-      uint8_t* copy = mem_pool_->Allocate(sv.len);
-      memcpy(copy, sv.ptr, sv.len);
-      sv.ptr = reinterpret_cast<char*>(copy);
-      results->push_back(sv);
-    }
-  }
-
-  template <typename T>
-  void AppendRowTuples(TupleRow* row, RowDescriptor* row_desc, vector<T>* results) {
-    DCHECK(row != NULL);
-    const int num_tuples = row_desc->tuple_descriptors().size();
-
-    for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
-      TupleDescriptor* tuple_desc = row_desc->tuple_descriptors()[tuple_idx];
-      Tuple* tuple = row->GetTuple(tuple_idx);
-      const int num_slots = tuple_desc->slots().size();
-      for (int slot_idx = 0; slot_idx < num_slots; ++slot_idx) {
-        SlotDescriptor* slot_desc = tuple_desc->slots()[slot_idx];
-        if (tuple == NULL) {
-          AppendValue(NULL, results);
-        } else {
-          void* slot = tuple->GetSlot(slot_desc->tuple_offset());
-          AppendValue(reinterpret_cast<uint8_t*>(slot), results);
-        }
-      }
-    }
-  }
-
-  template <typename T>
-  void ReadValues(BufferedTupleStream* stream, RowDescriptor* desc, vector<T>* results,
-      int num_batches = -1) {
-    bool eos = false;
-    RowBatch batch(desc, BATCH_SIZE, &tracker_);
-    int batches_read = 0;
-    do {
-      batch.Reset();
-      EXPECT_OK(stream->GetNext(&batch, &eos));
-      ++batches_read;
-      for (int i = 0; i < batch.num_rows(); ++i) {
-        AppendRowTuples(batch.GetRow(i), desc, results);
-      }
-    } while (!eos && (num_batches < 0 || batches_read <= num_batches));
-  }
-
-  void GetExpectedValue(int idx, bool is_null, int* val) {
-    if (is_null) {
-      *val = std::numeric_limits<int>::max();
-    } else {
-      *val = GenIntValue(idx);
-    }
-  }
-
-  void GetExpectedValue(int idx, bool is_null, StringValue* val) {
-    if (is_null) {
-      *val = StringValue();
-    } else {
-      *val = STRINGS[idx % NUM_STRINGS];
-    }
-  }
-
-  template <typename T>
-  void VerifyResults(const RowDescriptor& row_desc, const vector<T>& results,
-      int num_rows, bool gen_null) {
-    int idx = 0;
-    for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
-      const int num_tuples = row_desc.tuple_descriptors().size();
-      for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
-        const TupleDescriptor* tuple_desc = row_desc.tuple_descriptors()[tuple_idx];
-        const int num_slots = tuple_desc->slots().size();
-        bool is_null = gen_null && !GenBoolValue(idx);
-        for (int slot_idx = 0; slot_idx < num_slots; ++slot_idx, ++idx) {
-          T expected_val;
-          GetExpectedValue(idx, is_null, &expected_val);
-          ASSERT_EQ(results[idx], expected_val)
-              << "results[" << idx << "] " << results[idx] << " != "
-              << expected_val << " row_idx=" << row_idx
-              << " tuple_idx=" << tuple_idx << " slot_idx=" << slot_idx
-              << " gen_null=" << gen_null;
-        }
-      }
-    }
-    DCHECK_EQ(results.size(), idx);
-  }
-
-  // Test adding num_batches of ints to the stream and reading them back.
-  // If unpin_stream is true, operate the stream in unpinned mode.
-  // Assumes that enough buffers are available to read and write the stream.
-  template <typename T>
-  void TestValues(int num_batches, RowDescriptor* desc, bool gen_null,
-      bool unpin_stream, int num_rows = BATCH_SIZE, bool use_small_buffers = true) {
-    BufferedTupleStream stream(runtime_state_, desc, runtime_state_->block_mgr(), client_,
-        use_small_buffers, false);
-    ASSERT_OK(stream.Init(-1, NULL, true));
-    bool got_write_buffer;
-    ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
-    ASSERT_TRUE(got_write_buffer);
-
-    if (unpin_stream) {
-      ASSERT_OK(stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
-    }
-    // Add rows to the stream
-    int offset = 0;
-    for (int i = 0; i < num_batches; ++i) {
-      RowBatch* batch = NULL;
-
-      Status status;
-      ASSERT_TRUE(sizeof(T) == sizeof(int) || sizeof(T) == sizeof(StringValue));
-      batch = CreateBatch(desc, offset, num_rows, gen_null);
-      for (int j = 0; j < batch->num_rows(); ++j) {
-        bool b = stream.AddRow(batch->GetRow(j), &status);
-        ASSERT_OK(status);
-        if (!b) {
-          ASSERT_TRUE(stream.using_small_buffers());
-          bool got_buffer;
-          ASSERT_OK(stream.SwitchToIoBuffers(&got_buffer));
-          ASSERT_TRUE(got_buffer);
-          b = stream.AddRow(batch->GetRow(j), &status);
-          ASSERT_OK(status);
-        }
-        ASSERT_TRUE(b);
-      }
-      offset += batch->num_rows();
-      // Reset the batch to make sure the stream handles the memory correctly.
-      batch->Reset();
-    }
-
-    bool got_read_buffer;
-    ASSERT_OK(stream.PrepareForRead(false, &got_read_buffer));
-    ASSERT_TRUE(got_read_buffer);
-
-    // Read all the rows back
-    vector<T> results;
-    ReadValues(&stream, desc, &results);
-
-    // Verify result
-    VerifyResults<T>(*desc, results, num_rows * num_batches, gen_null);
-
-    stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-  }
-
-  void TestIntValuesInterleaved(int num_batches, int num_batches_before_read,
-      bool unpin_stream) {
-    for (int small_buffers = 0; small_buffers < 2; ++small_buffers) {
-      BufferedTupleStream stream(runtime_state_, int_desc_, runtime_state_->block_mgr(),
-          client_, small_buffers == 0, // initial small buffers
-          true); // read_write
-      ASSERT_OK(stream.Init(-1, NULL, true));
-      bool got_write_buffer;
-      ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
-      ASSERT_TRUE(got_write_buffer);
-      bool got_read_buffer;
-      ASSERT_OK(stream.PrepareForRead(true, &got_read_buffer));
-      ASSERT_TRUE(got_read_buffer);
-      if (unpin_stream) {
-        ASSERT_OK(stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
-      }
-
-      vector<int> results;
-
-      for (int i = 0; i < num_batches; ++i) {
-        RowBatch* batch = CreateIntBatch(i * BATCH_SIZE, BATCH_SIZE, false);
-        for (int j = 0; j < batch->num_rows(); ++j) {
-          Status status;
-          bool b = stream.AddRow(batch->GetRow(j), &status);
-          ASSERT_TRUE(b);
-          ASSERT_OK(status);
-        }
-        // Reset the batch to make sure the stream handles the memory correctly.
-        batch->Reset();
-        if (i % num_batches_before_read == 0) {
-          ReadValues(&stream, int_desc_, &results,
-              (rand() % num_batches_before_read) + 1);
-        }
-      }
-      ReadValues(&stream, int_desc_, &results);
-
-      VerifyResults<int>(*int_desc_, results, BATCH_SIZE * num_batches, false);
-
-      stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-    }
-  }
-
-  void TestUnpinPin(bool varlen_data);
-
-  void TestTransferMemory(bool pinned_stream, bool read_write);
-
-  scoped_ptr<TestEnv> test_env_;
-  RuntimeState* runtime_state_;
-  BufferedBlockMgr::Client* client_;
-
-  MemTracker tracker_;
-  ObjectPool pool_;
-  RowDescriptor* int_desc_;
-  RowDescriptor* string_desc_;
-  scoped_ptr<MemPool> mem_pool_;
-};
-
-
-// Tests with a non-NULLable tuple per row.
-class SimpleNullStreamTest : public SimpleTupleStreamTest {
- protected:
-  virtual void CreateDescriptors() {
-    vector<bool> nullable_tuples(1, true);
-    vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
-    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_desc_ = pool_.Add(new RowDescriptor(
-        *int_builder.Build(), tuple_ids, nullable_tuples));
-
-    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_desc_ = pool_.Add(new RowDescriptor(
-        *string_builder.Build(), tuple_ids, nullable_tuples));
-  }
-}; // SimpleNullStreamTest
-
-// Tests with multiple non-NULLable tuples per row.
-class MultiTupleStreamTest : public SimpleTupleStreamTest {
- protected:
-  virtual void CreateDescriptors() {
-    vector<bool> nullable_tuples;
-    nullable_tuples.push_back(false);
-    nullable_tuples.push_back(false);
-    nullable_tuples.push_back(false);
-
-    vector<TTupleId> tuple_ids;
-    tuple_ids.push_back(static_cast<TTupleId>(0));
-    tuple_ids.push_back(static_cast<TTupleId>(1));
-    tuple_ids.push_back(static_cast<TTupleId>(2));
-
-    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_desc_ = pool_.Add(new RowDescriptor(
-        *int_builder.Build(), tuple_ids, nullable_tuples));
-
-    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_desc_ = pool_.Add(new RowDescriptor(
-        *string_builder.Build(), tuple_ids, nullable_tuples));
-  }
-};
-
-// Tests with multiple NULLable tuples per row.
-class MultiNullableTupleStreamTest : public SimpleTupleStreamTest {
- protected:
-  virtual void CreateDescriptors() {
-    vector<bool> nullable_tuples;
-    nullable_tuples.push_back(false);
-    nullable_tuples.push_back(true);
-    nullable_tuples.push_back(true);
-
-    vector<TTupleId> tuple_ids;
-    tuple_ids.push_back(static_cast<TTupleId>(0));
-    tuple_ids.push_back(static_cast<TTupleId>(1));
-    tuple_ids.push_back(static_cast<TTupleId>(2));
-
-    DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_builder.DeclareTuple() << TYPE_INT;
-    int_desc_ = pool_.Add(new RowDescriptor(
-        *int_builder.Build(), tuple_ids, nullable_tuples));
-
-    DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_builder.DeclareTuple() << TYPE_STRING;
-    string_desc_ = pool_.Add(new RowDescriptor(
-        *string_builder.Build(), tuple_ids, nullable_tuples));
-  }
-};
-
-/// Tests with collection types.
-class ArrayTupleStreamTest : public SimpleTupleStreamTest {
- protected:
-  RowDescriptor* array_desc_;
-
-  virtual void CreateDescriptors() {
-    // tuples: (array<string>, array<array<int>>) (array<int>)
-    vector<bool> nullable_tuples(2, true);
-    vector<TTupleId> tuple_ids;
-    tuple_ids.push_back(static_cast<TTupleId>(0));
-    tuple_ids.push_back(static_cast<TTupleId>(1));
-    ColumnType string_array_type;
-    string_array_type.type = TYPE_ARRAY;
-    string_array_type.children.push_back(TYPE_STRING);
-
-    ColumnType int_array_type;
-    int_array_type.type = TYPE_ARRAY;
-    int_array_type.children.push_back(TYPE_STRING);
-
-    ColumnType nested_array_type;
-    nested_array_type.type = TYPE_ARRAY;
-    nested_array_type.children.push_back(int_array_type);
-
-    DescriptorTblBuilder builder(test_env_->exec_env()->frontend(), &pool_);
-    builder.DeclareTuple() << string_array_type << nested_array_type;
-    builder.DeclareTuple() << int_array_type;
-    array_desc_ = pool_.Add(new RowDescriptor(
-        *builder.Build(), tuple_ids, nullable_tuples));
-  }
-};
-
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleTupleStreamTest, Basic) {
-  InitBlockMgr(-1, IO_BLOCK_SIZE);
-  TestValues<int>(1, int_desc_, false, true);
-  TestValues<int>(10, int_desc_, false, true);
-  TestValues<int>(100, int_desc_, false, true);
-  TestValues<int>(1, int_desc_, false, false);
-  TestValues<int>(10, int_desc_, false, false);
-  TestValues<int>(100, int_desc_, false, false);
-
-  TestValues<StringValue>(1, string_desc_, false, true);
-  TestValues<StringValue>(10, string_desc_, false, true);
-  TestValues<StringValue>(100, string_desc_, false, true);
-  TestValues<StringValue>(1, string_desc_, false, false);
-  TestValues<StringValue>(10, string_desc_, false, false);
-  TestValues<StringValue>(100, string_desc_, false, false);
-
-  TestIntValuesInterleaved(1, 1, true);
-  TestIntValuesInterleaved(10, 5, true);
-  TestIntValuesInterleaved(100, 15, true);
-  TestIntValuesInterleaved(1, 1, false);
-  TestIntValuesInterleaved(10, 5, false);
-  TestIntValuesInterleaved(100, 15, false);
-}
-
-// Test with only 1 buffer.
-TEST_F(SimpleTupleStreamTest, OneBufferSpill) {
-  // Each buffer can only hold 100 ints, so this spills quite often.
-  int buffer_size = 100 * sizeof(int);
-  InitBlockMgr(buffer_size, buffer_size);
-  TestValues<int>(1, int_desc_, false, true);
-  TestValues<int>(10, int_desc_, false, true);
-
-  TestValues<StringValue>(1, string_desc_, false, true);
-  TestValues<StringValue>(10, string_desc_, false, true);
-}
-
-// Test with a few buffers.
-TEST_F(SimpleTupleStreamTest, ManyBufferSpill) {
-  int buffer_size = 100 * sizeof(int);
-  InitBlockMgr(10 * buffer_size, buffer_size);
-
-  TestValues<int>(1, int_desc_, false, true);
-  TestValues<int>(10, int_desc_, false, true);
-  TestValues<int>(100, int_desc_, false, true);
-  TestValues<StringValue>(1, string_desc_, false, true);
-  TestValues<StringValue>(10, string_desc_, false, true);
-  TestValues<StringValue>(100, string_desc_, false, true);
-
-  TestIntValuesInterleaved(1, 1, true);
-  TestIntValuesInterleaved(10, 5, true);
-  TestIntValuesInterleaved(100, 15, true);
-}
-
-void SimpleTupleStreamTest::TestUnpinPin(bool varlen_data) {
-  int buffer_size = 100 * sizeof(int);
-  InitBlockMgr(3 * buffer_size, buffer_size);
-  RowDescriptor* row_desc = varlen_data ? string_desc_ : int_desc_;
-
-  BufferedTupleStream stream(
-      runtime_state_, row_desc, runtime_state_->block_mgr(), client_, true, false);
-  ASSERT_OK(stream.Init(-1, NULL, true));
-  bool got_write_buffer;
-  ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
-  ASSERT_TRUE(got_write_buffer);
-
-  int offset = 0;
-  bool full = false;
-  while (!full) {
-    RowBatch* batch = varlen_data ? CreateStringBatch(offset, BATCH_SIZE, false)
-                                  : CreateIntBatch(offset, BATCH_SIZE, false);
-    int j = 0;
-    for (; j < batch->num_rows(); ++j) {
-      Status status;
-      full = !stream.AddRow(batch->GetRow(j), &status);
-      ASSERT_OK(status);
-      if (full) break;
-    }
-    offset += j;
-  }
-
-  ASSERT_OK(stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
-
-  bool pinned = false;
-  ASSERT_OK(stream.PinStream(false, &pinned));
-  ASSERT_TRUE(pinned);
-
-
-  // Read and verify result a few times. We should be able to reread the stream if
-  // we don't use delete on read mode.
-  int read_iters = 3;
-  for (int i = 0; i < read_iters; ++i) {
-    bool delete_on_read = i == read_iters - 1;
-    bool got_read_buffer;
-    ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_buffer));
-    ASSERT_TRUE(got_read_buffer);
-
-    if (varlen_data) {
-      vector<StringValue> results;
-      ReadValues(&stream, row_desc, &results);
-      VerifyResults<StringValue>(*string_desc_, results, offset, false);
-    } else {
-      vector<int> results;
-      ReadValues(&stream, row_desc, &results);
-      VerifyResults<int>(*int_desc_, results, offset, false);
-    }
-  }
-
-  // After delete_on_read, all blocks aside from the last should be deleted.
-  // Note: this should really be 0, but the BufferedTupleStream returns eos before
-  // deleting the last block, rather than after, so the last block isn't deleted
-  // until the stream is closed.
-  ASSERT_EQ(stream.bytes_in_mem(false), buffer_size);
-
-  stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-
-  ASSERT_EQ(stream.bytes_in_mem(false), 0);
-}
-
-TEST_F(SimpleTupleStreamTest, UnpinPin) {
-  TestUnpinPin(false);
-}
-
-TEST_F(SimpleTupleStreamTest, UnpinPinVarlen) {
-  TestUnpinPin(false);
-}
-
-TEST_F(SimpleTupleStreamTest, SmallBuffers) {
-  int buffer_size = IO_BLOCK_SIZE;
-  InitBlockMgr(2 * buffer_size, buffer_size);
-
-  BufferedTupleStream stream(
-      runtime_state_, int_desc_, runtime_state_->block_mgr(), client_, true, false);
-  ASSERT_OK(stream.Init(-1, NULL, false));
-  bool got_write_buffer;
-  ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
-  ASSERT_TRUE(got_write_buffer);
-
-  // Initial buffer should be small.
-  EXPECT_LT(stream.bytes_in_mem(false), buffer_size);
-
-  RowBatch* batch = CreateIntBatch(0, 1024, false);
-
-  Status status;
-  for (int i = 0; i < batch->num_rows(); ++i) {
-    bool ret = stream.AddRow(batch->GetRow(i), &status);
-    EXPECT_TRUE(ret);
-    ASSERT_OK(status);
-  }
-  EXPECT_LT(stream.bytes_in_mem(false), buffer_size);
-  EXPECT_LT(stream.byte_size(), buffer_size);
-  ASSERT_TRUE(stream.using_small_buffers());
-
-  // 40 MB of ints
-  batch = CreateIntBatch(0, 10 * 1024 * 1024, false);
-  for (int i = 0; i < batch->num_rows(); ++i) {
-    bool ret = stream.AddRow(batch->GetRow(i), &status);
-    ASSERT_OK(status);
-    if (!ret) {
-      ASSERT_TRUE(stream.using_small_buffers());
-      bool got_buffer;
-      ASSERT_OK(stream.SwitchToIoBuffers(&got_buffer));
-      ASSERT_TRUE(got_buffer);
-      ret = stream.AddRow(batch->GetRow(i), &status);
-      ASSERT_OK(status);
-    }
-    ASSERT_TRUE(ret);
-  }
-  EXPECT_EQ(stream.bytes_in_mem(false), buffer_size);
-
-  // TODO: Test for IMPALA-2330. In case SwitchToIoBuffers() fails to get buffer then
-  // using_small_buffers() should still return true.
-  stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-void SimpleTupleStreamTest::TestTransferMemory(bool pin_stream, bool read_write) {
-  // Use smaller buffers so that the explicit FLUSH_RESOURCES flag is required to
-  // make the batch at capacity.
-  int buffer_size = 4 * 1024;
-  InitBlockMgr(100 * buffer_size, buffer_size);
-
-  BufferedTupleStream stream(
-      runtime_state_, int_desc_, runtime_state_->block_mgr(), client_, false, read_write);
-  ASSERT_OK(stream.Init(-1, NULL, pin_stream));
-  bool got_write_buffer;
-  ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
-  ASSERT_TRUE(got_write_buffer);
-  RowBatch* batch = CreateIntBatch(0, 1024, false);
-
-  // Construct a stream with 4 blocks.
-  const int total_num_blocks = 4;
-  while (stream.byte_size() < total_num_blocks * buffer_size) {
-    Status status;
-    for (int i = 0; i < batch->num_rows(); ++i) {
-      bool ret = stream.AddRow(batch->GetRow(i), &status);
-      EXPECT_TRUE(ret);
-      ASSERT_OK(status);
-    }
-  }
-
-  bool got_read_buffer;
-  ASSERT_OK(stream.PrepareForRead(true, &got_read_buffer));
-  ASSERT_TRUE(got_read_buffer);
-
-  batch->Reset();
-  stream.Close(batch, RowBatch::FlushMode::FLUSH_RESOURCES);
-  if (pin_stream) {
-    DCHECK_EQ(total_num_blocks, batch->num_blocks());
-  } else if (read_write) {
-    // Read and write block should be attached.
-    DCHECK_EQ(2, batch->num_blocks());
-  } else {
-    // Read block should be attached.
-    DCHECK_EQ(1, batch->num_blocks());
-  }
-  DCHECK(batch->AtCapacity()); // Flush resources flag should have been set.
-  batch->Reset();
-  DCHECK_EQ(0, batch->num_blocks());
-}
-
-/// Test attaching memory to a row batch from a pinned stream.
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromPinnedStreamReadWrite) {
-  TestTransferMemory(true, true);
-}
-
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromPinnedStreamNoReadWrite) {
-  TestTransferMemory(true, false);
-}
-
-/// Test attaching memory to a row batch from an unpinned stream.
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromUnpinnedStreamReadWrite) {
-  TestTransferMemory(false, true);
-}
-
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromUnpinnedStreamNoReadWrite) {
-  TestTransferMemory(false, false);
-}
-
-// Test that tuple stream functions if it references strings outside stream. The
-// aggregation node relies on this since it updates tuples in-place.
-TEST_F(SimpleTupleStreamTest, StringsOutsideStream) {
-  int buffer_size = 8 * 1024 * 1024;
-  InitBlockMgr(2 * buffer_size, buffer_size);
-  Status status = Status::OK();
-
-  int num_batches = 100;
-  int rows_added = 0;
-  DCHECK_EQ(string_desc_->tuple_descriptors().size(), 1);
-  TupleDescriptor& tuple_desc = *string_desc_->tuple_descriptors()[0];
-
-  set<SlotId> external_slots;
-  for (int i = 0; i < tuple_desc.string_slots().size(); ++i) {
-    external_slots.insert(tuple_desc.string_slots()[i]->id());
-  }
-
-  BufferedTupleStream stream(runtime_state_, string_desc_, runtime_state_->block_mgr(),
-      client_, true, false, external_slots);
-  for (int i = 0; i < num_batches; ++i) {
-    RowBatch* batch = CreateStringBatch(rows_added, BATCH_SIZE, false);
-    for (int j = 0; j < batch->num_rows(); ++j) {
-      uint8_t* varlen_data;
-      int fixed_size = tuple_desc.byte_size();
-      uint8_t* tuple = stream.AllocateRow(fixed_size, 0, &varlen_data, &status);
-      ASSERT_TRUE(tuple != NULL);
-      ASSERT_TRUE(status.ok());
-      // Copy fixed portion in, but leave it pointing to row batch's varlen data.
-      memcpy(tuple, batch->GetRow(j)->GetTuple(0), fixed_size);
-    }
-    rows_added += batch->num_rows();
-  }
-
-  DCHECK_EQ(rows_added, stream.num_rows());
-
-  for (int delete_on_read = 0; delete_on_read <= 1; ++delete_on_read) {
-    // Keep stream in memory and test we can read ok.
-    vector<StringValue> results;
-    bool got_read_buffer;
-    ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_buffer));
-    ASSERT_TRUE(got_read_buffer);
-    ReadValues(&stream, string_desc_, &results);
-    VerifyResults<StringValue>(*string_desc_, results, rows_added, false);
-  }
-
-  stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Construct a big row by stiching together many tuples so the total row size
-// will be close to the IO block size. With null indicators, stream will fail to
-// be initialized; Without null indicators, things should work fine.
-TEST_F(SimpleTupleStreamTest, BigRow) {
-  InitBlockMgr(2 * IO_BLOCK_SIZE, IO_BLOCK_SIZE);
-  vector<TupleId> tuple_ids;
-  vector<bool> nullable_tuples;
-  vector<bool> non_nullable_tuples;
-
-  DescriptorTblBuilder big_row_builder(test_env_->exec_env()->frontend(), &pool_);
-  // Each tuple contains 8 slots of TYPE_INT and a single byte for null indicator.
-  const int num_tuples = IO_BLOCK_SIZE / (8 * sizeof(int) + 1);
-  for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
-    big_row_builder.DeclareTuple() << TYPE_INT << TYPE_INT << TYPE_INT << TYPE_INT
-        << TYPE_INT << TYPE_INT << TYPE_INT << TYPE_INT;
-    tuple_ids.push_back(static_cast<TTupleId>(tuple_idx));
-    nullable_tuples.push_back(true);
-    non_nullable_tuples.push_back(false);
-  }
-  DescriptorTbl *desc = big_row_builder.Build();
-
-  // Construct a big row with all non-nullable tuples.
-  RowDescriptor* row_desc = pool_.Add(new RowDescriptor(
-      *desc, tuple_ids, non_nullable_tuples));
-  ASSERT_FALSE(row_desc->IsAnyTupleNullable());
-  // Test writing this row into the stream and then reading it back.
-  TestValues<int>(1, row_desc, false, false, 1, false);
-  TestValues<int>(1, row_desc, false, true, 1, false);
-
-  // Construct a big row with nullable tuples. This requires space for null indicators
-  // in the stream which, as a result, will fail to initialize.
-  RowDescriptor* nullable_row_desc = pool_.Add(new RowDescriptor(
-      *desc, tuple_ids, nullable_tuples));
-  ASSERT_TRUE(nullable_row_desc->IsAnyTupleNullable());
-  BufferedTupleStream nullable_stream(runtime_state_, nullable_row_desc,
-      runtime_state_->block_mgr(), client_, false, false);
-  Status status = nullable_stream.Init(-1, NULL, true);
-  ASSERT_FALSE(status.ok());
-  nullable_stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Test for IMPALA-3923: overflow of 32-bit int in GetRows().
-TEST_F(SimpleTupleStreamTest, TestGetRowsOverflow) {
-  InitBlockMgr(-1, 8 * 1024 * 1024);
-  BufferedTupleStream stream(
-      runtime_state_, int_desc_, runtime_state_->block_mgr(), client_, false, false);
-  ASSERT_OK(stream.Init(-1, NULL, true));
-
-  Status status;
-  // Add more rows than can be fit in a RowBatch (limited by its 32-bit row count).
-  // Actually adding the rows would take a very long time, so just set num_rows_.
-  // This puts the stream in an inconsistent state, but exercises the right code path.
-  stream.num_rows_ = 1L << 33;
-  bool got_rows;
-  scoped_ptr<RowBatch> overflow_batch;
-  ASSERT_FALSE(stream.GetRows(&overflow_batch, &got_rows).ok());
-  stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleNullStreamTest, Basic) {
-  InitBlockMgr(-1, IO_BLOCK_SIZE);
-  TestValues<int>(1, int_desc_, false, true);
-  TestValues<int>(10, int_desc_, false, true);
-  TestValues<int>(100, int_desc_, false, true);
-  TestValues<int>(1, int_desc_, true, true);
-  TestValues<int>(10, int_desc_, true, true);
-  TestValues<int>(100, int_desc_, true, true);
-  TestValues<int>(1, int_desc_, false, false);
-  TestValues<int>(10, int_desc_, false, false);
-  TestValues<int>(100, int_desc_, false, false);
-  TestValues<int>(1, int_desc_, true, false);
-  TestValues<int>(10, int_desc_, true, false);
-  TestValues<int>(100, int_desc_, true, false);
-
-  TestValues<StringValue>(1, string_desc_, false, true);
-  TestValues<StringValue>(10, string_desc_, false, true);
-  TestValues<StringValue>(100, string_desc_, false, true);
-  TestValues<StringValue>(1, string_desc_, true, true);
-  TestValues<StringValue>(10, string_desc_, true, true);
-  TestValues<StringValue>(100, string_desc_, true, true);
-  TestValues<StringValue>(1, string_desc_, false, false);
-  TestValues<StringValue>(10, string_desc_, false, false);
-  TestValues<StringValue>(100, string_desc_, false, false);
-  TestValues<StringValue>(1, string_desc_, true, false);
-  TestValues<StringValue>(10, string_desc_, true, false);
-  TestValues<StringValue>(100, string_desc_, true, false);
-
-  TestIntValuesInterleaved(1, 1, true);
-  TestIntValuesInterleaved(10, 5, true);
-  TestIntValuesInterleaved(100, 15, true);
-  TestIntValuesInterleaved(1, 1, false);
-  TestIntValuesInterleaved(10, 5, false);
-  TestIntValuesInterleaved(100, 15, false);
-}
-
-// Test tuple stream with only 1 buffer and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleOneBufferSpill) {
-  // Each buffer can only hold 100 ints, so this spills quite often.
-  int buffer_size = 100 * sizeof(int);
-  InitBlockMgr(buffer_size, buffer_size);
-  TestValues<int>(1, int_desc_, false, true);
-  TestValues<int>(10, int_desc_, false, true);
-
-  TestValues<StringValue>(1, string_desc_, false, true);
-  TestValues<StringValue>(10, string_desc_, false, true);
-}
-
-// Test with a few buffers and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleManyBufferSpill) {
-  int buffer_size = 100 * sizeof(int);
-  InitBlockMgr(10 * buffer_size, buffer_size);
-
-  TestValues<int>(1, int_desc_, false, true);
-  TestValues<int>(10, int_desc_, false, true);
-  TestValues<int>(100, int_desc_, false, true);
-
-  TestValues<StringValue>(1, string_desc_, false, true);
-  TestValues<StringValue>(10, string_desc_, false, true);
-  TestValues<StringValue>(100, string_desc_, false, true);
-
-  TestIntValuesInterleaved(1, 1, true);
-  TestIntValuesInterleaved(10, 5, true);
-  TestIntValuesInterleaved(100, 15, true);
-}
-
-// Test that we can allocate a row in the stream and copy in multiple tuples then
-// read it back from the stream.
-TEST_F(MultiTupleStreamTest, MultiTupleAllocateRow) {
-  // Use small buffers so it will be flushed to disk.
-  int buffer_size = 4 * 1024;
-  InitBlockMgr(2 * buffer_size, buffer_size);
-  Status status = Status::OK();
-
-  int num_batches = 1;
-  int rows_added = 0;
-  BufferedTupleStream stream(
-      runtime_state_, string_desc_, runtime_state_->block_mgr(), client_, false, false);
-  ASSERT_OK(stream.Init(-1, NULL, false));
-  bool got_write_buffer;
-  ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
-  ASSERT_TRUE(got_write_buffer);
-
-  for (int i = 0; i < num_batches; ++i) {
-    RowBatch* batch = CreateStringBatch(rows_added, 1, false);
-    for (int j = 0; j < batch->num_rows(); ++j) {
-      TupleRow* row = batch->GetRow(j);
-      int64_t fixed_size = 0;
-      int64_t varlen_size = 0;
-      for (int k = 0; k < string_desc_->tuple_descriptors().size(); k++) {
-        TupleDescriptor* tuple_desc = string_desc_->tuple_descriptors()[k];
-        fixed_size += tuple_desc->byte_size();
-        varlen_size += row->GetTuple(k)->VarlenByteSize(*tuple_desc);
-      }
-      uint8_t* varlen_data;
-      uint8_t* fixed_data = stream.AllocateRow(fixed_size, varlen_size, &varlen_data,
-          &status);
-      ASSERT_TRUE(fixed_data != NULL);
-      ASSERT_TRUE(status.ok());
-      uint8_t* varlen_write_ptr = varlen_data;
-      for (int k = 0; k < string_desc_->tuple_descriptors().size(); k++) {
-        TupleDescriptor* tuple_desc = string_desc_->tuple_descriptors()[k];
-        Tuple* src = row->GetTuple(k);
-        Tuple* dst = reinterpret_cast<Tuple*>(fixed_data);
-        fixed_data += tuple_desc->byte_size();
-        memcpy(dst, src, tuple_desc->byte_size());
-        for (int l = 0; l < tuple_desc->slots().size(); l++) {
-          SlotDescriptor* slot = tuple_desc->slots()[l];
-          StringValue* src_string = src->GetStringSlot(slot->tuple_offset());
-          StringValue* dst_string = dst->GetStringSlot(slot->tuple_offset());
-          dst_string->ptr = reinterpret_cast<char*>(varlen_write_ptr);
-          memcpy(dst_string->ptr, src_string->ptr, src_string->len);
-          varlen_write_ptr += src_string->len;
-        }
-      }
-      ASSERT_EQ(varlen_data + varlen_size, varlen_write_ptr);
-    }
-    rows_added += batch->num_rows();
-  }
-
-  for (int i = 0; i < 3; ++i) {
-    bool delete_on_read = i == 2;
-    vector<StringValue> results;
-    bool got_read_buffer;
-    stream.PrepareForRead(delete_on_read, &got_read_buffer);
-    ASSERT_TRUE(got_read_buffer);
-    ReadValues(&stream, string_desc_, &results);
-    VerifyResults<StringValue>(*string_desc_, results, rows_added, false);
-  }
-
-  stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Test with rows with multiple nullable tuples.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleOneBufferSpill) {
-  // Each buffer can only hold 100 ints, so this spills quite often.
-  int buffer_size = 100 * sizeof(int);
-  InitBlockMgr(buffer_size, buffer_size);
-  TestValues<int>(1, int_desc_, false, true);
-  TestValues<int>(10, int_desc_, false, true);
-  TestValues<int>(1, int_desc_, true, true);
-  TestValues<int>(10, int_desc_, true, true);
-
-  TestValues<StringValue>(1, string_desc_, false, true);
-  TestValues<StringValue>(10, string_desc_, false, true);
-  TestValues<StringValue>(1, string_desc_, true, true);
-  TestValues<StringValue>(10, string_desc_, true, true);
-}
-
-// Test with a few buffers.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleManyBufferSpill) {
-  int buffer_size = 100 * sizeof(int);
-  InitBlockMgr(10 * buffer_size, buffer_size);
-
-  TestValues<int>(1, int_desc_, false, true);
-  TestValues<int>(10, int_desc_, false, true);
-  TestValues<int>(100, int_desc_, false, true);
-  TestValues<int>(1, int_desc_, true, true);
-  TestValues<int>(10, int_desc_, true, true);
-  TestValues<int>(100, int_desc_, true, true);
-
-  TestValues<StringValue>(1, string_desc_, false, true);
-  TestValues<StringValue>(10, string_desc_, false, true);
-  TestValues<StringValue>(100, string_desc_, false, true);
-  TestValues<StringValue>(1, string_desc_, true, true);
-  TestValues<StringValue>(10, string_desc_, true, true);
-  TestValues<StringValue>(100, string_desc_, true, true);
-
-  TestIntValuesInterleaved(1, 1, true);
-  TestIntValuesInterleaved(10, 5, true);
-  TestIntValuesInterleaved(100, 15, true);
-}
-
-/// Test that ComputeRowSize handles nulls
-TEST_F(MultiNullableTupleStreamTest, TestComputeRowSize) {
-  InitBlockMgr(-1, 8 * 1024 * 1024);
-  const vector<TupleDescriptor*>& tuple_descs = string_desc_->tuple_descriptors();
-  // String in second tuple is stored externally.
-  set<SlotId> external_slots;
-  const SlotDescriptor* external_string_slot = tuple_descs[1]->slots()[0];
-  external_slots.insert(external_string_slot->id());
-
-  BufferedTupleStream stream(runtime_state_, string_desc_, runtime_state_->block_mgr(),
-      client_, false, false, external_slots);
-  gscoped_ptr<TupleRow, FreeDeleter> row(reinterpret_cast<TupleRow*>(
-        malloc(tuple_descs.size() * sizeof(Tuple*))));
-  gscoped_ptr<Tuple, FreeDeleter> tuple0(reinterpret_cast<Tuple*>(
-        malloc(tuple_descs[0]->byte_size())));
-  gscoped_ptr<Tuple, FreeDeleter> tuple1(reinterpret_cast<Tuple*>(
-        malloc(tuple_descs[1]->byte_size())));
-  gscoped_ptr<Tuple, FreeDeleter> tuple2(reinterpret_cast<Tuple*>(
-        malloc(tuple_descs[2]->byte_size())));
-  memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
-  memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
-  memset(tuple2.get(), 0, tuple_descs[2]->byte_size());
-
-  // All nullable tuples are NULL.
-  row->SetTuple(0, tuple0.get());
-  row->SetTuple(1, NULL);
-  row->SetTuple(2, NULL);
-  EXPECT_EQ(tuple_descs[0]->byte_size(), stream.ComputeRowSize(row.get()));
-
-  // Tuples are initialized to empty and have no var-len data.
-  row->SetTuple(1, tuple1.get());
-  row->SetTuple(2, tuple2.get());
-  EXPECT_EQ(string_desc_->GetRowSize(), stream.ComputeRowSize(row.get()));
-
-  // Tuple 0 has some data.
-  const SlotDescriptor* string_slot = tuple_descs[0]->slots()[0];
-  StringValue* sv = tuple0->GetStringSlot(string_slot->tuple_offset());
-  *sv = STRINGS[0];
-  int64_t expected_len = string_desc_->GetRowSize() + sv->len;
-  EXPECT_EQ(expected_len, stream.ComputeRowSize(row.get()));
-
-  // Check that external slots aren't included in count.
-  sv = tuple1->GetStringSlot(external_string_slot->tuple_offset());
-  sv->ptr = reinterpret_cast<char*>(1234);
-  sv->len = 1234;
-  EXPECT_EQ(expected_len, stream.ComputeRowSize(row.get()));
-
-  stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-/// Test that deep copy works with arrays by copying into a BufferedTupleStream, freeing
-/// the original rows, then reading back the rows and verifying the contents.
-TEST_F(ArrayTupleStreamTest, TestArrayDeepCopy) {
-  Status status;
-  InitBlockMgr(-1, IO_BLOCK_SIZE);
-  const int NUM_ROWS = 4000;
-  BufferedTupleStream stream(
-      runtime_state_, array_desc_, runtime_state_->block_mgr(), client_, false, false);
-  const vector<TupleDescriptor*>& tuple_descs = array_desc_->tuple_descriptors();
-  // Write out a predictable pattern of data by iterating over arrays of constants.
-  int strings_index = 0; // we take the mod of this as index into STRINGS.
-  int array_lens[] = { 0, 1, 5, 10, 1000, 2, 49, 20 };
-  int num_array_lens = sizeof(array_lens) / sizeof(array_lens[0]);
-  int array_len_index = 0;
-  ASSERT_OK(stream.Init(-1, NULL, false));
-  bool got_write_buffer;
-  ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
-  ASSERT_TRUE(got_write_buffer);
-
-  for (int i = 0; i < NUM_ROWS; ++i) {
-    int expected_row_size = tuple_descs[0]->byte_size() + tuple_descs[1]->byte_size();
-    gscoped_ptr<TupleRow, FreeDeleter> row(reinterpret_cast<TupleRow*>(
-          malloc(tuple_descs.size() * sizeof(Tuple*))));
-    gscoped_ptr<Tuple, FreeDeleter> tuple0(reinterpret_cast<Tuple*>(
-          malloc(tuple_descs[0]->byte_size())));
-    gscoped_ptr<Tuple, FreeDeleter> tuple1(reinterpret_cast<Tuple*>(
-          malloc(tuple_descs[1]->byte_size())));
-    memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
-    memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
-    row->SetTuple(0, tuple0.get());
-    row->SetTuple(1, tuple1.get());
-
-    // Only array<string> is non-null.
-    tuple0->SetNull(tuple_descs[0]->slots()[1]->null_indicator_offset());
-    tuple1->SetNull(tuple_descs[1]->slots()[0]->null_indicator_offset());
-    const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
-    const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
-
-    int array_len = array_lens[array_len_index++ % num_array_lens];
-    CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
-    cv->ptr = NULL;
-    cv->num_tuples = 0;
-    CollectionValueBuilder builder(cv, *item_desc, mem_pool_.get(), runtime_state_,
-        array_len);
-    Tuple* array_data;
-    int num_rows;
-    builder.GetFreeMemory(&array_data, &num_rows);
-    expected_row_size += item_desc->byte_size() * array_len;
-
-    // Fill the array with pointers to our constant strings.
-    for (int j = 0; j < array_len; ++j) {
-      const StringValue* string = &STRINGS[strings_index++ % NUM_STRINGS];
-      array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
-      RawValue::Write(string, array_data, item_desc->slots()[0], mem_pool_.get());
-      array_data += item_desc->byte_size();
-      expected_row_size += string->len;
-    }
-    builder.CommitTuples(array_len);
-
-    // Check that internal row size computation gives correct result.
-    EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-    bool b = stream.AddRow(row.get(), &status);
-    ASSERT_TRUE(b);
-    ASSERT_OK(status);
-    mem_pool_->FreeAll(); // Free data as soon as possible to smoke out issues.
-  }
-
-  // Read back and verify data.
-  bool got_read_buffer;
-  stream.PrepareForRead(false, &got_read_buffer);
-  ASSERT_TRUE(got_read_buffer);
-  strings_index = 0;
-  array_len_index = 0;
-  bool eos = false;
-  int rows_read = 0;
-  RowBatch batch(array_desc_, BATCH_SIZE, &tracker_);
-  do {
-    batch.Reset();
-    ASSERT_OK(stream.GetNext(&batch, &eos));
-    for (int i = 0; i < batch.num_rows(); ++i) {
-      TupleRow* row = batch.GetRow(i);
-      Tuple* tuple0 = row->GetTuple(0);
-      Tuple* tuple1 = row->GetTuple(1);
-      ASSERT_TRUE(tuple0 != NULL);
-      ASSERT_TRUE(tuple1 != NULL);
-      const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
-      ASSERT_FALSE(tuple0->IsNull(array_slot_desc->null_indicator_offset()));
-      ASSERT_TRUE(tuple0->IsNull(tuple_descs[0]->slots()[1]->null_indicator_offset()));
-      ASSERT_TRUE(tuple1->IsNull(tuple_descs[1]->slots()[0]->null_indicator_offset()));
-
-      const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
-      int expected_array_len = array_lens[array_len_index++ % num_array_lens];
-      CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
-      ASSERT_EQ(expected_array_len, cv->num_tuples);
-      for (int j = 0; j < cv->num_tuples; ++j) {
-        Tuple* item = reinterpret_cast<Tuple*>(cv->ptr + j * item_desc->byte_size());
-        const SlotDescriptor* string_desc = item_desc->slots()[0];
-        ASSERT_FALSE(item->IsNull(string_desc->null_indicator_offset()));
-        const StringValue* expected = &STRINGS[strings_index++ % NUM_STRINGS];
-        const StringValue* actual = item->GetStringSlot(string_desc->tuple_offset());
-        ASSERT_EQ(*expected, *actual);
-      }
-    }
-    rows_read += batch.num_rows();
-  } while (!eos);
-  ASSERT_EQ(NUM_ROWS, rows_read);
-  stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-/// Test that ComputeRowSize handles nulls
-TEST_F(ArrayTupleStreamTest, TestComputeRowSize) {
-  InitBlockMgr(-1, 8 * 1024 * 1024);
-  const vector<TupleDescriptor*>& tuple_descs = array_desc_->tuple_descriptors();
-  set<SlotId> external_slots;
-  // Second array slot in first tuple is stored externally.
-  const SlotDescriptor* external_array_slot = tuple_descs[0]->slots()[1];
-  external_slots.insert(external_array_slot->id());
-
-  BufferedTupleStream stream(runtime_state_, array_desc_, runtime_state_->block_mgr(),
-      client_, false, false, external_slots);
-  gscoped_ptr<TupleRow, FreeDeleter> row(reinterpret_cast<TupleRow*>(
-        malloc(tuple_descs.size() * sizeof(Tuple*))));
-  gscoped_ptr<Tuple, FreeDeleter> tuple0(reinterpret_cast<Tuple*>(
-        malloc(tuple_descs[0]->byte_size())));
-  gscoped_ptr<Tuple, FreeDeleter> tuple1(reinterpret_cast<Tuple*>(
-        malloc(tuple_descs[1]->byte_size())));
-  memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
-  memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
-
-  // All tuples are NULL.
-  row->SetTuple(0, NULL);
-  row->SetTuple(1, NULL);
-  EXPECT_EQ(0, stream.ComputeRowSize(row.get()));
-
-  // Tuples are initialized to empty and have no var-len data.
-  row->SetTuple(0, tuple0.get());
-  row->SetTuple(1, tuple1.get());
-  EXPECT_EQ(array_desc_->GetRowSize(), stream.ComputeRowSize(row.get()));
-
-  // Tuple 0 has an array.
-  int expected_row_size = array_desc_->GetRowSize();
-  const SlotDescriptor* array_slot = tuple_descs[0]->slots()[0];
-  const TupleDescriptor* item_desc = array_slot->collection_item_descriptor();
-  int array_len = 128;
-  CollectionValue* cv = tuple0->GetCollectionSlot(array_slot->tuple_offset());
-  CollectionValueBuilder builder(cv, *item_desc, mem_pool_.get(), runtime_state_,
-      array_len);
-  Tuple* array_data;
-  int num_rows;
-  builder.GetFreeMemory(&array_data, &num_rows);
-  expected_row_size += item_desc->byte_size() * array_len;
-
-  // Fill the array with pointers to our constant strings.
-  for (int i = 0; i < array_len; ++i) {
-    const StringValue* str = &STRINGS[i % NUM_STRINGS];
-    array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
-    RawValue::Write(str, array_data, item_desc->slots()[0], mem_pool_.get());
-    array_data += item_desc->byte_size();
-    expected_row_size += str->len;
-  }
-  builder.CommitTuples(array_len);
-  EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-
-  // Check that the external slot isn't included in size.
-  cv = tuple0->GetCollectionSlot(external_array_slot->tuple_offset());
-  // ptr of external slot shouldn't be dereferenced when computing size.
-  cv->ptr = reinterpret_cast<uint8_t*>(1234);
-  cv->num_tuples = 1234;
-  EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-
-  // Check that the array is excluded if tuple 0's array has its null indicator set.
-  tuple0->SetNull(array_slot->null_indicator_offset());
-  EXPECT_EQ(array_desc_->GetRowSize(), stream.ComputeRowSize(row.get()));
-
-  stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// TODO: more tests.
-//  - The stream can operate in many modes
-
-}
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
-  impala::InitFeSupport();
-  impala::LlvmCodeGen::InitializeLlvm();
-  return RUN_ALL_TESTS();
-}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-tuple-stream.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream.cc b/be/src/runtime/buffered-tuple-stream.cc
deleted file mode 100644
index cce6390..0000000
--- a/be/src/runtime/buffered-tuple-stream.cc
+++ /dev/null
@@ -1,903 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/buffered-tuple-stream.inline.h"
-
-#include <boost/bind.hpp>
-#include <gutil/strings/substitute.h>
-
-#include "runtime/collection-value.h"
-#include "runtime/descriptors.h"
-#include "runtime/string-value.h"
-#include "runtime/tuple-row.h"
-#include "util/bit-util.h"
-#include "util/debug-util.h"
-#include "util/runtime-profile-counters.h"
-
-#include "common/names.h"
-
-using namespace impala;
-using namespace strings;
-
-// The first NUM_SMALL_BLOCKS of the tuple stream are made of blocks less than the
-// IO size. These blocks never spill.
-// TODO: Consider adding a 4MB in-memory buffer that would split the gap between the
-// 512KB in-memory buffer and the 8MB (IO-sized) spillable buffer.
-static const int64_t INITIAL_BLOCK_SIZES[] = { 64 * 1024, 512 * 1024 };
-static const int NUM_SMALL_BLOCKS = sizeof(INITIAL_BLOCK_SIZES) / sizeof(int64_t);
-
-string BufferedTupleStream::RowIdx::DebugString() const {
-  stringstream ss;
-  ss << "RowIdx block=" << block() << " offset=" << offset() << " idx=" << idx();
-  return ss.str();
-}
-
-BufferedTupleStream::BufferedTupleStream(RuntimeState* state,
-    const RowDescriptor* row_desc, BufferedBlockMgr* block_mgr,
-    BufferedBlockMgr::Client* client, bool use_initial_small_buffers, bool read_write,
-    const set<SlotId>& ext_varlen_slots)
-  : state_(state),
-    desc_(row_desc),
-    block_mgr_(block_mgr),
-    block_mgr_client_(client),
-    total_byte_size_(0),
-    read_tuple_idx_(-1),
-    read_ptr_(NULL),
-    read_end_ptr_(NULL),
-    write_tuple_idx_(-1),
-    write_ptr_(NULL),
-    write_end_ptr_(NULL),
-    rows_returned_(0),
-    read_block_idx_(-1),
-    write_block_(NULL),
-    num_pinned_(0),
-    num_small_blocks_(0),
-    num_rows_(0),
-    pin_timer_(NULL),
-    unpin_timer_(NULL),
-    get_new_block_timer_(NULL),
-    read_write_(read_write),
-    has_nullable_tuple_(row_desc->IsAnyTupleNullable()),
-    use_small_buffers_(use_initial_small_buffers),
-    delete_on_read_(false),
-    closed_(false),
-    pinned_(true) {
-  read_block_null_indicators_size_ = -1;
-  write_block_null_indicators_size_ = -1;
-  max_null_indicators_size_ = -1;
-  read_block_ = blocks_.end();
-  fixed_tuple_row_size_ = 0;
-  for (int i = 0; i < desc_->tuple_descriptors().size(); ++i) {
-    const TupleDescriptor* tuple_desc = desc_->tuple_descriptors()[i];
-    const int tuple_byte_size = tuple_desc->byte_size();
-    fixed_tuple_sizes_.push_back(tuple_byte_size);
-    fixed_tuple_row_size_ += tuple_byte_size;
-
-    vector<SlotDescriptor*> tuple_string_slots;
-    vector<SlotDescriptor*> tuple_coll_slots;
-    for (int j = 0; j < tuple_desc->slots().size(); ++j) {
-      SlotDescriptor* slot = tuple_desc->slots()[j];
-      if (!slot->type().IsVarLenType()) continue;
-      if (ext_varlen_slots.find(slot->id()) == ext_varlen_slots.end()) {
-        if (slot->type().IsVarLenStringType()) {
-          tuple_string_slots.push_back(slot);
-        } else {
-          DCHECK(slot->type().IsCollectionType());
-          tuple_coll_slots.push_back(slot);
-        }
-      }
-    }
-    if (!tuple_string_slots.empty()) {
-      inlined_string_slots_.push_back(make_pair(i, tuple_string_slots));
-    }
-
-    if (!tuple_coll_slots.empty()) {
-      inlined_coll_slots_.push_back(make_pair(i, tuple_coll_slots));
-    }
-  }
-}
-
-BufferedTupleStream::~BufferedTupleStream() {
-  DCHECK(closed_);
-}
-
-// Returns the number of pinned blocks in the list. Only called in DCHECKs to validate
-// num_pinned_.
-int NumPinned(const list<BufferedBlockMgr::Block*>& blocks) {
-  int num_pinned = 0;
-  for (BufferedBlockMgr::Block* block : blocks) {
-    if (block->is_pinned() && block->is_max_size()) ++num_pinned;
-  }
-  return num_pinned;
-}
-
-string BufferedTupleStream::DebugString() const {
-  stringstream ss;
-  ss << "BufferedTupleStream num_rows=" << num_rows_ << " rows_returned="
-     << rows_returned_ << " pinned=" << (pinned_ ? "true" : "false")
-     << " delete_on_read=" << (delete_on_read_ ? "true" : "false")
-     << " closed=" << (closed_ ? "true" : "false")
-     << " num_pinned=" << num_pinned_
-     << " write_block=" << write_block_ << " read_block_=";
-  if (read_block_ == blocks_.end()) {
-    ss << "<end>";
-  } else {
-    ss << *read_block_;
-  }
-  ss << " blocks=[\n";
-  for (BufferedBlockMgr::Block* block : blocks_) {
-    ss << "{" << block->DebugString() << "}";
-    if (block != blocks_.back()) ss << ",\n";
-  }
-  ss << "]";
-  return ss.str();
-}
-
-Status BufferedTupleStream::Init(int node_id, RuntimeProfile* profile, bool pinned) {
-  if (profile != NULL) {
-    pin_timer_ = ADD_TIMER(profile, "PinTime");
-    unpin_timer_ = ADD_TIMER(profile, "UnpinTime");
-    get_new_block_timer_ = ADD_TIMER(profile, "GetNewBlockTime");
-  }
-
-  max_null_indicators_size_ = ComputeNumNullIndicatorBytes(block_mgr_->max_block_size());
-  if (UNLIKELY(max_null_indicators_size_ < 0)) {
-    // The block cannot even fit in a row of tuples so just assume there is one row.
-    int null_indicators_size =
-        BitUtil::RoundUpNumi64(desc_->tuple_descriptors().size()) * 8;
-    return Status(TErrorCode::BTS_BLOCK_OVERFLOW,
-        PrettyPrinter::Print(fixed_tuple_row_size_, TUnit::BYTES),
-        PrettyPrinter::Print(null_indicators_size,  TUnit::BYTES));
-  }
-
-  if (block_mgr_->max_block_size() < INITIAL_BLOCK_SIZES[0]) {
-    use_small_buffers_ = false;
-  }
-  if (!pinned) RETURN_IF_ERROR(UnpinStream(UNPIN_ALL_EXCEPT_CURRENT));
-  return Status::OK();
-}
-
-Status BufferedTupleStream::PrepareForWrite(bool* got_buffer) {
-  DCHECK(write_block_ == NULL);
-  return NewWriteBlockForRow(fixed_tuple_row_size_, got_buffer);
-}
-
-Status BufferedTupleStream::SwitchToIoBuffers(bool* got_buffer) {
-  if (!use_small_buffers_) {
-    *got_buffer = (write_block_ != NULL);
-    return Status::OK();
-  }
-  use_small_buffers_ = false;
-  Status status =
-      NewWriteBlock(block_mgr_->max_block_size(), max_null_indicators_size_, got_buffer);
-  // IMPALA-2330: Set the flag using small buffers back to false in case it failed to
-  // got a buffer.
-  DCHECK(status.ok() || !*got_buffer) << status.ok() << " " << *got_buffer;
-  use_small_buffers_ = !*got_buffer;
-  return status;
-}
-
-void BufferedTupleStream::Close(RowBatch* batch, RowBatch::FlushMode flush) {
-  for (BufferedBlockMgr::Block* block : blocks_) {
-    if (batch != NULL && block->is_pinned()) {
-      batch->AddBlock(block, flush);
-    } else {
-      block->Delete();
-    }
-  }
-  blocks_.clear();
-  num_pinned_ = 0;
-  DCHECK_EQ(num_pinned_, NumPinned(blocks_));
-  closed_ = true;
-}
-
-int64_t BufferedTupleStream::bytes_in_mem(bool ignore_current) const {
-  int64_t result = 0;
-  for (BufferedBlockMgr::Block* block : blocks_) {
-    if (!block->is_pinned()) continue;
-    if (!block->is_max_size()) continue;
-    if (block == write_block_ && ignore_current) continue;
-    result += block->buffer_len();
-  }
-  return result;
-}
-
-Status BufferedTupleStream::UnpinBlock(BufferedBlockMgr::Block* block) {
-  SCOPED_TIMER(unpin_timer_);
-  DCHECK(block->is_pinned());
-  if (!block->is_max_size()) return Status::OK();
-  RETURN_IF_ERROR(block->Unpin());
-  --num_pinned_;
-  DCHECK_EQ(num_pinned_, NumPinned(blocks_));
-  return Status::OK();
-}
-
-Status BufferedTupleStream::NewWriteBlock(
-    int64_t block_len, int64_t null_indicators_size, bool* got_block) noexcept {
-  DCHECK(!closed_);
-  DCHECK_GE(null_indicators_size, 0);
-  *got_block = false;
-
-  BufferedBlockMgr::Block* unpin_block = write_block_;
-  if (write_block_ != NULL) {
-    DCHECK(write_block_->is_pinned());
-    if (pinned_ || write_block_ == *read_block_ || !write_block_->is_max_size()) {
-      // In these cases, don't unpin the current write block.
-      unpin_block = NULL;
-    }
-  }
-
-  BufferedBlockMgr::Block* new_block = NULL;
-  {
-    SCOPED_TIMER(get_new_block_timer_);
-    RETURN_IF_ERROR(block_mgr_->GetNewBlock(
-        block_mgr_client_, unpin_block, &new_block, block_len));
-  }
-  *got_block = new_block != NULL;
-
-  if (!*got_block) {
-    DCHECK(unpin_block == NULL);
-    return Status::OK();
-  }
-
-  if (unpin_block != NULL) {
-    DCHECK(unpin_block == write_block_);
-    DCHECK(!write_block_->is_pinned());
-    --num_pinned_;
-    DCHECK_EQ(num_pinned_, NumPinned(blocks_));
-  }
-
-  // Mark the entire block as containing valid data to avoid updating it as we go.
-  new_block->Allocate<uint8_t>(block_len);
-
-  // Compute and allocate the block header with the null indicators.
-  DCHECK_EQ(null_indicators_size, ComputeNumNullIndicatorBytes(block_len));
-  write_block_null_indicators_size_ = null_indicators_size;
-  write_tuple_idx_ = 0;
-  write_ptr_ = new_block->buffer() + write_block_null_indicators_size_;
-  write_end_ptr_ = new_block->buffer() + block_len;
-
-  blocks_.push_back(new_block);
-  block_start_idx_.push_back(new_block->buffer());
-  write_block_ = new_block;
-  DCHECK(write_block_->is_pinned());
-  DCHECK_EQ(write_block_->num_rows(), 0);
-  if (write_block_->is_max_size()) {
-    ++num_pinned_;
-    DCHECK_EQ(num_pinned_, NumPinned(blocks_));
-  } else {
-    ++num_small_blocks_;
-  }
-  total_byte_size_ += block_len;
-  return Status::OK();
-}
-
-Status BufferedTupleStream::NewWriteBlockForRow(
-    int64_t row_size, bool* got_block) noexcept {
-  int64_t block_len = 0;
-  int64_t null_indicators_size = 0;
-  if (use_small_buffers_) {
-    *got_block = false;
-    if (blocks_.size() < NUM_SMALL_BLOCKS) {
-      block_len = INITIAL_BLOCK_SIZES[blocks_.size()];
-      null_indicators_size = ComputeNumNullIndicatorBytes(block_len);
-      // Use small buffer only if:
-      // 1. the small buffer's size is smaller than the configured max block size.
-      // 2. a single row of tuples and null indicators (if any) fit in the small buffer.
-      //
-      // If condition 2 above is not met, we will bail. An alternative would be
-      // to try the next larger small buffer.
-      *got_block = block_len < block_mgr_->max_block_size() &&
-          null_indicators_size >= 0 && row_size + null_indicators_size <= block_len;
-    }
-    // Do not switch to IO-buffers automatically. Do not get a buffer.
-    if (!*got_block) return Status::OK();
-  } else {
-    DCHECK_GE(max_null_indicators_size_, 0);
-    block_len = block_mgr_->max_block_size();
-    null_indicators_size = max_null_indicators_size_;
-    // Check if the size of row and null indicators exceeds the IO block size.
-    if (UNLIKELY(row_size + null_indicators_size > block_len)) {
-      return Status(TErrorCode::BTS_BLOCK_OVERFLOW,
-          PrettyPrinter::Print(row_size, TUnit::BYTES),
-          PrettyPrinter::Print(null_indicators_size, TUnit::BYTES));
-    }
-  }
-  return NewWriteBlock(block_len, null_indicators_size, got_block);
-}
-
-Status BufferedTupleStream::NextReadBlock() {
-  DCHECK(!closed_);
-  DCHECK(read_block_ != blocks_.end());
-  DCHECK_EQ(num_pinned_, NumPinned(blocks_)) << pinned_;
-
-  // If non-NULL, this will be the current block if we are going to free it while
-  // grabbing the next block. This will stay NULL if we don't want to free the
-  // current block.
-  BufferedBlockMgr::Block* block_to_free =
-      (!pinned_ || delete_on_read_) ? *read_block_ : NULL;
-  if (delete_on_read_) {
-    DCHECK(read_block_ == blocks_.begin());
-    DCHECK(*read_block_ != write_block_);
-    blocks_.pop_front();
-    read_block_ = blocks_.begin();
-    read_block_idx_ = 0;
-    if (block_to_free != NULL && !block_to_free->is_max_size()) {
-      block_to_free->Delete();
-      block_to_free = NULL;
-      DCHECK_EQ(num_pinned_, NumPinned(blocks_)) << DebugString();
-    }
-  } else {
-    ++read_block_;
-    ++read_block_idx_;
-    if (block_to_free != NULL && !block_to_free->is_max_size()) block_to_free = NULL;
-  }
-
-  bool pinned = false;
-  if (read_block_ == blocks_.end() || (*read_block_)->is_pinned()) {
-    // End of the blocks or already pinned, just handle block_to_free
-    if (block_to_free != NULL) {
-      SCOPED_TIMER(unpin_timer_);
-      if (delete_on_read_) {
-        block_to_free->Delete();
-        --num_pinned_;
-      } else {
-        RETURN_IF_ERROR(UnpinBlock(block_to_free));
-      }
-    }
-  } else {
-    // Call into the block mgr to atomically unpin/delete the old block and pin the
-    // new block.
-    SCOPED_TIMER(pin_timer_);
-    RETURN_IF_ERROR((*read_block_)->Pin(&pinned, block_to_free, !delete_on_read_));
-    if (!pinned) {
-      DCHECK(block_to_free == NULL) << "Should have been able to pin."
-          << endl << block_mgr_->DebugString(block_mgr_client_);;
-    }
-    if (block_to_free == NULL && pinned) ++num_pinned_;
-  }
-
-  if (read_block_ != blocks_.end() && (*read_block_)->is_pinned()) {
-    read_block_null_indicators_size_ =
-        ComputeNumNullIndicatorBytes((*read_block_)->buffer_len());
-    DCHECK_GE(read_block_null_indicators_size_, 0);
-    read_tuple_idx_ = 0;
-    read_ptr_ = (*read_block_)->buffer() + read_block_null_indicators_size_;
-    read_end_ptr_ = (*read_block_)->buffer() + (*read_block_)->buffer_len();
-  }
-  DCHECK_EQ(num_pinned_, NumPinned(blocks_)) << DebugString();
-  return Status::OK();
-}
-
-Status BufferedTupleStream::PrepareForRead(bool delete_on_read, bool* got_buffer) {
-  DCHECK(!closed_);
-  if (blocks_.empty()) return Status::OK();
-
-  if (!read_write_ && write_block_ != NULL) {
-    DCHECK(write_block_->is_pinned());
-    if (!pinned_ && write_block_ != blocks_.front()) {
-      RETURN_IF_ERROR(UnpinBlock(write_block_));
-    }
-    write_block_ = NULL;
-  }
-
-  // Walk the blocks and pin the first IO-sized block.
-  for (BufferedBlockMgr::Block* block : blocks_) {
-    if (!block->is_pinned()) {
-      SCOPED_TIMER(pin_timer_);
-      bool current_pinned;
-      RETURN_IF_ERROR(block->Pin(&current_pinned));
-      if (!current_pinned) {
-        *got_buffer = false;
-        return Status::OK();
-      }
-      ++num_pinned_;
-      DCHECK_EQ(num_pinned_, NumPinned(blocks_));
-    }
-    if (block->is_max_size()) break;
-  }
-
-  read_block_ = blocks_.begin();
-  DCHECK(read_block_ != blocks_.end());
-  read_block_null_indicators_size_ =
-      ComputeNumNullIndicatorBytes((*read_block_)->buffer_len());
-  DCHECK_GE(read_block_null_indicators_size_, 0);
-  read_tuple_idx_ = 0;
-  read_ptr_ = (*read_block_)->buffer() + read_block_null_indicators_size_;
-  read_end_ptr_ = (*read_block_)->buffer() + (*read_block_)->buffer_len();
-  rows_returned_ = 0;
-  read_block_idx_ = 0;
-  delete_on_read_ = delete_on_read;
-  *got_buffer = true;
-  return Status::OK();
-}
-
-Status BufferedTupleStream::PinStream(bool already_reserved, bool* pinned) {
-  DCHECK(!closed_);
-  DCHECK(pinned != NULL);
-  if (!already_reserved) {
-    // If we can't get all the blocks, don't try at all.
-    if (!block_mgr_->TryAcquireTmpReservation(block_mgr_client_, blocks_unpinned())) {
-      *pinned = false;
-      return Status::OK();
-    }
-  }
-
-  for (BufferedBlockMgr::Block* block : blocks_) {
-    if (block->is_pinned()) continue;
-    {
-      SCOPED_TIMER(pin_timer_);
-      RETURN_IF_ERROR(block->Pin(pinned));
-    }
-    if (!*pinned) {
-      VLOG_QUERY << "Should have been reserved." << endl
-                 << block_mgr_->DebugString(block_mgr_client_);
-      return Status::OK();
-    }
-    ++num_pinned_;
-    DCHECK_EQ(num_pinned_, NumPinned(blocks_));
-  }
-
-  if (!delete_on_read_) {
-    // Populate block_start_idx_ on pin.
-    DCHECK_EQ(block_start_idx_.size(), blocks_.size());
-    block_start_idx_.clear();
-    for (BufferedBlockMgr::Block* block : blocks_) {
-      block_start_idx_.push_back(block->buffer());
-    }
-  }
-  *pinned = true;
-  pinned_ = true;
-  return Status::OK();
-}
-
-Status BufferedTupleStream::UnpinStream(UnpinMode mode) {
-  DCHECK(!closed_);
-  DCHECK(mode == UNPIN_ALL || mode == UNPIN_ALL_EXCEPT_CURRENT);
-  SCOPED_TIMER(unpin_timer_);
-
-  for (BufferedBlockMgr::Block* block: blocks_) {
-    if (!block->is_pinned()) continue;
-    if (mode == UNPIN_ALL_EXCEPT_CURRENT
-        && (block == write_block_ || (read_write_ && block == *read_block_))) {
-      continue;
-    }
-    RETURN_IF_ERROR(UnpinBlock(block));
-  }
-  if (mode == UNPIN_ALL) {
-    read_block_ = blocks_.end();
-    write_block_ = NULL;
-  }
-  pinned_ = false;
-  return Status::OK();
-}
-
-int BufferedTupleStream::ComputeNumNullIndicatorBytes(int block_size) const {
-  if (has_nullable_tuple_) {
-    // We assume that all rows will use their max size, so we may be underutilizing the
-    // space, i.e. we may have some unused space in case of rows with NULL tuples.
-    const uint32_t tuples_per_row = desc_->tuple_descriptors().size();
-    const uint32_t min_row_size_in_bits = 8 * fixed_tuple_row_size_ + tuples_per_row;
-    const uint32_t block_size_in_bits = 8 * block_size;
-    const uint32_t max_num_rows = block_size_in_bits / min_row_size_in_bits;
-    if (UNLIKELY(max_num_rows == 0)) return -1;
-    return BitUtil::RoundUpNumi64(max_num_rows * tuples_per_row) * 8;
-  } else {
-    // If there are no nullable tuples then no need to waste space for null indicators.
-    return 0;
-  }
-}
-
-Status BufferedTupleStream::GetRows(scoped_ptr<RowBatch>* batch, bool* got_rows) {
-  if (num_rows() > numeric_limits<int>::max()) {
-    // RowBatch::num_rows_ is a 32-bit int, avoid an overflow.
-    return Status(Substitute("Trying to read $0 rows into in-memory batch failed. Limit "
-        "is $1", num_rows(), numeric_limits<int>::max()));
-  }
-  RETURN_IF_ERROR(PinStream(false, got_rows));
-  if (!*got_rows) return Status::OK();
-  bool got_read_buffer;
-  RETURN_IF_ERROR(PrepareForRead(false, &got_read_buffer));
-  DCHECK(got_read_buffer) << "Stream was pinned";
-  batch->reset(
-      new RowBatch(desc_, num_rows(), block_mgr_->get_tracker(block_mgr_client_)));
-  bool eos = false;
-  // Loop until GetNext fills the entire batch. Each call can stop at block
-  // boundaries. We generally want it to stop, so that blocks can be freed
-  // as we read. It is safe in this case because we pin the entire stream.
-  while (!eos) {
-    RETURN_IF_ERROR(GetNext(batch->get(), &eos));
-  }
-  return Status::OK();
-}
-
-Status BufferedTupleStream::GetNext(RowBatch* batch, bool* eos) {
-  return GetNextInternal<false>(batch, eos, NULL);
-}
-
-Status BufferedTupleStream::GetNext(RowBatch* batch, bool* eos,
-    vector<RowIdx>* indices) {
-  return GetNextInternal<true>(batch, eos, indices);
-}
-
-template <bool FILL_INDICES>
-Status BufferedTupleStream::GetNextInternal(RowBatch* batch, bool* eos,
-    vector<RowIdx>* indices) {
-  if (has_nullable_tuple_) {
-    return GetNextInternal<FILL_INDICES, true>(batch, eos, indices);
-  } else {
-    return GetNextInternal<FILL_INDICES, false>(batch, eos, indices);
-  }
-}
-
-template <bool FILL_INDICES, bool HAS_NULLABLE_TUPLE>
-Status BufferedTupleStream::GetNextInternal(RowBatch* batch, bool* eos,
-    vector<RowIdx>* indices) {
-  DCHECK(!closed_);
-  DCHECK(batch->row_desc()->LayoutEquals(*desc_));
-  *eos = (rows_returned_ == num_rows_);
-  if (*eos) return Status::OK();
-  DCHECK_GE(read_block_null_indicators_size_, 0);
-
-  const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
-  DCHECK_LE(read_tuple_idx_ / tuples_per_row, (*read_block_)->num_rows());
-  DCHECK_EQ(read_tuple_idx_ % tuples_per_row, 0);
-  int rows_returned_curr_block = read_tuple_idx_ / tuples_per_row;
-
-  if (UNLIKELY(rows_returned_curr_block == (*read_block_)->num_rows())) {
-    // Get the next block in the stream. We need to do this at the beginning of
-    // the GetNext() call to ensure the buffer management semantics. NextReadBlock()
-    // will recycle the memory for the rows returned from the *previous* call to
-    // GetNext().
-    RETURN_IF_ERROR(NextReadBlock());
-    DCHECK(read_block_ != blocks_.end()) << DebugString();
-    DCHECK_GE(read_block_null_indicators_size_, 0);
-    rows_returned_curr_block = 0;
-  }
-
-  DCHECK(read_block_ != blocks_.end());
-  DCHECK((*read_block_)->is_pinned()) << DebugString();
-  DCHECK_GE(read_tuple_idx_, 0);
-
-  int rows_left_in_block = (*read_block_)->num_rows() - rows_returned_curr_block;
-  int rows_to_fill = std::min(batch->capacity() - batch->num_rows(), rows_left_in_block);
-  DCHECK_GE(rows_to_fill, 1);
-  batch->AddRows(rows_to_fill);
-  uint8_t* tuple_row_mem = reinterpret_cast<uint8_t*>(batch->GetRow(batch->num_rows()));
-
-  // Produce tuple rows from the current block and the corresponding position on the
-  // null tuple indicator.
-  if (FILL_INDICES) {
-    DCHECK(indices != NULL);
-    DCHECK(!delete_on_read_);
-    DCHECK_EQ(batch->num_rows(), 0);
-    indices->clear();
-    indices->reserve(rows_to_fill);
-  }
-
-  uint8_t* null_word = NULL;
-  uint32_t null_pos = 0;
-  // Start reading from position read_tuple_idx_ in the block.
-  // IMPALA-2256: Special case if there are no materialized slots.
-  bool increment_row = RowConsumesMemory();
-  uint64_t last_read_row = increment_row * (read_tuple_idx_ / tuples_per_row);
-  for (int i = 0; i < rows_to_fill; ++i) {
-    if (FILL_INDICES) {
-      indices->push_back(RowIdx());
-      DCHECK_EQ(indices->size(), i + 1);
-      (*indices)[i].set(read_block_idx_, read_ptr_ - (*read_block_)->buffer(),
-          last_read_row);
-    }
-    // Copy the row into the output batch.
-    TupleRow* output_row = reinterpret_cast<TupleRow*>(tuple_row_mem);
-    if (HAS_NULLABLE_TUPLE) {
-      for (int j = 0; j < tuples_per_row; ++j) {
-        // Stitch together the tuples from the block and the NULL ones.
-        null_word = (*read_block_)->buffer() + (read_tuple_idx_ >> 3);
-        null_pos = read_tuple_idx_ & 7;
-        ++read_tuple_idx_;
-        const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0);
-        // Copy tuple and advance read_ptr_. If it is a NULL tuple, it calls SetTuple
-        // with Tuple* being 0x0. To do that we multiply the current read_ptr_ with
-        // false (0x0).
-        output_row->SetTuple(j, reinterpret_cast<Tuple*>(
-            reinterpret_cast<uint64_t>(read_ptr_) * is_not_null));
-        read_ptr_ += fixed_tuple_sizes_[j] * is_not_null;
-      }
-    } else {
-      // When we know that there are no nullable tuples we can skip null checks.
-      for (int j = 0; j < tuples_per_row; ++j) {
-        output_row->SetTuple(j, reinterpret_cast<Tuple*>(read_ptr_));
-        read_ptr_ += fixed_tuple_sizes_[j];
-      }
-      read_tuple_idx_ += tuples_per_row;
-    }
-    tuple_row_mem += sizeof(Tuple*) * tuples_per_row;
-
-    // Update string slot ptrs, skipping external strings.
-    for (int j = 0; j < inlined_string_slots_.size(); ++j) {
-      Tuple* tuple = output_row->GetTuple(inlined_string_slots_[j].first);
-      if (HAS_NULLABLE_TUPLE && tuple == NULL) continue;
-      FixUpStringsForRead(inlined_string_slots_[j].second, tuple);
-    }
-
-    // Update collection slot ptrs, skipping external collections. We traverse the
-    // collection structure in the same order as it was written to the stream, allowing
-    // us to infer the data layout based on the length of collections and strings.
-    for (int j = 0; j < inlined_coll_slots_.size(); ++j) {
-      Tuple* tuple = output_row->GetTuple(inlined_coll_slots_[j].first);
-      if (HAS_NULLABLE_TUPLE && tuple == NULL) continue;
-      FixUpCollectionsForRead(inlined_coll_slots_[j].second, tuple);
-    }
-    last_read_row += increment_row;
-  }
-
-  batch->CommitRows(rows_to_fill);
-  rows_returned_ += rows_to_fill;
-  *eos = (rows_returned_ == num_rows_);
-  if ((!pinned_ || delete_on_read_)
-      && rows_returned_curr_block + rows_to_fill == (*read_block_)->num_rows()) {
-    // No more data in this block. The batch must be immediately returned up the operator
-    // tree and deep copied so that NextReadBlock() can reuse the read block's buffer.
-    batch->MarkNeedsDeepCopy();
-  }
-  if (FILL_INDICES) DCHECK_EQ(indices->size(), rows_to_fill);
-  DCHECK_LE(read_ptr_, read_end_ptr_);
-  return Status::OK();
-}
-
-void BufferedTupleStream::FixUpStringsForRead(const vector<SlotDescriptor*>& string_slots,
-    Tuple* tuple) {
-  DCHECK(tuple != NULL);
-  for (int i = 0; i < string_slots.size(); ++i) {
-    const SlotDescriptor* slot_desc = string_slots[i];
-    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-
-    StringValue* sv = tuple->GetStringSlot(slot_desc->tuple_offset());
-    DCHECK_LE(sv->len, read_block_bytes_remaining());
-    sv->ptr = reinterpret_cast<char*>(read_ptr_);
-    read_ptr_ += sv->len;
-  }
-}
-
-void BufferedTupleStream::FixUpCollectionsForRead(const vector<SlotDescriptor*>& collection_slots,
-    Tuple* tuple) {
-  DCHECK(tuple != NULL);
-  for (int i = 0; i < collection_slots.size(); ++i) {
-    const SlotDescriptor* slot_desc = collection_slots[i];
-    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-
-    CollectionValue* cv = tuple->GetCollectionSlot(slot_desc->tuple_offset());
-    const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
-    int coll_byte_size = cv->num_tuples * item_desc.byte_size();
-    DCHECK_LE(coll_byte_size, read_block_bytes_remaining());
-    cv->ptr = reinterpret_cast<uint8_t*>(read_ptr_);
-    read_ptr_ += coll_byte_size;
-
-    if (!item_desc.HasVarlenSlots()) continue;
-    uint8_t* coll_data = cv->ptr;
-    for (int j = 0; j < cv->num_tuples; ++j) {
-      Tuple* item = reinterpret_cast<Tuple*>(coll_data);
-      FixUpStringsForRead(item_desc.string_slots(), item);
-      FixUpCollectionsForRead(item_desc.collection_slots(), item);
-      coll_data += item_desc.byte_size();
-    }
-  }
-}
-
-int64_t BufferedTupleStream::ComputeRowSize(TupleRow* row) const noexcept {
-  int64_t size = 0;
-  if (has_nullable_tuple_) {
-    for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) {
-      if (row->GetTuple(i) != NULL) size += fixed_tuple_sizes_[i];
-    }
-  } else {
-    size = fixed_tuple_row_size_;
-  }
-  for (int i = 0; i < inlined_string_slots_.size(); ++i) {
-    Tuple* tuple = row->GetTuple(inlined_string_slots_[i].first);
-    if (tuple == NULL) continue;
-    const vector<SlotDescriptor*>& slots = inlined_string_slots_[i].second;
-    for (auto it = slots.begin(); it != slots.end(); ++it) {
-      if (tuple->IsNull((*it)->null_indicator_offset())) continue;
-      size += tuple->GetStringSlot((*it)->tuple_offset())->len;
-    }
-  }
-
-  for (int i = 0; i < inlined_coll_slots_.size(); ++i) {
-    Tuple* tuple = row->GetTuple(inlined_coll_slots_[i].first);
-    if (tuple == NULL) continue;
-    const vector<SlotDescriptor*>& slots = inlined_coll_slots_[i].second;
-    for (auto it = slots.begin(); it != slots.end(); ++it) {
-      if (tuple->IsNull((*it)->null_indicator_offset())) continue;
-      CollectionValue* cv = tuple->GetCollectionSlot((*it)->tuple_offset());
-      const TupleDescriptor& item_desc = *(*it)->collection_item_descriptor();
-      size += cv->num_tuples * item_desc.byte_size();
-
-      if (!item_desc.HasVarlenSlots()) continue;
-      for (int j = 0; j < cv->num_tuples; ++j) {
-        Tuple* item = reinterpret_cast<Tuple*>(&cv->ptr[j * item_desc.byte_size()]);
-        size += item->VarlenByteSize(item_desc);
-      }
-    }
-  }
-  return size;
-}
-
-bool BufferedTupleStream::AddRowSlow(TupleRow* row, Status* status) noexcept {
-  bool got_block;
-  int64_t row_size = ComputeRowSize(row);
-  *status = NewWriteBlockForRow(row_size, &got_block);
-  if (!status->ok() || !got_block) return false;
-  return DeepCopy(row);
-}
-
-bool BufferedTupleStream::DeepCopy(TupleRow* row) noexcept {
-  if (has_nullable_tuple_) {
-    return DeepCopyInternal<true>(row);
-  } else {
-    return DeepCopyInternal<false>(row);
-  }
-}
-
-// TODO: this really needs codegen
-// TODO: in case of duplicate tuples, this can redundantly serialize data.
-template <bool HasNullableTuple>
-bool BufferedTupleStream::DeepCopyInternal(TupleRow* row) noexcept {
-  if (UNLIKELY(write_block_ == NULL)) return false;
-  DCHECK_GE(write_block_null_indicators_size_, 0);
-  DCHECK(write_block_->is_pinned()) << DebugString() << std::endl
-      << write_block_->DebugString();
-
-  const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
-  uint32_t bytes_remaining = write_block_bytes_remaining();
-  if (UNLIKELY((bytes_remaining < fixed_tuple_row_size_) ||
-              (HasNullableTuple &&
-              (write_tuple_idx_ + tuples_per_row > write_block_null_indicators_size_ * 8)))) {
-    return false;
-  }
-
-  // Copy the not NULL fixed len tuples. For the NULL tuples just update the NULL tuple
-  // indicator.
-  if (HasNullableTuple) {
-    DCHECK_GT(write_block_null_indicators_size_, 0);
-    uint8_t* null_word = NULL;
-    uint32_t null_pos = 0;
-    for (int i = 0; i < tuples_per_row; ++i) {
-      null_word = write_block_->buffer() + (write_tuple_idx_ >> 3); // / 8
-      null_pos = write_tuple_idx_ & 7;
-      ++write_tuple_idx_;
-      const int tuple_size = fixed_tuple_sizes_[i];
-      Tuple* t = row->GetTuple(i);
-      const uint8_t mask = 1 << (7 - null_pos);
-      if (t != NULL) {
-        *null_word &= ~mask;
-        memcpy(write_ptr_, t, tuple_size);
-        write_ptr_ += tuple_size;
-      } else {
-        *null_word |= mask;
-      }
-    }
-    DCHECK_LE(write_tuple_idx_ - 1, write_block_null_indicators_size_ * 8);
-  } else {
-    // If we know that there are no nullable tuples no need to set the nullability flags.
-    DCHECK_EQ(write_block_null_indicators_size_, 0);
-    for (int i = 0; i < tuples_per_row; ++i) {
-      const int tuple_size = fixed_tuple_sizes_[i];
-      Tuple* t = row->GetTuple(i);
-      // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots)
-      // is delivered, the check below should become DCHECK(t != NULL).
-      DCHECK(t != NULL || tuple_size == 0);
-      memcpy(write_ptr_, t, tuple_size);
-      write_ptr_ += tuple_size;
-    }
-  }
-
-  // Copy inlined string slots. Note: we do not need to convert the string ptrs to offsets
-  // on the write path, only on the read. The tuple data is immediately followed
-  // by the string data so only the len information is necessary.
-  for (int i = 0; i < inlined_string_slots_.size(); ++i) {
-    const Tuple* tuple = row->GetTuple(inlined_string_slots_[i].first);
-    if (HasNullableTuple && tuple == NULL) continue;
-    if (UNLIKELY(!CopyStrings(tuple, inlined_string_slots_[i].second))) return false;
-  }
-
-  // Copy inlined collection slots. We copy collection data in a well-defined order so
-  // we do not need to convert pointers to offsets on the write path.
-  for (int i = 0; i < inlined_coll_slots_.size(); ++i) {
-    const Tuple* tuple = row->GetTuple(inlined_coll_slots_[i].first);
-    if (HasNullableTuple && tuple == NULL) continue;
-    if (UNLIKELY(!CopyCollections(tuple, inlined_coll_slots_[i].second))) return false;
-  }
-
-  write_block_->AddRow();
-  ++num_rows_;
-  return true;
-}
-
-bool BufferedTupleStream::CopyStrings(const Tuple* tuple,
-    const vector<SlotDescriptor*>& string_slots) {
-  for (int i = 0; i < string_slots.size(); ++i) {
-    const SlotDescriptor* slot_desc = string_slots[i];
-    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-    const StringValue* sv = tuple->GetStringSlot(slot_desc->tuple_offset());
-    if (LIKELY(sv->len > 0)) {
-      if (UNLIKELY(write_block_bytes_remaining() < sv->len)) return false;
-
-      memcpy(write_ptr_, sv->ptr, sv->len);
-      write_ptr_ += sv->len;
-    }
-  }
-  return true;
-}
-
-bool BufferedTupleStream::CopyCollections(const Tuple* tuple,
-    const vector<SlotDescriptor*>& collection_slots) {
-  for (int i = 0; i < collection_slots.size(); ++i) {
-    const SlotDescriptor* slot_desc = collection_slots[i];
-    if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-    const CollectionValue* cv = tuple->GetCollectionSlot(slot_desc->tuple_offset());
-    const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
-    if (LIKELY(cv->num_tuples > 0)) {
-      int coll_byte_size = cv->num_tuples * item_desc.byte_size();
-      if (UNLIKELY(write_block_bytes_remaining() < coll_byte_size)) return false;
-      uint8_t* coll_data = write_ptr_;
-      memcpy(coll_data, cv->ptr, coll_byte_size);
-      write_ptr_ += coll_byte_size;
-
-      if (!item_desc.HasVarlenSlots()) continue;
-      // Copy variable length data when present in collection items.
-      for (int j = 0; j < cv->num_tuples; ++j) {
-        const Tuple* item = reinterpret_cast<Tuple*>(coll_data);
-        if (UNLIKELY(!CopyStrings(item, item_desc.string_slots()))) return false;
-        if (UNLIKELY(!CopyCollections(item, item_desc.collection_slots()))) return false;
-        coll_data += item_desc.byte_size();
-      }
-    }
-  }
-  return true;
-}
-
-void BufferedTupleStream::GetTupleRow(const RowIdx& idx, TupleRow* row) const {
-  DCHECK(row != NULL);
-  DCHECK(!closed_);
-  DCHECK(is_pinned());
-  DCHECK(!delete_on_read_);
-  DCHECK_EQ(blocks_.size(), block_start_idx_.size());
-  DCHECK_LT(idx.block(), blocks_.size());
-
-  uint8_t* data = block_start_idx_[idx.block()] + idx.offset();
-  if (has_nullable_tuple_) {
-    // Stitch together the tuples from the block and the NULL ones.
-    const int tuples_per_row = desc_->tuple_descriptors().size();
-    uint32_t tuple_idx = idx.idx() * tuples_per_row;
-    for (int i = 0; i < tuples_per_row; ++i) {
-      const uint8_t* null_word = block_start_idx_[idx.block()] + (tuple_idx >> 3);
-      const uint32_t null_pos = tuple_idx & 7;
-      const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0);
-      row->SetTuple(i, reinterpret_cast<Tuple*>(
-          reinterpret_cast<uint64_t>(data) * is_not_null));
-      data += desc_->tuple_descriptors()[i]->byte_size() * is_not_null;
-      ++tuple_idx;
-    }
-  } else {
-    for (int i = 0; i < desc_->tuple_descriptors().size(); ++i) {
-      row->SetTuple(i, reinterpret_cast<Tuple*>(data));
-      data += desc_->tuple_descriptors()[i]->byte_size();
-    }
-  }
-}


[10/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-aggregation-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.cc b/be/src/exec/partitioned-aggregation-node.cc
index 7067961..fc0a4a6 100644
--- a/be/src/exec/partitioned-aggregation-node.cc
+++ b/be/src/exec/partitioned-aggregation-node.cc
@@ -31,10 +31,12 @@
 #include "exprs/scalar-expr-evaluator.h"
 #include "exprs/slot-ref.h"
 #include "gutil/strings/substitute.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
 #include "runtime/descriptors.h"
+#include "runtime/exec-env.h"
 #include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
 #include "runtime/raw-value.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
@@ -111,7 +113,6 @@ PartitionedAggregationNode::PartitionedAggregationNode(
     needs_finalize_(tnode.agg_node.need_finalize),
     is_streaming_preagg_(tnode.agg_node.use_streaming_preaggregation),
     needs_serialize_(false),
-    block_mgr_client_(NULL),
     output_partition_(NULL),
     process_batch_no_grouping_fn_(NULL),
     process_batch_fn_(NULL),
@@ -224,24 +225,6 @@ Status PartitionedAggregationNode::Prepare(RuntimeState* state) {
     RETURN_IF_ERROR(HashTableCtx::Create(pool_, state, build_exprs_,
         grouping_exprs_, true, vector<bool>(build_exprs_.size(), true),
         state->fragment_hash_seed(), MAX_PARTITION_DEPTH, 1, expr_mem_pool(), &ht_ctx_));
-    RETURN_IF_ERROR(state_->block_mgr()->RegisterClient(
-        Substitute("PartitionedAggregationNode id=$0 ptr=$1", id_, this),
-        MinRequiredBuffers(), true, mem_tracker(), state, &block_mgr_client_));
-  }
-
-  // TODO: Is there a need to create the stream here? If memory reservations work we may
-  // be able to create this stream lazily and only whenever we need to spill.
-  if (!is_streaming_preagg_ && needs_serialize_ && block_mgr_client_ != NULL) {
-    serialize_stream_.reset(new BufferedTupleStream(state, &intermediate_row_desc_,
-        state->block_mgr(), block_mgr_client_, false /* use_initial_small_buffers */,
-        false /* read_write */));
-    RETURN_IF_ERROR(serialize_stream_->Init(id(), runtime_profile(), false));
-    bool got_buffer;
-    RETURN_IF_ERROR(serialize_stream_->PrepareForWrite(&got_buffer));
-    if (!got_buffer) {
-      return state_->block_mgr()->MemLimitTooLowError(block_mgr_client_, id());
-    }
-    DCHECK(serialize_stream_->has_write_block());
   }
   AddCodegenDisabledMessage(state);
   return Status::OK();
@@ -265,8 +248,16 @@ Status PartitionedAggregationNode::Open(RuntimeState* state) {
   SCOPED_TIMER(runtime_profile_->total_time_counter());
   // Open the child before consuming resources in this node.
   RETURN_IF_ERROR(child(0)->Open(state));
-
   RETURN_IF_ERROR(ExecNode::Open(state));
+
+  // Claim reservation after the child has been opened to reduce the peak reservation
+  // requirement.
+  if (!buffer_pool_client_.is_registered() && !grouping_exprs_.empty()) {
+    DCHECK_GE(resource_profile_.min_reservation,
+        resource_profile_.spillable_buffer_size * MinRequiredBuffers());
+    RETURN_IF_ERROR(ClaimBufferReservation(state));
+  }
+
   if (ht_ctx_.get() != nullptr) RETURN_IF_ERROR(ht_ctx_->Open(state));
   RETURN_IF_ERROR(AggFnEvaluator::Open(agg_fn_evals_, state));
   if (grouping_exprs_.empty()) {
@@ -278,6 +269,25 @@ Status PartitionedAggregationNode::Open(RuntimeState* state) {
     RETURN_IF_ERROR(state_->GetQueryStatus());
     singleton_output_tuple_returned_ = false;
   } else {
+    if (ht_allocator_ == nullptr) {
+      // Allocate 'serialize_stream_' and 'ht_allocator_' on the first Open() call.
+      ht_allocator_.reset(new Suballocator(state_->exec_env()->buffer_pool(),
+          &buffer_pool_client_, resource_profile_.spillable_buffer_size));
+
+      if (!is_streaming_preagg_ && needs_serialize_) {
+        serialize_stream_.reset(new BufferedTupleStreamV2(state, &intermediate_row_desc_,
+            &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+            resource_profile_.spillable_buffer_size));
+        RETURN_IF_ERROR(serialize_stream_->Init(id(), false));
+        bool got_buffer;
+        // Reserve the memory for 'serialize_stream_' so we don't need to scrounge up
+        // another buffer during spilling.
+        RETURN_IF_ERROR(serialize_stream_->PrepareForWrite(&got_buffer));
+        DCHECK(got_buffer)
+            << "Accounted in min reservation" << buffer_pool_client_.DebugString();
+        DCHECK(serialize_stream_->has_write_iterator());
+      }
+    }
     RETURN_IF_ERROR(CreateHashPartitions(0));
   }
 
@@ -520,9 +530,12 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state,
     bool ht_needs_expansion = false;
     for (int i = 0; i < PARTITION_FANOUT; ++i) {
       HashTable* hash_tbl = GetHashTable(i);
-      DCHECK(hash_tbl != NULL);
-      remaining_capacity[i] = hash_tbl->NumInsertsBeforeResize();
-      ht_needs_expansion |= remaining_capacity[i] < child_batch_->num_rows();
+      if (hash_tbl == nullptr) {
+        remaining_capacity[i] = 0;
+      } else {
+        remaining_capacity[i] = hash_tbl->NumInsertsBeforeResize();
+        ht_needs_expansion |= remaining_capacity[i] < child_batch_->num_rows();
+      }
     }
 
     // Stop expanding hash tables if we're not reducing the input sufficiently. As our
@@ -533,9 +546,12 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state,
     if (ht_needs_expansion && ShouldExpandPreaggHashTables()) {
       for (int i = 0; i < PARTITION_FANOUT; ++i) {
         HashTable* ht = GetHashTable(i);
-        if (remaining_capacity[i] < child_batch_->num_rows()) {
+        if (ht != nullptr && remaining_capacity[i] < child_batch_->num_rows()) {
           SCOPED_TIMER(ht_resize_timer_);
-          if (ht->CheckAndResize(child_batch_->num_rows(), ht_ctx_.get())) {
+          bool resized;
+          RETURN_IF_ERROR(
+              ht->CheckAndResize(child_batch_->num_rows(), ht_ctx_.get(), &resized));
+          if (resized) {
             remaining_capacity[i] = ht->NumInsertsBeforeResize();
           }
         }
@@ -548,7 +564,7 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state,
           child_batch_.get(), out_batch, ht_ctx_.get(), remaining_capacity));
     } else {
       RETURN_IF_ERROR(ProcessBatchStreaming(needs_serialize_, prefetch_mode,
-          child_batch_.get(), out_batch, ht_ctx_.get(), remaining_capacity ));
+          child_batch_.get(), out_batch, ht_ctx_.get(), remaining_capacity));
     }
 
     child_batch_->Reset(); // All rows from child_batch_ were processed.
@@ -557,7 +573,7 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state,
   if (child_eos_) {
     child(0)->Close(state);
     child_batch_.reset();
-    MoveHashPartitions(child(0)->rows_returned());
+    RETURN_IF_ERROR(MoveHashPartitions(child(0)->rows_returned()));
   }
 
   num_rows_returned_ += out_batch->num_rows();
@@ -570,8 +586,10 @@ bool PartitionedAggregationNode::ShouldExpandPreaggHashTables() const {
   int64_t ht_rows = 0;
   for (int i = 0; i < PARTITION_FANOUT; ++i) {
     HashTable* ht = hash_partitions_[i]->hash_tbl.get();
-    ht_mem += ht->CurrentMemSize();
-    ht_rows += ht->size();
+    if (ht != nullptr) {
+      ht_mem += ht->CurrentMemSize();
+      ht_rows += ht->size();
+    }
   }
 
   // Need some rows in tables to have valid statistics.
@@ -678,7 +696,6 @@ void PartitionedAggregationNode::Close(RuntimeState* state) {
   if (serialize_stream_.get() != nullptr) {
     serialize_stream_->Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
   }
-  if (block_mgr_client_ != nullptr) state->block_mgr()->ClearReservations(block_mgr_client_);
   ScalarExpr::Close(grouping_exprs_);
   ScalarExpr::Close(build_exprs_);
   AggFn::Close(agg_fns_);
@@ -705,56 +722,55 @@ Status PartitionedAggregationNode::Partition::InitStreams() {
     }
   }
 
-  aggregated_row_stream.reset(new BufferedTupleStream(parent->state_,
-      &parent->intermediate_row_desc_, parent->state_->block_mgr(),
-      parent->block_mgr_client_, true /* use_initial_small_buffers */,
-      false /* read_write */, external_varlen_slots));
-  RETURN_IF_ERROR(
-      aggregated_row_stream->Init(parent->id(), parent->runtime_profile(), true));
+  aggregated_row_stream.reset(new BufferedTupleStreamV2(parent->state_,
+      &parent->intermediate_row_desc_, &parent->buffer_pool_client_,
+      parent->resource_profile_.spillable_buffer_size,
+      parent->resource_profile_.spillable_buffer_size, external_varlen_slots));
+  RETURN_IF_ERROR(aggregated_row_stream->Init(parent->id(), true));
   bool got_buffer;
   RETURN_IF_ERROR(aggregated_row_stream->PrepareForWrite(&got_buffer));
   if (!got_buffer) {
-    return parent->state_->block_mgr()->MemLimitTooLowError(
-        parent->block_mgr_client_, parent->id());
+    stringstream ss;
+    parent->DebugString(2, &ss);
+    DCHECK(parent->is_streaming_preagg_)
+        << "Merge agg should have enough reservation " << parent->id_ << "\n"
+        << parent->buffer_pool_client_.DebugString() << "\n"
+        << ss.str();
+    DiscardAggregatedRowStream();
   }
 
   if (!parent->is_streaming_preagg_) {
-    unaggregated_row_stream.reset(new BufferedTupleStream(parent->state_,
-        parent->child(0)->row_desc(), parent->state_->block_mgr(),
-      parent->block_mgr_client_, true /* use_initial_small_buffers */,
-        false /* read_write */));
+    unaggregated_row_stream.reset(new BufferedTupleStreamV2(parent->state_,
+        parent->child(0)->row_desc(), &parent->buffer_pool_client_,
+        parent->resource_profile_.spillable_buffer_size,
+        parent->resource_profile_.spillable_buffer_size));
     // This stream is only used to spill, no need to ever have this pinned.
-    RETURN_IF_ERROR(unaggregated_row_stream->Init(parent->id(), parent->runtime_profile(),
-        false));
-    // TODO: allocate this buffer later only if we spill the partition.
-    RETURN_IF_ERROR(unaggregated_row_stream->PrepareForWrite(&got_buffer));
-    if (!got_buffer) {
-      return parent->state_->block_mgr()->MemLimitTooLowError(
-          parent->block_mgr_client_, parent->id());
-    }
-    DCHECK(unaggregated_row_stream->has_write_block());
+    RETURN_IF_ERROR(unaggregated_row_stream->Init(parent->id(), false));
+    // Save memory by waiting until we spill to allocate the write buffer for the
+    // unaggregated row stream.
+    DCHECK(!unaggregated_row_stream->has_write_iterator());
   }
   return Status::OK();
 }
 
-bool PartitionedAggregationNode::Partition::InitHashTable() {
-  DCHECK(hash_tbl.get() == NULL);
+Status PartitionedAggregationNode::Partition::InitHashTable(bool* got_memory) {
+  DCHECK(aggregated_row_stream != nullptr);
+  DCHECK(hash_tbl == nullptr);
   // We use the upper PARTITION_FANOUT num bits to pick the partition so only the
   // remaining bits can be used for the hash table.
   // TODO: we could switch to 64 bit hashes and then we don't need a max size.
   // It might be reasonable to limit individual hash table size for other reasons
   // though. Always start with small buffers.
-  hash_tbl.reset(HashTable::Create(parent->state_, parent->block_mgr_client_,
-      false, 1, NULL, 1L << (32 - NUM_PARTITIONING_BITS),
-      PAGG_DEFAULT_HASH_TABLE_SZ));
+  hash_tbl.reset(HashTable::Create(parent->ht_allocator_.get(), false, 1, nullptr,
+      1L << (32 - NUM_PARTITIONING_BITS), PAGG_DEFAULT_HASH_TABLE_SZ));
   // Please update the error message in CreateHashPartitions() if initial size of
   // hash table changes.
-  return hash_tbl->Init();
+  return hash_tbl->Init(got_memory);
 }
 
 Status PartitionedAggregationNode::Partition::SerializeStreamForSpilling() {
   DCHECK(!parent->is_streaming_preagg_);
-  if (parent->needs_serialize_ && aggregated_row_stream->num_rows() != 0) {
+  if (parent->needs_serialize_) {
     // We need to do a lot more work in this case. This step effectively does a merge
     // aggregation in this node. We need to serialize the intermediates, spill the
     // intermediates and then feed them into the aggregate function's merge step.
@@ -767,70 +783,69 @@ Status PartitionedAggregationNode::Partition::SerializeStreamForSpilling() {
     // for those UDAs.
     DCHECK(parent->serialize_stream_.get() != NULL);
     DCHECK(!parent->serialize_stream_->is_pinned());
-    DCHECK(parent->serialize_stream_->has_write_block());
 
     // Serialize and copy the spilled partition's stream into the new stream.
-    Status status = Status::OK();
-    bool failed_to_add = false;
-    BufferedTupleStream* new_stream = parent->serialize_stream_.get();
+    Status status;
+    BufferedTupleStreamV2* new_stream = parent->serialize_stream_.get();
     HashTable::Iterator it = hash_tbl->Begin(parent->ht_ctx_.get());
     while (!it.AtEnd()) {
       Tuple* tuple = it.GetTuple();
       it.Next();
       AggFnEvaluator::Serialize(agg_fn_evals, tuple);
       if (UNLIKELY(!new_stream->AddRow(reinterpret_cast<TupleRow*>(&tuple), &status))) {
-        failed_to_add = true;
-        break;
+        DCHECK(!status.ok()) << "Stream was unpinned - AddRow() only fails on error";
+        // Even if we can't add to new_stream, finish up processing this agg stream to make
+        // clean up easier (someone has to finalize this stream and we don't want to remember
+        // where we are).
+        parent->CleanupHashTbl(agg_fn_evals, it);
+        hash_tbl->Close();
+        hash_tbl.reset();
+        aggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+        return status;
       }
     }
 
-    // Even if we can't add to new_stream, finish up processing this agg stream to make
-    // clean up easier (someone has to finalize this stream and we don't want to remember
-    // where we are).
-    if (failed_to_add) {
-      parent->CleanupHashTbl(agg_fn_evals, it);
-      hash_tbl->Close();
-      hash_tbl.reset();
-      aggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-      RETURN_IF_ERROR(status);
-      return parent->state_->block_mgr()->MemLimitTooLowError(parent->block_mgr_client_,
-          parent->id());
-    }
-    DCHECK(status.ok());
-
     aggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
     aggregated_row_stream.swap(parent->serialize_stream_);
     // Recreate the serialize_stream (and reserve 1 buffer) now in preparation for
     // when we need to spill again. We need to have this available before we need
     // to spill to make sure it is available. This should be acquirable since we just
     // freed at least one buffer from this partition's (old) aggregated_row_stream.
-    parent->serialize_stream_.reset(
-        new BufferedTupleStream(parent->state_, &parent->intermediate_row_desc_,
-            parent->state_->block_mgr(), parent->block_mgr_client_,
-            false /* use_initial_small_buffers */, false /* read_write */));
-    status = parent->serialize_stream_->Init(parent->id(), parent->runtime_profile(),
-        false);
+    parent->serialize_stream_.reset(new BufferedTupleStreamV2(parent->state_,
+        &parent->intermediate_row_desc_, &parent->buffer_pool_client_,
+        parent->resource_profile_.spillable_buffer_size,
+        parent->resource_profile_.spillable_buffer_size));
+    status = parent->serialize_stream_->Init(parent->id(), false);
     if (status.ok()) {
       bool got_buffer;
       status = parent->serialize_stream_->PrepareForWrite(&got_buffer);
-      if (status.ok() && !got_buffer) {
-        status = parent->state_->block_mgr()->MemLimitTooLowError(
-            parent->block_mgr_client_, parent->id());
-      }
+      DCHECK(!status.ok() || got_buffer) << "Accounted in min reservation";
     }
     if (!status.ok()) {
       hash_tbl->Close();
       hash_tbl.reset();
       return status;
     }
-    DCHECK(parent->serialize_stream_->has_write_block());
+    DCHECK(parent->serialize_stream_->has_write_iterator());
   }
   return Status::OK();
 }
 
-Status PartitionedAggregationNode::Partition::Spill() {
+void PartitionedAggregationNode::Partition::DiscardAggregatedRowStream() {
+  DCHECK(parent->is_streaming_preagg_);
+  DCHECK(aggregated_row_stream != nullptr);
+  DCHECK_EQ(aggregated_row_stream->num_rows(), 0);
+  if (hash_tbl != nullptr) hash_tbl->Close();
+  hash_tbl.reset();
+  aggregated_row_stream->Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+  aggregated_row_stream.reset();
+}
+
+Status PartitionedAggregationNode::Partition::Spill(bool more_aggregate_rows) {
+  DCHECK(!parent->is_streaming_preagg_);
   DCHECK(!is_closed);
   DCHECK(!is_spilled());
+  RETURN_IF_ERROR(parent->state_->StartSpilling(parent->mem_tracker()));
 
   RETURN_IF_ERROR(SerializeStreamForSpilling());
 
@@ -846,34 +861,18 @@ Status PartitionedAggregationNode::Partition::Spill() {
   hash_tbl->Close();
   hash_tbl.reset();
 
-  // Try to switch both streams to IO-sized buffers to avoid allocating small buffers
-  // for spilled partition.
-  bool got_buffer = true;
-  if (aggregated_row_stream->using_small_buffers()) {
-    RETURN_IF_ERROR(aggregated_row_stream->SwitchToIoBuffers(&got_buffer));
-  }
-  // Unpin the stream as soon as possible to increase the chances that the
-  // SwitchToIoBuffers() call below will succeed.  If we're repartitioning, rows that
-  // were already aggregated (rows from the input partition's aggregated stream) will
-  // need to be added to this hash partition's aggregated stream, so we need to leave
-  // the write block pinned.
-  // TODO: when not repartitioning, don't leave the write block pinned.
-  DCHECK(!got_buffer || aggregated_row_stream->has_write_block())
-      << aggregated_row_stream->DebugString();
-  RETURN_IF_ERROR(
-      aggregated_row_stream->UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
-
-  if (got_buffer && unaggregated_row_stream->using_small_buffers()) {
-    RETURN_IF_ERROR(unaggregated_row_stream->SwitchToIoBuffers(&got_buffer));
-  }
-  if (!got_buffer) {
-    // We'll try again to get the buffers when the stream fills up the small buffers.
-    VLOG_QUERY << "Not enough memory to switch to IO-sized buffer for partition "
-               << this << " of agg=" << parent->id_ << " agg small buffers="
-               << aggregated_row_stream->using_small_buffers()
-               << " unagg small buffers="
-               << unaggregated_row_stream->using_small_buffers();
-    VLOG_FILE << GetStackTrace();
+  // Unpin the stream to free memory, but leave a write buffer in place so we can
+  // continue appending rows to one of the streams in the partition.
+  DCHECK(aggregated_row_stream->has_write_iterator());
+  DCHECK(!unaggregated_row_stream->has_write_iterator());
+  if (more_aggregate_rows) {
+    aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
+  } else {
+    aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+    bool got_buffer;
+    RETURN_IF_ERROR(unaggregated_row_stream->PrepareForWrite(&got_buffer));
+    DCHECK(got_buffer)
+        << "Accounted in min reservation" << parent->buffer_pool_client_.DebugString();
   }
 
   COUNTER_ADD(parent->num_spilled_partitions_, 1);
@@ -933,33 +932,27 @@ Tuple* PartitionedAggregationNode::ConstructIntermediateTuple(
 }
 
 Tuple* PartitionedAggregationNode::ConstructIntermediateTuple(
-    const vector<AggFnEvaluator*>& agg_fn_evals, BufferedTupleStream* stream,
+    const vector<AggFnEvaluator*>& agg_fn_evals, BufferedTupleStreamV2* stream,
     Status* status) noexcept {
   DCHECK(stream != NULL && status != NULL);
   // Allocate space for the entire tuple in the stream.
   const int fixed_size = intermediate_tuple_desc_->byte_size();
   const int varlen_size = GroupingExprsVarlenSize();
-  uint8_t* varlen_buffer;
-  uint8_t* fixed_buffer = stream->AllocateRow(fixed_size, varlen_size, &varlen_buffer,
-      status);
-  if (UNLIKELY(fixed_buffer == NULL)) {
-    if (!status->ok() || !stream->using_small_buffers()) return NULL;
-    // IMPALA-2352: Make a best effort to switch to IO buffers and re-allocate.
-    // If SwitchToIoBuffers() fails the caller of this function can try to free
-    // some space, e.g. through spilling, and re-attempt to allocate space for
-    // this row.
-    bool got_buffer;
-    *status = stream->SwitchToIoBuffers(&got_buffer);
-    if (!status->ok() || !got_buffer) return NULL;
-    fixed_buffer = stream->AllocateRow(fixed_size, varlen_size, &varlen_buffer, status);
-    if (fixed_buffer == NULL) return NULL;
-  }
-
-  Tuple* intermediate_tuple = reinterpret_cast<Tuple*>(fixed_buffer);
-  intermediate_tuple->Init(fixed_size);
-  CopyGroupingValues(intermediate_tuple, varlen_buffer, varlen_size);
-  InitAggSlots(agg_fn_evals, intermediate_tuple);
-  return intermediate_tuple;
+  const int tuple_size = fixed_size + varlen_size;
+  uint8_t* tuple_data = stream->AddRowCustomBegin(tuple_size, status);
+  if (UNLIKELY(tuple_data == nullptr)) {
+    // If we failed to allocate and did not hit an error (indicated by a non-ok status),
+    // the caller of this function can try to free some space, e.g. through spilling, and
+    // re-attempt to allocate space for this row.
+    return nullptr;
+  }
+  Tuple* tuple = reinterpret_cast<Tuple*>(tuple_data);
+  tuple->Init(fixed_size);
+  uint8_t* varlen_buffer = tuple_data + fixed_size;
+  CopyGroupingValues(tuple, varlen_buffer, varlen_size);
+  InitAggSlots(agg_fn_evals, tuple);
+  stream->AddRowCustomEnd(tuple_size);
+  return tuple;
 }
 
 int PartitionedAggregationNode::GroupingExprsVarlenSize() {
@@ -1079,30 +1072,30 @@ Tuple* PartitionedAggregationNode::GetOutputTuple(
   return dst;
 }
 
-Status PartitionedAggregationNode::AppendSpilledRow(BufferedTupleStream* stream,
-    TupleRow* row) {
-  DCHECK(stream != NULL);
+template <bool AGGREGATED_ROWS>
+Status PartitionedAggregationNode::AppendSpilledRow(
+    Partition* __restrict__ partition, TupleRow* __restrict__ row) {
+  DCHECK(!is_streaming_preagg_);
+  DCHECK(partition->is_spilled());
+  BufferedTupleStreamV2* stream = AGGREGATED_ROWS ?
+      partition->aggregated_row_stream.get() :
+      partition->unaggregated_row_stream.get();
   DCHECK(!stream->is_pinned());
-  DCHECK(stream->has_write_block());
-  if (LIKELY(stream->AddRow(row, &process_batch_status_))) return Status::OK();
+  Status status;
+  if (LIKELY(stream->AddRow(row, &status))) return Status::OK();
+  RETURN_IF_ERROR(status);
 
-  // Adding fails iff either we hit an error or haven't switched to I/O buffers.
-  RETURN_IF_ERROR(process_batch_status_);
+  // Keep trying to free memory by spilling until we succeed or hit an error.
+  // Running out of partitions to spill is treated as an error by SpillPartition().
   while (true) {
-    bool got_buffer;
-    RETURN_IF_ERROR(stream->SwitchToIoBuffers(&got_buffer));
-    if (got_buffer) break;
-    RETURN_IF_ERROR(SpillPartition());
+    RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS));
+    if (stream->AddRow(row, &status)) return Status::OK();
+    RETURN_IF_ERROR(status);
   }
-
-  // Adding the row should succeed after the I/O buffer switch.
-  if (stream->AddRow(row, &process_batch_status_)) return Status::OK();
-  DCHECK(!process_batch_status_.ok());
-  return process_batch_status_;
 }
 
-void PartitionedAggregationNode::DebugString(int indentation_level,
-    stringstream* out) const {
+void PartitionedAggregationNode::DebugString(
+    int indentation_level, stringstream* out) const {
   *out << string(indentation_level * 2, ' ');
   *out << "PartitionedAggregationNode("
        << "intermediate_tuple_id=" << intermediate_tuple_id_
@@ -1114,85 +1107,100 @@ void PartitionedAggregationNode::DebugString(int indentation_level,
   *out << ")";
 }
 
-Status PartitionedAggregationNode::CreateHashPartitions(int level) {
+Status PartitionedAggregationNode::CreateHashPartitions(
+    int level, int single_partition_idx) {
   if (is_streaming_preagg_) DCHECK_EQ(level, 0);
   if (UNLIKELY(level >= MAX_PARTITION_DEPTH)) {
-    return Status(TErrorCode::PARTITIONED_AGG_MAX_PARTITION_DEPTH, id_, MAX_PARTITION_DEPTH);
+    return Status(
+        TErrorCode::PARTITIONED_AGG_MAX_PARTITION_DEPTH, id_, MAX_PARTITION_DEPTH);
   }
   ht_ctx_->set_level(level);
 
   DCHECK(hash_partitions_.empty());
+  int num_partitions_created = 0;
   for (int i = 0; i < PARTITION_FANOUT; ++i) {
-    Partition* new_partition = new Partition(this, level);
-    DCHECK(new_partition != NULL);
-    hash_partitions_.push_back(partition_pool_->Add(new_partition));
-    RETURN_IF_ERROR(new_partition->InitStreams());
-    hash_tbls_[i] = NULL;
-  }
-  if (!is_streaming_preagg_) {
-    DCHECK_GT(state_->block_mgr()->num_reserved_buffers_remaining(block_mgr_client_), 0);
+    hash_tbls_[i] = nullptr;
+    if (single_partition_idx == -1 || i == single_partition_idx) {
+      Partition* new_partition = partition_pool_->Add(new Partition(this, level, i));
+      ++num_partitions_created;
+      hash_partitions_.push_back(new_partition);
+      RETURN_IF_ERROR(new_partition->InitStreams());
+    } else {
+      hash_partitions_.push_back(nullptr);
+    }
   }
-
   // Now that all the streams are reserved (meaning we have enough memory to execute
   // the algorithm), allocate the hash tables. These can fail and we can still continue.
   for (int i = 0; i < PARTITION_FANOUT; ++i) {
-    if (UNLIKELY(!hash_partitions_[i]->InitHashTable())) {
-      // We don't spill on preaggregations. If we have so little memory that we can't
-      // allocate small hash tables, the mem limit is just too low.
-      if (is_streaming_preagg_) {
-        int64_t alloc_size = PAGG_DEFAULT_HASH_TABLE_SZ * HashTable::BucketSize();
-        string details = Substitute("Cannot perform aggregation at node with id $0."
-            " Failed to initialize hash table in preaggregation. The memory limit"
-            " is too low to execute the query.", id_);
-        return mem_tracker()->MemLimitExceeded(state_, details, alloc_size);
+    Partition* partition = hash_partitions_[i];
+    if (partition == nullptr) continue;
+    if (partition->aggregated_row_stream == nullptr) {
+      // Failed to create the aggregated row stream - cannot create a hash table.
+      // Just continue with a NULL hash table so rows will be passed through.
+      DCHECK(is_streaming_preagg_);
+    } else {
+      bool got_memory;
+      RETURN_IF_ERROR(partition->InitHashTable(&got_memory));
+      // Spill the partition if we cannot create a hash table for a merge aggregation.
+      if (UNLIKELY(!got_memory)) {
+        if (is_streaming_preagg_) {
+          partition->DiscardAggregatedRowStream();
+        } else {
+          // If we're repartitioning, we will be writing aggregated rows first.
+          RETURN_IF_ERROR(partition->Spill(level > 0));
+        }
       }
-      RETURN_IF_ERROR(hash_partitions_[i]->Spill());
     }
-    hash_tbls_[i] = hash_partitions_[i]->hash_tbl.get();
+    hash_tbls_[i] = partition->hash_tbl.get();
   }
 
-  COUNTER_ADD(partitions_created_, hash_partitions_.size());
+  COUNTER_ADD(partitions_created_, num_partitions_created);
   if (!is_streaming_preagg_) {
     COUNTER_SET(max_partition_level_, level);
   }
   return Status::OK();
 }
 
-Status PartitionedAggregationNode::CheckAndResizeHashPartitions(int num_rows,
-    const HashTableCtx* ht_ctx) {
+Status PartitionedAggregationNode::CheckAndResizeHashPartitions(
+    bool partitioning_aggregated_rows, int num_rows, const HashTableCtx* ht_ctx) {
   DCHECK(!is_streaming_preagg_);
   for (int i = 0; i < PARTITION_FANOUT; ++i) {
     Partition* partition = hash_partitions_[i];
+    if (partition == nullptr) continue;
     while (!partition->is_spilled()) {
       {
         SCOPED_TIMER(ht_resize_timer_);
-        if (partition->hash_tbl->CheckAndResize(num_rows, ht_ctx)) break;
+        bool resized;
+        RETURN_IF_ERROR(partition->hash_tbl->CheckAndResize(num_rows, ht_ctx, &resized));
+        if (resized) break;
       }
-      RETURN_IF_ERROR(SpillPartition());
+      RETURN_IF_ERROR(SpillPartition(partitioning_aggregated_rows));
     }
   }
   return Status::OK();
 }
 
 int64_t PartitionedAggregationNode::LargestSpilledPartition() const {
+  DCHECK(!is_streaming_preagg_);
   int64_t max_rows = 0;
   for (int i = 0; i < hash_partitions_.size(); ++i) {
     Partition* partition = hash_partitions_[i];
-    if (partition->is_closed || !partition->is_spilled()) continue;
-    int64_t rows = partition->aggregated_row_stream->num_rows() +
-        partition->unaggregated_row_stream->num_rows();
+    if (partition == nullptr || partition->is_closed || !partition->is_spilled()) {
+      continue;
+    }
+    int64_t rows = partition->aggregated_row_stream->num_rows()
+        + partition->unaggregated_row_stream->num_rows();
     if (rows > max_rows) max_rows = rows;
   }
   return max_rows;
 }
 
 Status PartitionedAggregationNode::NextPartition() {
-  DCHECK(output_partition_ == NULL);
+  DCHECK(output_partition_ == nullptr);
 
   // Keep looping until we get to a partition that fits in memory.
-  Partition* partition = NULL;
+  Partition* partition = nullptr;
   while (true) {
-    partition = NULL;
     // First return partitions that are fully aggregated (and in memory).
     if (!aggregated_partitions_.empty()) {
       partition = aggregated_partitions_.front();
@@ -1201,56 +1209,23 @@ Status PartitionedAggregationNode::NextPartition() {
       break;
     }
 
-    if (partition == NULL) {
-      DCHECK(!spilled_partitions_.empty());
-      DCHECK(!is_streaming_preagg_);
-      DCHECK_EQ(state_->block_mgr()->num_pinned_buffers(block_mgr_client_),
-          needs_serialize_ ? 1 : 0);
-
-      // TODO: we can probably do better than just picking the first partition. We
-      // can base this on the amount written to disk, etc.
-      partition = spilled_partitions_.front();
-      DCHECK(partition->is_spilled());
-
-      // Create the new hash partitions to repartition into.
-      // TODO: we don't need to repartition here. We are now working on 1 / FANOUT
-      // of the input so it's reasonably likely it can fit. We should look at this
-      // partitions size and just do the aggregation if it fits in memory.
-      RETURN_IF_ERROR(CreateHashPartitions(partition->level + 1));
-      COUNTER_ADD(num_repartitions_, 1);
-
-      // Rows in this partition could have been spilled into two streams, depending
-      // on if it is an aggregated intermediate, or an unaggregated row.
-      // Note: we must process the aggregated rows first to save a hash table lookup
-      // in ProcessBatch().
-      RETURN_IF_ERROR(ProcessStream<true>(partition->aggregated_row_stream.get()));
-      RETURN_IF_ERROR(ProcessStream<false>(partition->unaggregated_row_stream.get()));
-
-      COUNTER_ADD(num_row_repartitioned_, partition->aggregated_row_stream->num_rows());
-      COUNTER_ADD(num_row_repartitioned_,
-          partition->unaggregated_row_stream->num_rows());
+    // No aggregated partitions in memory - we should not be using any reservation aside
+    // from 'serialize_stream_'.
+    DCHECK_EQ(serialize_stream_ != nullptr ? serialize_stream_->BytesPinned(false) : 0,
+        buffer_pool_client_.GetUsedReservation()) << buffer_pool_client_.DebugString();
 
-      partition->Close(false);
-      spilled_partitions_.pop_front();
-
-      // Done processing this partition. Move the new partitions into
-      // spilled_partitions_/aggregated_partitions_.
-      int64_t num_input_rows = partition->aggregated_row_stream->num_rows() +
-          partition->unaggregated_row_stream->num_rows();
-
-      // Check if there was any reduction in the size of partitions after repartitioning.
-      int64_t largest_partition = LargestSpilledPartition();
-      DCHECK_GE(num_input_rows, largest_partition) << "Cannot have a partition with "
-          "more rows than the input";
-      if (UNLIKELY(num_input_rows == largest_partition)) {
-        return Status(TErrorCode::PARTITIONED_AGG_REPARTITION_FAILS, id_,
-            partition->level + 1, num_input_rows);
-      }
-      RETURN_IF_ERROR(MoveHashPartitions(num_input_rows));
-    }
-  }
+    // Try to fit a single spilled partition in memory. We can often do this because
+    // we only need to fit 1/PARTITION_FANOUT of the data in memory.
+    // TODO: in some cases when the partition probably won't fit in memory it could
+    // be better to skip directly to repartitioning.
+    RETURN_IF_ERROR(BuildSpilledPartition(&partition));
+    if (partition != nullptr) break;
 
-  DCHECK(partition->hash_tbl.get() != NULL);
+    // If we can't fit the partition in memory, repartition it.
+    RETURN_IF_ERROR(RepartitionSpilledPartition());
+  }
+  DCHECK(!partition->is_spilled());
+  DCHECK(partition->hash_tbl.get() != nullptr);
   DCHECK(partition->aggregated_row_stream->is_pinned());
 
   output_partition_ = partition;
@@ -1259,8 +1234,105 @@ Status PartitionedAggregationNode::NextPartition() {
   return Status::OK();
 }
 
-template<bool AGGREGATED_ROWS>
-Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream* input_stream) {
+Status PartitionedAggregationNode::BuildSpilledPartition(Partition** built_partition) {
+  DCHECK(!spilled_partitions_.empty());
+  DCHECK(!is_streaming_preagg_);
+  // Leave the partition in 'spilled_partitions_' to be closed if we hit an error.
+  Partition* src_partition = spilled_partitions_.front();
+  DCHECK(src_partition->is_spilled());
+
+  // Create a new hash partition from the rows of the spilled partition. This is simpler
+  // than trying to finish building a partially-built partition in place. We only
+  // initialise one hash partition that all rows in 'src_partition' will hash to.
+  RETURN_IF_ERROR(CreateHashPartitions(src_partition->level, src_partition->idx));
+  Partition* dst_partition = hash_partitions_[src_partition->idx];
+  DCHECK(dst_partition != nullptr);
+
+  // Rebuild the hash table over spilled aggregate rows then start adding unaggregated
+  // rows to the hash table. It's possible the partition will spill at either stage.
+  // In that case we need to finish processing 'src_partition' so that all rows are
+  // appended to 'dst_partition'.
+  // TODO: if the partition spills again but the aggregation reduces the input
+  // significantly, we could do better here by keeping the incomplete hash table in
+  // memory and only spilling unaggregated rows that didn't fit in the hash table
+  // (somewhat similar to the passthrough pre-aggregation).
+  RETURN_IF_ERROR(ProcessStream<true>(src_partition->aggregated_row_stream.get()));
+  RETURN_IF_ERROR(ProcessStream<false>(src_partition->unaggregated_row_stream.get()));
+  src_partition->Close(false);
+  spilled_partitions_.pop_front();
+  hash_partitions_.clear();
+
+  if (dst_partition->is_spilled()) {
+    PushSpilledPartition(dst_partition);
+    *built_partition = nullptr;
+    // Spilled the partition - we should not be using any reservation except from
+    // 'serialize_stream_'.
+    DCHECK_EQ(serialize_stream_ != nullptr ? serialize_stream_->BytesPinned(false) : 0,
+        buffer_pool_client_.GetUsedReservation()) << buffer_pool_client_.DebugString();
+  } else {
+    *built_partition = dst_partition;
+  }
+  return Status::OK();
+}
+
+Status PartitionedAggregationNode::RepartitionSpilledPartition() {
+  DCHECK(!spilled_partitions_.empty());
+  DCHECK(!is_streaming_preagg_);
+  // Leave the partition in 'spilled_partitions_' to be closed if we hit an error.
+  Partition* partition = spilled_partitions_.front();
+  DCHECK(partition->is_spilled());
+
+  // Create the new hash partitions to repartition into. This will allocate a
+  // write buffer for each partition's aggregated row stream.
+  RETURN_IF_ERROR(CreateHashPartitions(partition->level + 1));
+  COUNTER_ADD(num_repartitions_, 1);
+
+  // Rows in this partition could have been spilled into two streams, depending
+  // on if it is an aggregated intermediate, or an unaggregated row. Aggregated
+  // rows are processed first to save a hash table lookup in ProcessBatch().
+  RETURN_IF_ERROR(ProcessStream<true>(partition->aggregated_row_stream.get()));
+
+  // Prepare write buffers so we can append spilled rows to unaggregated partitions.
+  for (Partition* hash_partition : hash_partitions_) {
+    if (!hash_partition->is_spilled()) continue;
+    // The aggregated rows have been repartitioned. Free up at least a buffer's worth of
+    // reservation and use it to pin the unaggregated write buffer.
+    hash_partition->aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+    bool got_buffer;
+    RETURN_IF_ERROR(
+        hash_partition->unaggregated_row_stream->PrepareForWrite(&got_buffer));
+    DCHECK(got_buffer)
+        << "Accounted in min reservation" << buffer_pool_client_.DebugString();
+  }
+  RETURN_IF_ERROR(ProcessStream<false>(partition->unaggregated_row_stream.get()));
+
+  COUNTER_ADD(num_row_repartitioned_, partition->aggregated_row_stream->num_rows());
+  COUNTER_ADD(num_row_repartitioned_, partition->unaggregated_row_stream->num_rows());
+
+  partition->Close(false);
+  spilled_partitions_.pop_front();
+
+  // Done processing this partition. Move the new partitions into
+  // spilled_partitions_/aggregated_partitions_.
+  int64_t num_input_rows = partition->aggregated_row_stream->num_rows()
+      + partition->unaggregated_row_stream->num_rows();
+
+  // Check if there was any reduction in the size of partitions after repartitioning.
+  int64_t largest_partition = LargestSpilledPartition();
+  DCHECK_GE(num_input_rows, largest_partition) << "Partition had more rows than input";
+  if (UNLIKELY(num_input_rows == largest_partition)) {
+    stringstream ss;
+    DebugString(2, &ss);
+    return Status(TErrorCode::PARTITIONED_AGG_REPARTITION_FAILS, id_,
+        partition->level + 1, num_input_rows, buffer_pool_client_.DebugString(),
+        ss.str());
+  }
+  RETURN_IF_ERROR(MoveHashPartitions(num_input_rows));
+  return Status::OK();
+}
+
+template <bool AGGREGATED_ROWS>
+Status PartitionedAggregationNode::ProcessStream(BufferedTupleStreamV2* input_stream) {
   DCHECK(!is_streaming_preagg_);
   if (input_stream->num_rows() > 0) {
     while (true) {
@@ -1268,7 +1340,7 @@ Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream* input_stre
       RETURN_IF_ERROR(input_stream->PrepareForRead(true, &got_buffer));
       if (got_buffer) break;
       // Did not have a buffer to read the input stream. Spill and try again.
-      RETURN_IF_ERROR(SpillPartition());
+      RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS));
     }
 
     TPrefetchMode::type prefetch_mode = state_->query_options().prefetch_mode;
@@ -1288,16 +1360,17 @@ Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream* input_stre
   return Status::OK();
 }
 
-Status PartitionedAggregationNode::SpillPartition() {
+Status PartitionedAggregationNode::SpillPartition(bool more_aggregate_rows) {
   int64_t max_freed_mem = 0;
   int partition_idx = -1;
 
   // Iterate over the partitions and pick the largest partition that is not spilled.
   for (int i = 0; i < hash_partitions_.size(); ++i) {
+    if (hash_partitions_[i] == nullptr) continue;
     if (hash_partitions_[i]->is_closed) continue;
     if (hash_partitions_[i]->is_spilled()) continue;
     // Pass 'true' because we need to keep the write block pinned. See Partition::Spill().
-    int64_t mem = hash_partitions_[i]->aggregated_row_stream->bytes_in_mem(true);
+    int64_t mem = hash_partitions_[i]->aggregated_row_stream->BytesPinned(true);
     mem += hash_partitions_[i]->hash_tbl->ByteSize();
     mem += hash_partitions_[i]->agg_fn_pool->total_reserved_bytes();
     DCHECK_GT(mem, 0); // At least the hash table buckets should occupy memory.
@@ -1306,26 +1379,26 @@ Status PartitionedAggregationNode::SpillPartition() {
       partition_idx = i;
     }
   }
-  if (partition_idx == -1) {
-    // Could not find a partition to spill. This means the mem limit was just too low.
-    return state_->block_mgr()->MemLimitTooLowError(block_mgr_client_, id());
-  }
-
+  DCHECK_NE(partition_idx, -1) << "Should have been able to spill a partition to "
+                               << "reclaim memory: " << buffer_pool_client_.DebugString();
   hash_tbls_[partition_idx] = NULL;
-  return hash_partitions_[partition_idx]->Spill();
+  return hash_partitions_[partition_idx]->Spill(more_aggregate_rows);
 }
 
 Status PartitionedAggregationNode::MoveHashPartitions(int64_t num_input_rows) {
   DCHECK(!hash_partitions_.empty());
   stringstream ss;
-  ss << "PA(node_id=" << id() << ") partitioned(level="
-     << hash_partitions_[0]->level << ") "
-     << num_input_rows << " rows into:" << endl;
+  ss << "PA(node_id=" << id() << ") partitioned(level=" << hash_partitions_[0]->level
+     << ") " << num_input_rows << " rows into:" << endl;
   for (int i = 0; i < hash_partitions_.size(); ++i) {
     Partition* partition = hash_partitions_[i];
-    int64_t aggregated_rows = partition->aggregated_row_stream->num_rows();
+    if (partition == nullptr) continue;
+    int64_t aggregated_rows = 0;
+    if (partition->aggregated_row_stream != nullptr) {
+      aggregated_rows = partition->aggregated_row_stream->num_rows();
+    }
     int64_t unaggregated_rows = 0;
-    if (partition->unaggregated_row_stream != NULL) {
+    if (partition->unaggregated_row_stream != nullptr) {
       unaggregated_rows = partition->unaggregated_row_stream->num_rows();
     }
     double total_rows = aggregated_rows + unaggregated_rows;
@@ -1341,54 +1414,46 @@ Status PartitionedAggregationNode::MoveHashPartitions(int64_t num_input_rows) {
     if (total_rows == 0) {
       partition->Close(false);
     } else if (partition->is_spilled()) {
-      DCHECK(partition->hash_tbl.get() == NULL);
-      // We need to unpin all the spilled partitions to make room to allocate new
-      // hash_partitions_ when we repartition the spilled partitions.
-      // TODO: we only need to do this when we have memory pressure. This might be
-      // okay though since the block mgr should only write these to disk if there
-      // is memory pressure.
-      RETURN_IF_ERROR(
-          partition->aggregated_row_stream->UnpinStream(BufferedTupleStream::UNPIN_ALL));
-      RETURN_IF_ERROR(partition->unaggregated_row_stream->UnpinStream(
-          BufferedTupleStream::UNPIN_ALL));
-
-      // Push new created partitions at the front. This means a depth first walk
-      // (more finely partitioned partitions are processed first). This allows us
-      // to delete blocks earlier and bottom out the recursion earlier.
-      spilled_partitions_.push_front(partition);
+      PushSpilledPartition(partition);
     } else {
       aggregated_partitions_.push_back(partition);
     }
-
   }
   VLOG(2) << ss.str();
   hash_partitions_.clear();
   return Status::OK();
 }
 
+void PartitionedAggregationNode::PushSpilledPartition(Partition* partition) {
+  DCHECK(partition->is_spilled());
+  DCHECK(partition->hash_tbl == nullptr);
+  // Ensure all pages in the spilled partition's streams are unpinned by invalidating
+  // the streams' read and write iterators. We may need all the memory to process the
+  // next spilled partitions.
+  partition->aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+  partition->unaggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+  spilled_partitions_.push_front(partition);
+}
+
 void PartitionedAggregationNode::ClosePartitions() {
-  for (int i = 0; i < hash_partitions_.size(); ++i) {
-    hash_partitions_[i]->Close(true);
-  }
-  for (list<Partition*>::iterator it = aggregated_partitions_.begin();
-      it != aggregated_partitions_.end(); ++it) {
-    (*it)->Close(true);
-  }
-  for (list<Partition*>::iterator it = spilled_partitions_.begin();
-      it != spilled_partitions_.end(); ++it) {
-    (*it)->Close(true);
+  for (Partition* partition : hash_partitions_) {
+    if (partition != nullptr) partition->Close(true);
   }
+  hash_partitions_.clear();
+  for (Partition* partition : aggregated_partitions_) partition->Close(true);
   aggregated_partitions_.clear();
+  for (Partition* partition : spilled_partitions_) partition->Close(true);
   spilled_partitions_.clear();
-  hash_partitions_.clear();
   memset(hash_tbls_, 0, sizeof(hash_tbls_));
   partition_pool_->Clear();
 }
 
 Status PartitionedAggregationNode::QueryMaintenance(RuntimeState* state) {
   AggFnEvaluator::FreeLocalAllocations(agg_fn_evals_);
-  for (int i = 0; i < hash_partitions_.size(); ++i) {
-    AggFnEvaluator::FreeLocalAllocations(hash_partitions_[i]->agg_fn_evals);
+  for (Partition* partition : hash_partitions_) {
+    if (partition != nullptr) {
+      AggFnEvaluator::FreeLocalAllocations(partition->agg_fn_evals);
+    }
   }
   if (ht_ctx_.get() != nullptr) ht_ctx_->FreeLocalAllocations();
   return ExecNode::QueryMaintenance(state);
@@ -1972,4 +2037,8 @@ Status PartitionedAggregationNode::CodegenProcessBatchStreaming(
   return Status::OK();
 }
 
+// Instantiate required templates.
+template Status PartitionedAggregationNode::AppendSpilledRow<false>(
+    Partition*, TupleRow*);
+template Status PartitionedAggregationNode::AppendSpilledRow<true>(Partition*, TupleRow*);
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-aggregation-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.h b/be/src/exec/partitioned-aggregation-node.h
index 066dc28..4f8b622 100644
--- a/be/src/exec/partitioned-aggregation-node.h
+++ b/be/src/exec/partitioned-aggregation-node.h
@@ -19,13 +19,15 @@
 #ifndef IMPALA_EXEC_PARTITIONED_AGGREGATION_NODE_H
 #define IMPALA_EXEC_PARTITIONED_AGGREGATION_NODE_H
 
+#include <deque>
+
 #include <boost/scoped_ptr.hpp>
 
 #include "exec/exec-node.h"
 #include "exec/hash-table.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.h"
-#include "runtime/descriptors.h"  // for TupleId
+#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/bufferpool/suballocator.h"
+#include "runtime/descriptors.h" // for TupleId
 #include "runtime/mem-pool.h"
 #include "runtime/string-value.h"
 
@@ -229,7 +231,9 @@ class PartitionedAggregationNode : public ExecNode {
   std::vector<int> string_grouping_exprs_;
 
   RuntimeState* state_;
-  BufferedBlockMgr::Client* block_mgr_client_;
+
+  /// Allocator for hash table memory.
+  boost::scoped_ptr<Suballocator> ht_allocator_;
 
   /// MemPool used to allocate memory for when we don't have grouping and don't initialize
   /// the partitioning structures, or during Close() when creating new output tuples.
@@ -337,12 +341,12 @@ class PartitionedAggregationNode : public ExecNode {
   HashTable* hash_tbls_[PARTITION_FANOUT];
 
   /// All partitions that have been spilled and need further processing.
-  std::list<Partition*> spilled_partitions_;
+  std::deque<Partition*> spilled_partitions_;
 
   /// All partitions that are aggregated and can just return the results in GetNext().
   /// After consuming all the input, hash_partitions_ is split into spilled_partitions_
   /// and aggregated_partitions_, depending on if it was spilled or not.
-  std::list<Partition*> aggregated_partitions_;
+  std::deque<Partition*> aggregated_partitions_;
 
   /// END: Members that must be Reset()
   /////////////////////////////////////////
@@ -352,31 +356,42 @@ class PartitionedAggregationNode : public ExecNode {
   /// initially use small buffers. Streaming pre-aggregations do not spill and do not
   /// require an unaggregated stream.
   struct Partition {
-    Partition(PartitionedAggregationNode* parent, int level)
-      : parent(parent), is_closed(false), level(level) {}
+    Partition(PartitionedAggregationNode* parent, int level, int idx)
+      : parent(parent), is_closed(false), level(level), idx(idx) {}
 
     ~Partition();
 
     /// Initializes aggregated_row_stream and unaggregated_row_stream (if a spilling
-    /// aggregation), reserving one buffer for each. The buffers backing these streams
-    /// are reserved, so this function will not fail with a continuable OOM. If we fail
-    /// to init these buffers, the mem limit is too low to run this algorithm.
-    Status InitStreams();
+    /// aggregation), allocating one buffer for each. Spilling merge aggregations must
+    /// have enough reservation for the initial buffer for the stream, so this should
+    /// not fail due to OOM. Preaggregations do not reserve any buffers: if it does not
+    /// have enough reservation for the initial buffer, the aggregated row stream is not
+    /// created and an OK status is returned.
+    Status InitStreams() WARN_UNUSED_RESULT;
 
-    /// Initializes the hash table. Returns false on OOM.
-    bool InitHashTable();
+    /// Initializes the hash table. 'aggregated_row_stream' must be non-NULL.
+    /// Sets 'got_memory' to true if the hash table was initialised or false on OOM.
+    Status InitHashTable(bool* got_memory) WARN_UNUSED_RESULT;
 
     /// Called in case we need to serialize aggregated rows. This step effectively does
     /// a merge aggregation in this node.
-    Status SerializeStreamForSpilling();
+    Status SerializeStreamForSpilling() WARN_UNUSED_RESULT;
 
     /// Closes this partition. If finalize_rows is true, this iterates over all rows
     /// in aggregated_row_stream and finalizes them (this is only used in the cancellation
     /// path).
     void Close(bool finalize_rows);
 
-    /// Spills this partition, unpinning streams and cleaning up hash tables as necessary.
-    Status Spill();
+    /// Spill this partition. 'more_aggregate_rows' = true means that more aggregate rows
+    /// may be appended to the partition before appending unaggregated rows. On
+    /// success, one of the streams is left with a write iterator: the aggregated stream
+    /// if 'more_aggregate_rows' is true or the unaggregated stream otherwise.
+    Status Spill(bool more_aggregate_rows) WARN_UNUSED_RESULT;
+
+    /// Discards the aggregated row stream and hash table. Only valid to call if this is
+    /// a streaming preaggregation and the initial memory allocation for hash tables or
+    /// the aggregated stream failed. The aggregated stream must have 0 rows.
+    void DiscardAggregatedRowStream();
 
     bool is_spilled() const { return hash_tbl.get() == NULL; }
 
@@ -390,9 +405,12 @@ class PartitionedAggregationNode : public ExecNode {
     /// etc.
     const int level;
 
+    /// The index of this partition within 'hash_partitions_' at its level.
+    const int idx;
+
     /// Hash table for this partition.
     /// Can be NULL if this partition is no longer maintaining a hash table (i.e.
-    /// is spilled).
+    /// is spilled or we are passing through all rows for this partition).
     boost::scoped_ptr<HashTable> hash_tbl;
 
     /// Clone of parent's agg_fn_evals_ and backing MemPool.
@@ -401,18 +419,24 @@ class PartitionedAggregationNode : public ExecNode {
 
     /// Tuple stream used to store aggregated rows. When the partition is not spilled,
     /// (meaning the hash table is maintained), this stream is pinned and contains the
-    /// memory referenced by the hash table. When it is spilled, aggregate rows are
-    /// just appended to this stream.
-    boost::scoped_ptr<BufferedTupleStream> aggregated_row_stream;
+    /// memory referenced by the hash table. When it is spilled, this consumes reservation
+    /// for a write buffer only during repartitioning of aggregated rows.
+    ///
+    /// For streaming preaggs, this may be NULL if sufficient memory is not available.
+    /// In that case hash_tbl is also NULL and all rows for the partition will be passed
+    /// through.
+    boost::scoped_ptr<BufferedTupleStreamV2> aggregated_row_stream;
 
     /// Unaggregated rows that are spilled. Always NULL for streaming pre-aggregations.
-    boost::scoped_ptr<BufferedTupleStream> unaggregated_row_stream;
+    /// Always unpinned. Has a write buffer allocated when the partition is spilled and
+    /// unaggregated rows are being processed.
+    boost::scoped_ptr<BufferedTupleStreamV2> unaggregated_row_stream;
   };
 
   /// Stream used to store serialized spilled rows. Only used if needs_serialize_
   /// is set. This stream is never pinned and only used in Partition::Spill as a
   /// a temporary buffer.
-  boost::scoped_ptr<BufferedTupleStream> serialize_stream_;
+  boost::scoped_ptr<BufferedTupleStreamV2> serialize_stream_;
 
   /// Accessor for 'hash_tbls_' that verifies consistency with the partitions.
   HashTable* ALWAYS_INLINE GetHashTable(int partition_idx) {
@@ -447,7 +471,7 @@ class PartitionedAggregationNode : public ExecNode {
   /// FunctionContexts, so is stored outside the stream. If stream's small buffers get
   /// full, it will attempt to switch to IO-buffers.
   Tuple* ConstructIntermediateTuple(const std::vector<AggFnEvaluator*>& agg_fn_evals,
-      BufferedTupleStream* stream, Status* status) noexcept;
+      BufferedTupleStreamV2* stream, Status* status) noexcept;
 
   /// Constructs intermediate tuple, allocating memory from pool instead of the stream.
   /// Returns NULL and sets status if there is not enough memory to allocate the tuple.
@@ -495,7 +519,7 @@ class PartitionedAggregationNode : public ExecNode {
 
   /// Do the aggregation for all tuple rows in the batch when there is no grouping.
   /// This function is replaced by codegen.
-  Status ProcessBatchNoGrouping(RowBatch* batch);
+  Status ProcessBatchNoGrouping(RowBatch* batch) WARN_UNUSED_RESULT;
 
   /// Processes a batch of rows. This is the core function of the algorithm. We partition
   /// the rows into hash_partitions_, spilling as necessary.
@@ -507,9 +531,9 @@ class PartitionedAggregationNode : public ExecNode {
   //
   /// This function is replaced by codegen. We pass in ht_ctx_.get() as an argument for
   /// performance.
-  template<bool AGGREGATED_ROWS>
-  Status IR_ALWAYS_INLINE ProcessBatch(RowBatch* batch,
-      TPrefetchMode::type prefetch_mode, HashTableCtx* ht_ctx);
+  template <bool AGGREGATED_ROWS>
+  Status IR_ALWAYS_INLINE ProcessBatch(RowBatch* batch, TPrefetchMode::type prefetch_mode,
+      HashTableCtx* ht_ctx) WARN_UNUSED_RESULT;
 
   /// Evaluates the rows in 'batch' starting at 'start_row_idx' and stores the results in
   /// the expression values cache in 'ht_ctx'. The number of rows evaluated depends on
@@ -524,7 +548,8 @@ class PartitionedAggregationNode : public ExecNode {
   /// ProcessBatch for codegen to substitute function calls with codegen'd versions.
   /// May spill partitions if not enough memory is available.
   template <bool AGGREGATED_ROWS>
-  Status IR_ALWAYS_INLINE ProcessRow(TupleRow* row, HashTableCtx* ht_ctx);
+  Status IR_ALWAYS_INLINE ProcessRow(
+      TupleRow* row, HashTableCtx* ht_ctx) WARN_UNUSED_RESULT;
 
   /// Create a new intermediate tuple in partition, initialized with row. ht_ctx is
   /// the context for the partition's hash table and hash is the precomputed hash of
@@ -533,35 +558,33 @@ class PartitionedAggregationNode : public ExecNode {
   /// tuple to the partition's stream. Must be inlined into ProcessBatch for codegen
   /// to substitute function calls with codegen'd versions.  insert_it is an iterator
   /// for insertion returned from HashTable::FindBuildRowBucket().
-  template<bool AGGREGATED_ROWS>
-  Status IR_ALWAYS_INLINE AddIntermediateTuple(Partition* partition,
-      TupleRow* row, uint32_t hash, HashTable::Iterator insert_it);
-
-  /// Append a row to a spilled partition. May spill partitions if needed to switch to
-  /// I/O buffers. Selects the correct stream according to the argument. Inlined into
-  /// ProcessBatch().
-  template<bool AGGREGATED_ROWS>
-  Status IR_ALWAYS_INLINE AppendSpilledRow(Partition* partition, TupleRow* row);
+  template <bool AGGREGATED_ROWS>
+  Status IR_ALWAYS_INLINE AddIntermediateTuple(Partition* partition, TupleRow* row,
+      uint32_t hash, HashTable::Iterator insert_it) WARN_UNUSED_RESULT;
 
-  /// Append a row to a stream of a spilled partition. May spill partitions if needed
-  /// to append the row.
-  Status AppendSpilledRow(BufferedTupleStream* stream, TupleRow* row);
+  /// Append a row to a spilled partition. The row may be aggregated or unaggregated
+  /// according to AGGREGATED_ROWS. May spill partitions if needed to append the row
+  /// buffers.
+  template <bool AGGREGATED_ROWS>
+  Status IR_ALWAYS_INLINE AppendSpilledRow(
+      Partition* partition, TupleRow* row) WARN_UNUSED_RESULT;
 
   /// Reads all the rows from input_stream and process them by calling ProcessBatch().
-  template<bool AGGREGATED_ROWS>
-  Status ProcessStream(BufferedTupleStream* input_stream);
+  template <bool AGGREGATED_ROWS>
+  Status ProcessStream(BufferedTupleStreamV2* input_stream) WARN_UNUSED_RESULT;
 
   /// Output 'singleton_output_tuple_' and transfer memory to 'row_batch'.
   void GetSingletonOutput(RowBatch* row_batch);
 
   /// Get rows for the next rowbatch from the next partition. Sets 'partition_eos_' to
   /// true if all rows from all partitions have been returned or the limit is reached.
-  Status GetRowsFromPartition(RuntimeState* state, RowBatch* row_batch);
+  Status GetRowsFromPartition(
+      RuntimeState* state, RowBatch* row_batch) WARN_UNUSED_RESULT;
 
   /// Get output rows from child for streaming pre-aggregation. Aggregates some rows with
   /// hash table and passes through other rows converted into the intermediate
   /// tuple format. Sets 'child_eos_' once all rows from child have been returned.
-  Status GetRowsStreaming(RuntimeState* state, RowBatch* row_batch);
+  Status GetRowsStreaming(RuntimeState* state, RowBatch* row_batch) WARN_UNUSED_RESULT;
 
   /// Return true if we should keep expanding hash tables in the preagg. If false,
   /// the preagg should pass through any rows it can't fit in its tables.
@@ -582,7 +605,7 @@ class PartitionedAggregationNode : public ExecNode {
   /// 'ht_ctx' is passed in as a way to avoid aliasing of 'this' confusing the optimiser.
   Status ProcessBatchStreaming(bool needs_serialize, TPrefetchMode::type prefetch_mode,
       RowBatch* in_batch, RowBatch* out_batch, HashTableCtx* ht_ctx,
-      int remaining_capacity[PARTITION_FANOUT]);
+      int remaining_capacity[PARTITION_FANOUT]) WARN_UNUSED_RESULT;
 
   /// Tries to add intermediate to the hash table 'hash_tbl' of 'partition' for streaming
   /// aggregation. The input row must have been evaluated with 'ht_ctx', with 'hash' set
@@ -592,18 +615,24 @@ class PartitionedAggregationNode : public ExecNode {
   /// keeps track of how many more entries can be added to the hash table so we can avoid
   /// retrying inserts. It is decremented if an insert succeeds and set to zero if an
   /// insert fails. If an error occurs, returns false and sets 'status'.
-  bool IR_ALWAYS_INLINE TryAddToHashTable(HashTableCtx* ht_ctx,
-      Partition* partition, HashTable* hash_tbl, TupleRow* in_row, uint32_t hash,
-      int* remaining_capacity, Status* status);
+  bool IR_ALWAYS_INLINE TryAddToHashTable(HashTableCtx* ht_ctx, Partition* partition,
+      HashTable* hash_tbl, TupleRow* in_row, uint32_t hash, int* remaining_capacity,
+      Status* status) WARN_UNUSED_RESULT;
 
   /// Initializes hash_partitions_. 'level' is the level for the partitions to create.
+  /// If 'single_partition_idx' is provided, it must be a number in range
+  /// [0, PARTITION_FANOUT), and only that partition is created - the others are
+  /// initialized to NULL.
   /// Also sets ht_ctx_'s level to 'level'.
-  Status CreateHashPartitions(int level);
+  Status CreateHashPartitions(
+      int level, int single_partition_idx = -1) WARN_UNUSED_RESULT;
 
   /// Ensure that hash tables for all in-memory partitions are large enough to fit
   /// 'num_rows' additional hash table entries. If there is not enough memory to
-  /// resize the hash tables, may spill partitions.
-  Status CheckAndResizeHashPartitions(int num_rows, const HashTableCtx* ht_ctx);
+  /// resize the hash tables, may spill partitions. 'aggregated_rows' is true if
+  /// we're currently partitioning aggregated rows.
+  Status CheckAndResizeHashPartitions(
+      bool aggregated_rows, int num_rows, const HashTableCtx* ht_ctx) WARN_UNUSED_RESULT;
 
   /// Iterates over all the partitions in hash_partitions_ and returns the number of rows
   /// of the largest spilled partition (in terms of number of aggregated and unaggregated
@@ -614,16 +643,39 @@ class PartitionedAggregationNode : public ExecNode {
   /// initializes output_iterator_ and output_partition_. This either removes
   /// a partition from aggregated_partitions_ (and is done) or removes the next
   /// partition from aggregated_partitions_ and repartitions it.
-  Status NextPartition();
-
-  /// Picks a partition from hash_partitions_ to spill.
-  Status SpillPartition();
+  Status NextPartition() WARN_UNUSED_RESULT;
+
+  /// Tries to build the first partition in 'spilled_partitions_'.
+  /// If successful, set *built_partition to the partition. The caller owns the partition
+  /// and is responsible for closing it. If unsuccessful because the partition could not
+  /// fit in memory, set *built_partition to NULL and append the spilled partition to the
+  /// head of 'spilled_partitions_' so it can be processed by
+  /// RepartitionSpilledPartition().
+  Status BuildSpilledPartition(Partition** built_partition) WARN_UNUSED_RESULT;
+
+  /// Repartitions the first partition in 'spilled_partitions_' into PARTITION_FANOUT
+  /// output partitions. On success, each output partition is either:
+  /// * closed, if no rows were added to the partition.
+  /// * in 'spilled_partitions_', if the partition spilled.
+  /// * in 'aggregated_partitions_', if the output partition was not spilled.
+  Status RepartitionSpilledPartition() WARN_UNUSED_RESULT;
+
+  /// Picks a partition from 'hash_partitions_' to spill. 'more_aggregate_rows' is passed
+  /// to Partition::Spill() when spilling the partition. See the Partition::Spill()
+  /// comment for further explanation.
+  Status SpillPartition(bool more_aggregate_rows) WARN_UNUSED_RESULT;
 
   /// Moves the partitions in hash_partitions_ to aggregated_partitions_ or
   /// spilled_partitions_. Partitions moved to spilled_partitions_ are unpinned.
   /// input_rows is the number of input rows that have been repartitioned.
   /// Used for diagnostics.
-  Status MoveHashPartitions(int64_t input_rows);
+  Status MoveHashPartitions(int64_t input_rows) WARN_UNUSED_RESULT;
+
+  /// Adds a partition to the front of 'spilled_partitions_' for later processing.
+  /// 'spilled_partitions_' uses LIFO so more finely partitioned partitions are processed
+  /// first. This allows us to delete pages earlier and bottom out the recursion
+  /// earlier and also improves time locality of access to spilled data on disk.
+  void PushSpilledPartition(Partition* partition);
 
   /// Calls Close() on every Partition in 'aggregated_partitions_',
   /// 'spilled_partitions_', and 'hash_partitions_' and then resets the lists,
@@ -638,7 +690,7 @@ class PartitionedAggregationNode : public ExecNode {
   /// and returns the IR function in 'fn'. Returns non-OK status if codegen
   /// is unsuccessful.
   Status CodegenUpdateSlot(LlvmCodeGen* codegen, int agg_fn_idx,
-      SlotDescriptor* slot_desc, llvm::Function** fn);
+      SlotDescriptor* slot_desc, llvm::Function** fn) WARN_UNUSED_RESULT;
 
   /// Codegen a call to a function implementing the UDA interface with input values
   /// from 'input_vals'. 'dst_val' should contain the previous value of the aggregate
@@ -647,10 +699,10 @@ class PartitionedAggregationNode : public ExecNode {
   /// the insert position of 'builder'.
   Status CodegenCallUda(LlvmCodeGen* codegen, LlvmBuilder* builder, AggFn* agg_fn,
       llvm::Value* agg_fn_ctx_arg, const std::vector<CodegenAnyVal>& input_vals,
-      const CodegenAnyVal& dst_val, CodegenAnyVal* updated_dst_val);
+      const CodegenAnyVal& dst_val, CodegenAnyVal* updated_dst_val) WARN_UNUSED_RESULT;
 
   /// Codegen UpdateTuple(). Returns non-OK status if codegen is unsuccessful.
-  Status CodegenUpdateTuple(LlvmCodeGen* codegen, llvm::Function** fn);
+  Status CodegenUpdateTuple(LlvmCodeGen* codegen, llvm::Function** fn) WARN_UNUSED_RESULT;
 
   /// Codegen the non-streaming process row batch loop. The loop has already been
   /// compiled to IR and loaded into the codegen object. UpdateAggTuple has also been
@@ -659,26 +711,28 @@ class PartitionedAggregationNode : public ExecNode {
   /// 'process_batch_no_grouping_fn_' will be updated with the codegened function
   /// depending on whether this is a grouping or non-grouping aggregation.
   /// Assumes AGGREGATED_ROWS = false.
-  Status CodegenProcessBatch(LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode);
+  Status CodegenProcessBatch(
+      LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode) WARN_UNUSED_RESULT;
 
   /// Codegen the materialization loop for streaming preaggregations.
   /// 'process_batch_streaming_fn_' will be updated with the codegened function.
   Status CodegenProcessBatchStreaming(
-      LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode);
+      LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode) WARN_UNUSED_RESULT;
 
-  /// We need two buffers per partition, one for the aggregated stream and one
-  /// for the unaggregated stream. We need an additional buffer to read the stream
-  /// we are currently repartitioning.
+  /// Compute minimum buffer requirement for grouping aggregations.
+  /// We need one buffer per partition, which is used either as the write buffer for the
+  /// aggregated stream or the unaggregated stream. We need an additional buffer to read
+  /// the stream we are currently repartitioning.
   /// If we need to serialize, we need an additional buffer while spilling a partition
   /// as the partitions aggregate stream needs to be serialized and rewritten.
   /// We do not spill streaming preaggregations, so we do not need to reserve any buffers.
   int MinRequiredBuffers() const {
-    // Must be kept in sync with AggregationNode.computeResourceProfile() in fe.
-    if (is_streaming_preagg_) return 0;
-    return 2 * PARTITION_FANOUT + 1 + (needs_serialize_ ? 1 : 0);
+    DCHECK(!grouping_exprs_.empty());
+    // Must be kept in sync with AggregationNode.computeNodeResourceProfile() in fe.
+    if (is_streaming_preagg_) return 0; // Need 0 buffers to pass through rows.
+    return PARTITION_FANOUT + 1 + (needs_serialize_ ? 1 : 0);
   }
 };
-
 }
 
 #endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-builder-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder-ir.cc b/be/src/exec/partitioned-hash-join-builder-ir.cc
index e5f649e..df58036 100644
--- a/be/src/exec/partitioned-hash-join-builder-ir.cc
+++ b/be/src/exec/partitioned-hash-join-builder-ir.cc
@@ -19,7 +19,7 @@
 
 #include "codegen/impala-ir.h"
 #include "exec/hash-table.inline.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
 #include "runtime/raw-value.inline.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-filter.h"
@@ -30,7 +30,7 @@
 using namespace impala;
 
 inline bool PhjBuilder::AppendRow(
-    BufferedTupleStream* stream, TupleRow* row, Status* status) {
+    BufferedTupleStreamV2* stream, TupleRow* row, Status* status) {
   if (LIKELY(stream->AddRow(row, status))) return true;
   if (UNLIKELY(!status->ok())) return false;
   return AppendRowStreamFull(stream, row, status);
@@ -73,12 +73,12 @@ Status PhjBuilder::ProcessBuildBatch(
 
 bool PhjBuilder::Partition::InsertBatch(TPrefetchMode::type prefetch_mode,
     HashTableCtx* ht_ctx, RowBatch* batch,
-    const vector<BufferedTupleStream::RowIdx>& indices) {
+    const vector<BufferedTupleStreamV2::FlatRowPtr>& flat_rows, Status* status) {
   // Compute the hash values and prefetch the hash table buckets.
   const int num_rows = batch->num_rows();
   HashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache();
   const int prefetch_size = expr_vals_cache->capacity();
-  const BufferedTupleStream::RowIdx* row_indices = indices.data();
+  const BufferedTupleStreamV2::FlatRowPtr* flat_rows_data = flat_rows.data();
   for (int prefetch_group_row = 0; prefetch_group_row < num_rows;
        prefetch_group_row += prefetch_size) {
     int cur_row = prefetch_group_row;
@@ -97,9 +97,9 @@ bool PhjBuilder::Partition::InsertBatch(TPrefetchMode::type prefetch_mode,
     expr_vals_cache->ResetForRead();
     FOREACH_ROW_LIMIT(batch, cur_row, prefetch_size, batch_iter) {
       TupleRow* row = batch_iter.Get();
-      BufferedTupleStream::RowIdx row_idx = row_indices[cur_row];
+      BufferedTupleStreamV2::FlatRowPtr flat_row = flat_rows_data[cur_row];
       if (!expr_vals_cache->IsRowNull()
-          && UNLIKELY(!hash_tbl_->Insert(ht_ctx, row_idx, row))) {
+          && UNLIKELY(!hash_tbl_->Insert(ht_ctx, flat_row, row, status))) {
         return false;
       }
       expr_vals_cache->NextRow();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-builder.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder.cc b/be/src/exec/partitioned-hash-join-builder.cc
index 4a5885b..a2f7c96 100644
--- a/be/src/exec/partitioned-hash-join-builder.cc
+++ b/be/src/exec/partitioned-hash-join-builder.cc
@@ -25,8 +25,10 @@
 #include "exec/hash-table.inline.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-tuple-stream.h"
+#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/exec-env.h"
 #include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-filter-bank.h"
 #include "runtime/runtime-filter.h"
@@ -44,19 +46,23 @@ static const string PREPARE_FOR_READ_FAILED_ERROR_MSG =
     "the memory limit may help this query to complete successfully.";
 
 using namespace impala;
-using namespace llvm;
-using namespace strings;
-using std::unique_ptr;
+using llvm::ConstantInt;
+using llvm::Function;
+using llvm::Type;
+using llvm::Value;
+using strings::Substitute;
 
 PhjBuilder::PhjBuilder(int join_node_id, TJoinOp::type join_op,
     const RowDescriptor* probe_row_desc, const RowDescriptor* build_row_desc,
-    RuntimeState* state)
+    RuntimeState* state, BufferPool::ClientHandle* buffer_pool_client,
+    int64_t spillable_buffer_size)
   : DataSink(build_row_desc),
     runtime_state_(state),
     join_node_id_(join_node_id),
     join_op_(join_op),
     probe_row_desc_(probe_row_desc),
-    block_mgr_client_(NULL),
+    buffer_pool_client_(buffer_pool_client),
+    spillable_buffer_size_(spillable_buffer_size),
     non_empty_build_(false),
     partitions_created_(NULL),
     largest_partition_percent_(NULL),
@@ -137,9 +143,6 @@ Status PhjBuilder::Prepare(RuntimeState* state, MemTracker* parent_mem_tracker)
     RETURN_IF_ERROR(ScalarExprEvaluator::Create(*filter_exprs_[i], state, &pool_,
         expr_mem_pool(), &filter_ctxs_[i].expr_eval));
   }
-  RETURN_IF_ERROR(state->block_mgr()->RegisterClient(
-      Substitute("PartitionedHashJoin id=$0 builder=$1", join_node_id_, this),
-      MinRequiredBuffers(), true, mem_tracker_.get(), state, &block_mgr_client_));
 
   partitions_created_ = ADD_COUNTER(profile(), "PartitionsCreated", TUnit::UNIT);
   largest_partition_percent_ =
@@ -169,6 +172,11 @@ Status PhjBuilder::Open(RuntimeState* state) {
   for (const FilterContext& ctx : filter_ctxs_) {
     RETURN_IF_ERROR(ctx.expr_eval->Open(state));
   }
+  if (ht_allocator_ == nullptr) {
+    // Create 'ht_allocator_' on the first call to Open().
+    ht_allocator_.reset(new Suballocator(
+        state->exec_env()->buffer_pool(), buffer_pool_client_, spillable_buffer_size_));
+  }
   RETURN_IF_ERROR(CreateHashPartitions(0));
   AllocateRuntimeFilters();
 
@@ -248,7 +256,6 @@ void PhjBuilder::Close(RuntimeState* state) {
     if (ctx.expr_eval != nullptr) ctx.expr_eval->Close(state);
   }
   ScalarExpr::Close(filter_exprs_);
-  if (block_mgr_client_ != NULL) state->block_mgr()->ClearReservations(block_mgr_client_);
   ScalarExpr::Close(build_exprs_);
   pool_.Clear();
   DataSink::Close(state);
@@ -264,13 +271,11 @@ void PhjBuilder::Reset() {
 Status PhjBuilder::CreateAndPreparePartition(int level, Partition** partition) {
   all_partitions_.emplace_back(new Partition(runtime_state_, this, level));
   *partition = all_partitions_.back().get();
-  RETURN_IF_ERROR((*partition)->build_rows()->Init(join_node_id_, profile(), true));
+  RETURN_IF_ERROR((*partition)->build_rows()->Init(join_node_id_, true));
   bool got_buffer;
   RETURN_IF_ERROR((*partition)->build_rows()->PrepareForWrite(&got_buffer));
-  if (!got_buffer) {
-    return runtime_state_->block_mgr()->MemLimitTooLowError(
-        block_mgr_client_, join_node_id_);
-  }
+  DCHECK(got_buffer)
+      << "Accounted in min reservation" << buffer_pool_client_->DebugString();
   return Status::OK();
 }
 
@@ -288,22 +293,11 @@ Status PhjBuilder::CreateHashPartitions(int level) {
 }
 
 bool PhjBuilder::AppendRowStreamFull(
-    BufferedTupleStream* stream, TupleRow* row, Status* status) noexcept {
+    BufferedTupleStreamV2* stream, TupleRow* row, Status* status) noexcept {
   while (true) {
-    // Check if the stream is still using small buffers and try to switch to IO-buffers.
-    if (stream->using_small_buffers()) {
-      bool got_buffer;
-      *status = stream->SwitchToIoBuffers(&got_buffer);
-      if (!status->ok()) return false;
-
-      if (got_buffer) {
-        if (LIKELY(stream->AddRow(row, status))) return true;
-        if (!status->ok()) return false;
-      }
-    }
     // We ran out of memory. Pick a partition to spill. If we ran out of unspilled
     // partitions, SpillPartition() will return an error status.
-    *status = SpillPartition(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
+    *status = SpillPartition(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
     if (!status->ok()) return false;
     if (stream->AddRow(row, status)) return true;
     if (!status->ok()) return false;
@@ -313,7 +307,7 @@ bool PhjBuilder::AppendRowStreamFull(
 }
 
 // TODO: can we do better with a different spilling heuristic?
-Status PhjBuilder::SpillPartition(BufferedTupleStream::UnpinMode mode) {
+Status PhjBuilder::SpillPartition(BufferedTupleStreamV2::UnpinMode mode) {
   DCHECK_EQ(hash_partitions_.size(), PARTITION_FANOUT);
   int64_t max_freed_mem = 0;
   int partition_idx = -1;
@@ -323,7 +317,7 @@ Status PhjBuilder::SpillPartition(BufferedTupleStream::UnpinMode mode) {
     Partition* candidate = hash_partitions_[i];
     if (candidate->IsClosed()) continue;
     if (candidate->is_spilled()) continue;
-    int64_t mem = candidate->build_rows()->bytes_in_mem(false);
+    int64_t mem = candidate->build_rows()->BytesPinned(false);
     if (candidate->hash_tbl() != NULL) {
       // The hash table should not have matches, since we have not probed it yet.
       // Losing match info would lead to incorrect results (IMPALA-1488).
@@ -337,9 +331,9 @@ Status PhjBuilder::SpillPartition(BufferedTupleStream::UnpinMode mode) {
   }
 
   if (partition_idx == -1) {
-    // Could not find a partition to spill. This means the mem limit was just too low.
-    return runtime_state_->block_mgr()->MemLimitTooLowError(
-        block_mgr_client_, join_node_id_);
    return Status(Substitute("Internal error: could not find a partition to spill in "
                             "hash join $0: \n$1\nClient:\n$2",
        join_node_id_, DebugString(), buffer_pool_client_->DebugString()));
   }
 
   VLOG(2) << "Spilling partition: " << partition_idx << endl << DebugString();
@@ -373,8 +367,7 @@ Status PhjBuilder::BuildHashTablesAndPrepareProbeStreams() {
       partition->Close(NULL);
     } else if (partition->is_spilled()) {
       // We don't need any build-side data for spilled partitions in memory.
-      RETURN_IF_ERROR(
-          partition->build_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL));
+      partition->build_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
     }
   }
 
@@ -393,7 +386,7 @@ Status PhjBuilder::BuildHashTablesAndPrepareProbeStreams() {
     RETURN_IF_ERROR(partition->BuildHashTable(&built));
     // If we did not have enough memory to build this hash table, we need to spill this
     // partition (clean up the hash table, unpin build).
-    if (!built) RETURN_IF_ERROR(partition->Spill(BufferedTupleStream::UNPIN_ALL));
+    if (!built) RETURN_IF_ERROR(partition->Spill(BufferedTupleStreamV2::UNPIN_ALL));
   }
 
   // We may have spilled additional partitions while building hash tables, we need to
@@ -429,11 +422,11 @@ Status PhjBuilder::InitSpilledPartitionProbeStreams() {
 
   while (probe_streams_to_create > 0) {
     // Create stream in vector, so that it will be cleaned up after any failure.
-    spilled_partition_probe_streams_.emplace_back(std::make_unique<BufferedTupleStream>(
-        runtime_state_, probe_row_desc_, runtime_state_->block_mgr(), block_mgr_client_,
-        false /* use_initial_small_buffers */, false /* read_write */));
-    BufferedTupleStream* probe_stream = spilled_partition_probe_streams_.back().get();
-    RETURN_IF_ERROR(probe_stream->Init(join_node_id_, profile(), false));
+    spilled_partition_probe_streams_.emplace_back(
+        make_unique<BufferedTupleStreamV2>(runtime_state_, probe_row_desc_,
+            buffer_pool_client_, spillable_buffer_size_, spillable_buffer_size_));
+    BufferedTupleStreamV2* probe_stream = spilled_partition_probe_streams_.back().get();
+    RETURN_IF_ERROR(probe_stream->Init(join_node_id_, false));
 
     // Loop until either the stream gets a buffer or all partitions are spilled (in which
     // case SpillPartition() returns an error).
@@ -442,7 +435,7 @@ Status PhjBuilder::InitSpilledPartitionProbeStreams() {
       RETURN_IF_ERROR(probe_stream->PrepareForWrite(&got_buffer));
       if (got_buffer) break;
 
-      RETURN_IF_ERROR(SpillPartition(BufferedTupleStream::UNPIN_ALL));
+      RETURN_IF_ERROR(SpillPartition(BufferedTupleStreamV2::UNPIN_ALL));
       ++probe_streams_to_create;
     }
     --probe_streams_to_create;
@@ -450,7 +443,7 @@ Status PhjBuilder::InitSpilledPartitionProbeStreams() {
   return Status::OK();
 }
 
-vector<unique_ptr<BufferedTupleStream>> PhjBuilder::TransferProbeStreams() {
+vector<unique_ptr<BufferedTupleStreamV2>> PhjBuilder::TransferProbeStreams() {
   return std::move(spilled_partition_probe_streams_);
 }
 
@@ -460,7 +453,7 @@ void PhjBuilder::CloseAndDeletePartitions() {
   all_partitions_.clear();
   hash_partitions_.clear();
   null_aware_partition_ = NULL;
-  for (unique_ptr<BufferedTupleStream>& stream : spilled_partition_probe_streams_) {
+  for (unique_ptr<BufferedTupleStreamV2>& stream : spilled_partition_probe_streams_) {
     stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
   }
   spilled_partition_probe_streams_.clear();
@@ -512,14 +505,14 @@ void PhjBuilder::PublishRuntimeFilters(int64_t num_build_rows) {
 }
 
 Status PhjBuilder::RepartitionBuildInput(
-    Partition* input_partition, int level, BufferedTupleStream* input_probe_rows) {
+    Partition* input_partition, int level, BufferedTupleStreamV2* input_probe_rows) {
   DCHECK_GE(level, 1);
   SCOPED_TIMER(repartition_timer_);
   COUNTER_ADD(num_repartitions_, 1);
   RuntimeState* state = runtime_state_;
 
   // Setup the read buffer and the new partitions.
-  BufferedTupleStream* build_rows = input_partition->build_rows();
+  BufferedTupleStreamV2* build_rows = input_partition->build_rows();
   DCHECK(build_rows != NULL);
   bool got_read_buffer;
   RETURN_IF_ERROR(build_rows->PrepareForRead(true, &got_read_buffer));
@@ -552,7 +545,7 @@ Status PhjBuilder::RepartitionBuildInput(
     bool got_buffer;
     RETURN_IF_ERROR(input_probe_rows->PrepareForRead(true, &got_buffer));
     if (got_buffer) break;
-    RETURN_IF_ERROR(SpillPartition(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
+    RETURN_IF_ERROR(SpillPartition(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT));
   }
 
   RETURN_IF_ERROR(FlushFinal(state));
@@ -580,12 +573,9 @@ bool PhjBuilder::HashTableStoresNulls() const {
 
 PhjBuilder::Partition::Partition(RuntimeState* state, PhjBuilder* parent, int level)
   : parent_(parent), is_spilled_(false), level_(level) {
-  // If we're repartitioning, we can assume the build input is fairly large and small
-  // buffers will most likely just waste memory.
-  bool use_initial_small_buffers = level == 0;
-  build_rows_ =
-      std::make_unique<BufferedTupleStream>(state, parent_->row_desc_, state->block_mgr(),
-          parent_->block_mgr_client_, use_initial_small_buffers, false /* read_write */);
+  build_rows_ = make_unique<BufferedTupleStreamV2>(state, parent_->row_desc_,
+      parent_->buffer_pool_client_, parent->spillable_buffer_size_,
+      parent->spillable_buffer_size_);
 }
 
 PhjBuilder::Partition::~Partition() {
@@ -612,30 +602,15 @@ void PhjBuilder::Partition::Close(RowBatch* batch) {
   }
 }
 
-Status PhjBuilder::Partition::Spill(BufferedTupleStream::UnpinMode mode) {
+Status PhjBuilder::Partition::Spill(BufferedTupleStreamV2::UnpinMode mode) {
   DCHECK(!IsClosed());
-  // Close the hash table as soon as possible to release memory.
+  RETURN_IF_ERROR(parent_->runtime_state_->StartSpilling(parent_->mem_tracker()));
+  // Close the hash table and unpin the stream backing it to free memory.
   if (hash_tbl() != NULL) {
     hash_tbl_->Close();
     hash_tbl_.reset();
   }
-
-  // Unpin the stream as soon as possible to increase the chances that the
-  // SwitchToIoBuffers() call below will succeed.
-  RETURN_IF_ERROR(build_rows_->UnpinStream(mode));
-
-  if (build_rows_->using_small_buffers()) {
-    bool got_buffer;
-    RETURN_IF_ERROR(build_rows_->SwitchToIoBuffers(&got_buffer));
-    if (!got_buffer) {
-      // We'll try again to get the buffers when the stream fills up the small buffers.
-      VLOG_QUERY << "Not enough memory to switch to IO-sized buffer for partition "
-                 << this << " of join=" << parent_->join_node_id_
-                 << " build small buffers=" << build_rows_->using_small_buffers();
-      VLOG_FILE << GetStackTrace();
-    }
-  }
-
+  build_rows_->UnpinStream(mode);
   if (!is_spilled_) {
     COUNTER_ADD(parent_->num_spilled_partitions_, 1);
     if (parent_->num_spilled_partitions_->value() == 1) {
@@ -652,14 +627,14 @@ Status PhjBuilder::Partition::BuildHashTable(bool* built) {
   *built = false;
 
   // Before building the hash table, we need to pin the rows in memory.
-  RETURN_IF_ERROR(build_rows_->PinStream(false, built));
+  RETURN_IF_ERROR(build_rows_->PinStream(built));
   if (!*built) return Status::OK();
 
   RuntimeState* state = parent_->runtime_state_;
   HashTableCtx* ctx = parent_->ht_ctx_.get();
   ctx->set_level(level()); // Set the hash function for building the hash table.
   RowBatch batch(parent_->row_desc_, state->batch_size(), parent_->mem_tracker());
-  vector<BufferedTupleStream::RowIdx> indices;
+  vector<BufferedTupleStreamV2::FlatRowPtr> flat_rows;
   bool eos = false;
 
   // Allocate the partition-local hash table. Initialize the number of buckets based on
@@ -674,22 +649,22 @@ Status PhjBuilder::Partition::BuildHashTable(bool* built) {
   //
   // TODO: Try to allocate the hash table before pinning the stream to avoid needlessly
   // reading all of the spilled rows from disk when we won't succeed anyway.
-  int64_t estimated_num_buckets = build_rows()->RowConsumesMemory() ?
-      HashTable::EstimateNumBuckets(build_rows()->num_rows()) :
-      state->batch_size() * 2;
-  hash_tbl_.reset(HashTable::Create(state, parent_->block_mgr_client_,
+  int64_t estimated_num_buckets = HashTable::EstimateNumBuckets(build_rows()->num_rows());
+  hash_tbl_.reset(HashTable::Create(parent_->ht_allocator_.get(),
       true /* store_duplicates */, parent_->row_desc_->tuple_descriptors().size(),
       build_rows(), 1 << (32 - NUM_PARTITIONING_BITS), estimated_num_buckets));
-  if (!hash_tbl_->Init()) goto not_built;
+  bool success;
+  Status status = hash_tbl_->Init(&success);
+  if (!status.ok() || !success) goto not_built;
+  status = build_rows_->PrepareForRead(false, &success);
+  if (!status.ok()) goto not_built;
+  DCHECK(success) << "Stream was already pinned.";
 
-  bool got_read_buffer;
-  RETURN_IF_ERROR(build_rows_->PrepareForRead(false, &got_read_buffer));
-  DCHECK(got_read_buffer) << "Stream was already pinned.";
   do {
-    RETURN_IF_ERROR(build_rows_->GetNext(&batch, &eos, &indices));
-    DCHECK_EQ(batch.num_rows(), indices.size());
-    DCHECK_LE(batch.num_rows(), hash_tbl_->EmptyBuckets())
-        << build_rows()->RowConsumesMemory();
+    status = build_rows_->GetNext(&batch, &eos, &flat_rows);
+    if (!status.ok()) goto not_built;
+    DCHECK_EQ(batch.num_rows(), flat_rows.size());
+    DCHECK_LE(batch.num_rows(), hash_tbl_->EmptyBuckets());
     TPrefetchMode::type prefetch_mode = state->query_options().prefetch_mode;
     if (parent_->insert_batch_fn_ != NULL) {
       InsertBatchFn insert_batch_fn;
@@ -699,11 +674,12 @@ Status PhjBuilder::Partition::BuildHashTable(bool* built) {
         insert_batch_fn = parent_->insert_batch_fn_;
       }
       DCHECK(insert_batch_fn != NULL);
-      if (UNLIKELY(!insert_batch_fn(this, prefetch_mode, ctx, &batch, indices))) {
+      if (UNLIKELY(
+              !insert_batch_fn(this, prefetch_mode, ctx, &batch, flat_rows, &status))) {
         goto not_built;
       }
-    } else {
-      if (UNLIKELY(!InsertBatch(prefetch_mode, ctx, &batch, indices))) goto not_built;
+    } else if (UNLIKELY(!InsertBatch(prefetch_mode, ctx, &batch, flat_rows, &status))) {
+      goto not_built;
     }
     RETURN_IF_CANCELLED(state);
     RETURN_IF_ERROR(state->GetQueryStatus());
@@ -725,7 +701,7 @@ not_built:
     hash_tbl_->Close();
     hash_tbl_.reset();
   }
-  return Status::OK();
+  return status;
 }
 
 void PhjBuilder::Codegen(LlvmCodeGen* codegen) {
@@ -774,7 +750,8 @@ string PhjBuilder::DebugString() const {
     DCHECK(partition->build_rows() != NULL);
     ss << endl
        << "    Build Rows: " << partition->build_rows()->num_rows()
-       << " (Blocks pinned: " << partition->build_rows()->blocks_pinned() << ")" << endl;
+       << " (Bytes pinned: " << partition->build_rows()->BytesPinned(false) << ")"
+       << endl;
     if (partition->hash_tbl() != NULL) {
       ss << "    Hash Table Rows: " << partition->hash_tbl()->size() << endl;
     }


[08/11] incubator-impala git commit: IMPALA-4674: Part 2: port backend exec to BufferPool

Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-block-mgr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-block-mgr-test.cc b/be/src/runtime/buffered-block-mgr-test.cc
deleted file mode 100644
index cb294c2..0000000
--- a/be/src/runtime/buffered-block-mgr-test.cc
+++ /dev/null
@@ -1,1547 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <boost/bind.hpp>
-#include <boost/date_time/posix_time/posix_time.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/regex.hpp>
-#include <boost/scoped_ptr.hpp>
-#include <boost/thread/thread.hpp>
-#include <gutil/strings/substitute.h>
-#include <sys/stat.h>
-
-#include "codegen/llvm-codegen.h"
-#include "common/init.h"
-#include "common/object-pool.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/disk-io-mgr.h"
-#include "runtime/exec-env.h"
-#include "runtime/mem-tracker.h"
-#include "runtime/query-state.h"
-#include "runtime/runtime-state.h"
-#include "runtime/test-env.h"
-#include "runtime/tmp-file-mgr.h"
-#include "service/fe-support.h"
-#include "testutil/gtest-util.h"
-#include "util/cpu-info.h"
-#include "util/disk-info.h"
-#include "util/error-util.h"
-#include "util/filesystem-util.h"
-#include "util/promise.h"
-#include "util/test-info.h"
-#include "util/time.h"
-
-#include "gen-cpp/Types_types.h"
-#include "gen-cpp/ImpalaInternalService_types.h"
-
-#include "common/names.h"
-
-using boost::filesystem::directory_iterator;
-using boost::filesystem::remove;
-using boost::regex;
-
-// Note: This is the default scratch dir created by impala.
-// FLAGS_scratch_dirs + TmpFileMgr::TMP_SUB_DIR_NAME.
-const string SCRATCH_DIR = "/tmp/impala-scratch";
-
-// This suffix is appended to a tmp dir
-const string SCRATCH_SUFFIX = "/impala-scratch";
-
-// Number of millieconds to wait to ensure write completes. We don't know for sure how
-// slow the disk will be, so this is much higher than we expect the writes to take.
-const static int WRITE_WAIT_MILLIS = 10000;
-
-// How often to check for write completion
-const static int WRITE_CHECK_INTERVAL_MILLIS = 10;
-
-DECLARE_bool(disk_spill_encryption);
-
-namespace impala {
-
-class BufferedBlockMgrTest : public ::testing::Test {
- protected:
-  const static int block_size_ = 1024;
-
-  virtual void SetUp() {
-    test_env_.reset(new TestEnv());
-    ASSERT_OK(test_env_->Init());
-  }
-
-  virtual void TearDown() {
-    TearDownMgrs();
-    test_env_.reset();
-
-    // Tests modify permissions, so make sure we can delete if they didn't clean up.
-    for (int i = 0; i < created_tmp_dirs_.size(); ++i) {
-      chmod((created_tmp_dirs_[i] + SCRATCH_SUFFIX).c_str(), S_IRWXU);
-    }
-    FileSystemUtil::RemovePaths(created_tmp_dirs_);
-    created_tmp_dirs_.clear();
-    pool_.Clear();
-  }
-
-  /// Reinitialize test_env_ to have multiple temporary directories.
-  vector<string> InitMultipleTmpDirs(int num_dirs) {
-    vector<string> tmp_dirs;
-    for (int i = 0; i < num_dirs; ++i) {
-      const string& dir = Substitute("/tmp/buffered-block-mgr-test.$0", i);
-      // Fix permissions in case old directories were left from previous runs of test.
-      chmod((dir + SCRATCH_SUFFIX).c_str(), S_IRWXU);
-      EXPECT_OK(FileSystemUtil::RemoveAndCreateDirectory(dir));
-      tmp_dirs.push_back(dir);
-      created_tmp_dirs_.push_back(dir);
-    }
-    test_env_.reset(new TestEnv);
-    test_env_->SetTmpFileMgrArgs(tmp_dirs, false);
-    EXPECT_OK(test_env_->Init());
-    EXPECT_EQ(num_dirs, test_env_->tmp_file_mgr()->NumActiveTmpDevices());
-    return tmp_dirs;
-  }
-
-  static void ValidateBlock(BufferedBlockMgr::Block* block, int32_t data) {
-    ASSERT_EQ(block->valid_data_len(), sizeof(int32_t));
-    ASSERT_EQ(*reinterpret_cast<int32_t*>(block->buffer()), data);
-  }
-
-  static int32_t* MakeRandomSizeData(BufferedBlockMgr::Block* block) {
-    // Format is int32_t size, followed by size bytes of data
-    int32_t size = (rand() % 252) + 4; // So blocks have 4-256 bytes of data
-    uint8_t* data = block->Allocate<uint8_t>(size);
-    *(reinterpret_cast<int32_t*>(data)) = size;
-    int i;
-    for (i = 4; i < size-5; ++i) {
-      data[i] = i;
-    }
-    for (; i < size; ++i) {  // End marker of at least 5 0xff's
-      data[i] = 0xff;
-    }
-    return reinterpret_cast<int32_t*>(data);  // Really returns a pointer to size
-  }
-
-  static void ValidateRandomSizeData(BufferedBlockMgr::Block* block, int32_t size) {
-    int32_t bsize = *(reinterpret_cast<int32_t*>(block->buffer()));
-    uint8_t* data = reinterpret_cast<uint8_t*>(block->buffer());
-    int i;
-    ASSERT_EQ(block->valid_data_len(), size);
-    ASSERT_EQ(size, bsize);
-    for (i = 4; i < size - 5; ++i) {
-      ASSERT_EQ(data[i], i);
-    }
-    for (; i < size; ++i) {
-      ASSERT_EQ(data[i], 0xff);
-    }
-  }
-
-  /// Helper to create a simple block manager.
-  BufferedBlockMgr* CreateMgr(int64_t query_id, int max_buffers, int block_size,
-      RuntimeState** query_state = NULL, TQueryOptions* query_options = NULL) {
-    RuntimeState* state;
-    EXPECT_OK(test_env_->CreateQueryStateWithBlockMgr(
-        query_id, max_buffers, block_size, query_options, &state));
-    if (query_state != NULL) *query_state = state;
-    return state->block_mgr();
-  }
-
-  /// Create a new client tracker as a child of the RuntimeState's instance tracker.
-  MemTracker* NewClientTracker(RuntimeState* state) {
-    return pool_.Add(new MemTracker(-1, "client", state->instance_mem_tracker()));
-  }
-
-  BufferedBlockMgr* CreateMgrAndClient(int64_t query_id, int max_buffers, int block_size,
-      int reserved_blocks, bool tolerates_oversubscription,
-      BufferedBlockMgr::Client** client, RuntimeState** query_state = NULL,
-      TQueryOptions* query_options = NULL) {
-    RuntimeState* state;
-    BufferedBlockMgr* mgr =
-        CreateMgr(query_id, max_buffers, block_size, &state, query_options);
-
-    MemTracker* client_tracker = NewClientTracker(state);
-    EXPECT_OK(mgr->RegisterClient(Substitute("Client for query $0", query_id),
-        reserved_blocks, tolerates_oversubscription, client_tracker, state, client));
-    EXPECT_TRUE(client != NULL);
-    if (query_state != NULL) *query_state = state;
-    return mgr;
-  }
-
-  void CreateMgrsAndClients(int64_t start_query_id, int num_mgrs, int buffers_per_mgr,
-      int block_size, int reserved_blocks_per_client, bool tolerates_oversubscription,
-      vector<BufferedBlockMgr*>* mgrs, vector<BufferedBlockMgr::Client*>* clients) {
-    for (int i = 0; i < num_mgrs; ++i) {
-      BufferedBlockMgr::Client* client;
-      BufferedBlockMgr* mgr = CreateMgrAndClient(start_query_id + i, buffers_per_mgr,
-          block_size_, reserved_blocks_per_client, tolerates_oversubscription, &client);
-      mgrs->push_back(mgr);
-      clients->push_back(client);
-    }
-  }
-
-  // Destroy all created query states and associated block managers.
-  void TearDownMgrs() {
-    // Tear down the query states, which DCHECKs that the memory consumption of
-    // the query's trackers is zero.
-    test_env_->TearDownQueries();
-  }
-
-  void AllocateBlocks(BufferedBlockMgr* block_mgr, BufferedBlockMgr::Client* client,
-      int num_blocks, vector<BufferedBlockMgr::Block*>* blocks) {
-    int32_t* data;
-    Status status;
-    BufferedBlockMgr::Block* new_block;
-    for (int i = 0; i < num_blocks; ++i) {
-      ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
-      ASSERT_TRUE(new_block != NULL);
-      data = new_block->Allocate<int32_t>(sizeof(int32_t));
-      *data = blocks->size();
-      blocks->push_back(new_block);
-    }
-  }
-
-  // Pin all blocks, expecting they are pinned successfully.
-  void PinBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
-    for (int i = 0; i < blocks.size(); ++i) {
-      bool pinned;
-      ASSERT_OK(blocks[i]->Pin(&pinned));
-      ASSERT_TRUE(pinned);
-    }
-  }
-
-  // Pin all blocks. By default, expect no errors from Unpin() calls. If
-  // expected_error_codes is non-NULL, returning one of the error codes is
-  // also allowed.
-  void UnpinBlocks(const vector<BufferedBlockMgr::Block*>& blocks,
-      const vector<TErrorCode::type>* expected_error_codes = nullptr,
-      int delay_between_unpins_ms = 0) {
-    for (int i = 0; i < blocks.size(); ++i) {
-      Status status = blocks[i]->Unpin();
-      if (!status.ok() && expected_error_codes != nullptr) {
-        // Check if it's one of the expected errors.
-        bool is_expected_error = false;
-        for (TErrorCode::type code : *expected_error_codes) {
-          if (status.code() == code) {
-            is_expected_error = true;
-            break;
-          }
-        }
-        ASSERT_TRUE(is_expected_error) << status.msg().msg();
-      } else {
-        ASSERT_TRUE(status.ok()) << status.msg().msg();
-      }
-      if (delay_between_unpins_ms > 0) SleepForMs(delay_between_unpins_ms);
-    }
-  }
-
-  void DeleteBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
-    for (int i = 0; i < blocks.size(); ++i) {
-      blocks[i]->Delete();
-    }
-  }
-
-  void DeleteBlocks(const vector<pair<BufferedBlockMgr::Block*, int32_t>>& blocks) {
-    for (int i = 0; i < blocks.size(); ++i) {
-      blocks[i].first->Delete();
-    }
-  }
-
-  static void WaitForWrites(BufferedBlockMgr* block_mgr) {
-    vector<BufferedBlockMgr*> block_mgrs;
-    block_mgrs.push_back(block_mgr);
-    WaitForWrites(block_mgrs);
-  }
-
-  // Wait for writes issued through block managers to complete.
-  static void WaitForWrites(const vector<BufferedBlockMgr*>& block_mgrs) {
-    int max_attempts = WRITE_WAIT_MILLIS / WRITE_CHECK_INTERVAL_MILLIS;
-    for (int i = 0; i < max_attempts; ++i) {
-      SleepForMs(WRITE_CHECK_INTERVAL_MILLIS);
-      if (AllWritesComplete(block_mgrs)) return;
-    }
-    ASSERT_TRUE(false) << "Writes did not complete after " << WRITE_WAIT_MILLIS << "ms";
-  }
-
-  static bool AllWritesComplete(BufferedBlockMgr* block_mgr) {
-    return block_mgr->GetNumWritesOutstanding() == 0;
-  }
-
-  static bool AllWritesComplete(const vector<BufferedBlockMgr*>& block_mgrs) {
-    for (int i = 0; i < block_mgrs.size(); ++i) {
-      if (!AllWritesComplete(block_mgrs[i])) return false;
-    }
-    return true;
-  }
-
-  // Remove permissions for the temporary file at 'path' - all subsequent writes
-  // to the file should fail. Expects backing file has already been allocated.
-  static void DisableBackingFile(const string& path) {
-    EXPECT_GT(path.size(), 0);
-    EXPECT_EQ(0, chmod(path.c_str(), 0));
-    LOG(INFO) << "Injected fault by removing file permissions " << path;
-  }
-
-  // Check that the file backing the block has dir as a prefix of its path.
-  static bool BlockInDir(BufferedBlockMgr::Block* block, const string& dir) {
-    return block->TmpFilePath().find(dir) == 0;
-  }
-
-  // Find a block in the list that is backed by a file with the given directory as prefix
-  // of its path.
-  static BufferedBlockMgr::Block* FindBlockForDir(
-      const vector<BufferedBlockMgr::Block*>& blocks, const string& dir) {
-    for (int i = 0; i < blocks.size(); ++i) {
-      if (BlockInDir(blocks[i], dir)) return blocks[i];
-    }
-    return NULL;
-  }
-
-  void TestGetNewBlockImpl(int block_size) {
-    Status status;
-    int max_num_blocks = 5;
-    vector<BufferedBlockMgr::Block*> blocks;
-    BufferedBlockMgr* block_mgr;
-    BufferedBlockMgr::Client* client;
-    block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, false, &client);
-    ASSERT_EQ(test_env_->TotalQueryMemoryConsumption(), 0);
-
-    // Allocate blocks until max_num_blocks, they should all succeed and memory
-    // usage should go up.
-    BufferedBlockMgr::Block* new_block;
-    BufferedBlockMgr::Block* first_block = NULL;
-    for (int i = 0; i < max_num_blocks; ++i) {
-      status = block_mgr->GetNewBlock(client, NULL, &new_block);
-      ASSERT_TRUE(new_block != NULL);
-      ASSERT_EQ(block_mgr->bytes_allocated(), (i + 1) * block_size);
-      if (first_block == NULL) first_block = new_block;
-      blocks.push_back(new_block);
-    }
-
-    // Trying to allocate a new one should fail.
-    ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
-    ASSERT_TRUE(new_block == NULL);
-    ASSERT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size);
-
-    // We can allocate a new block by transferring an already allocated one.
-    uint8_t* old_buffer = first_block->buffer();
-    ASSERT_OK(block_mgr->GetNewBlock(client, first_block, &new_block));
-    ASSERT_TRUE(new_block != NULL);
-    ASSERT_EQ(old_buffer, new_block->buffer());
-    ASSERT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size);
-    ASSERT_TRUE(!first_block->is_pinned());
-    blocks.push_back(new_block);
-
-    // Trying to allocate a new one should still fail.
-    ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
-    ASSERT_TRUE(new_block == NULL);
-    ASSERT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size);
-
-    ASSERT_EQ(block_mgr->writes_issued(), 1);
-
-    DeleteBlocks(blocks);
-    TearDownMgrs();
-  }
-
-  void TestEvictionImpl(int block_size) {
-    ASSERT_GT(block_size, 0);
-    int max_num_buffers = 5;
-    BufferedBlockMgr* block_mgr;
-    BufferedBlockMgr::Client* client;
-    block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, false, &client);
-
-    // Check counters.
-    RuntimeProfile* profile = block_mgr->profile();
-    RuntimeProfile::Counter* buffered_pin = profile->GetCounter("BufferedPins");
-
-    vector<BufferedBlockMgr::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
-    ASSERT_EQ(block_mgr->bytes_allocated(), max_num_buffers * block_size);
-    for (BufferedBlockMgr::Block* block : blocks) block->Unpin();
-
-    // Re-pinning all blocks
-    for (int i = 0; i < blocks.size(); ++i) {
-      bool pinned;
-      ASSERT_OK(blocks[i]->Pin(&pinned));
-      ASSERT_TRUE(pinned);
-      ValidateBlock(blocks[i], i);
-    }
-    int buffered_pins_expected = blocks.size();
-    ASSERT_EQ(buffered_pin->value(), buffered_pins_expected);
-
-    // Unpin all blocks
-    for (BufferedBlockMgr::Block* block : blocks) block->Unpin();
-    // Get two new blocks.
-    AllocateBlocks(block_mgr, client, 2, &blocks);
-    // At least two writes must be issued. The first (num_blocks - 2) must be in memory.
-    ASSERT_GE(block_mgr->writes_issued(), 2);
-    for (int i = 0; i < (max_num_buffers - 2); ++i) {
-      bool pinned;
-      ASSERT_OK(blocks[i]->Pin(&pinned));
-      ASSERT_TRUE(pinned);
-      ValidateBlock(blocks[i], i);
-    }
-    ASSERT_GE(buffered_pin->value(), buffered_pins_expected);
-    DeleteBlocks(blocks);
-    TearDownMgrs();
-  }
-
-  // Test that randomly issues GetFreeBlock(), Pin(), Unpin(), Delete() and Close()
-  // calls. All calls made are legal - error conditions are not expected until the first
-  // call to Close(). This is called 2 times with encryption+integrity on/off.
-  // When executed in single-threaded mode 'tid' should be SINGLE_THREADED_TID.
-  static const int SINGLE_THREADED_TID = -1;
-  void TestRandomInternalImpl(RuntimeState* state, BufferedBlockMgr* block_mgr,
-      int num_buffers, int tid) {
-    ASSERT_TRUE(block_mgr != NULL);
-    const int num_iterations = 10000;
-    const int iters_before_close = num_iterations - 1000;
-    bool close_called = false;
-    unordered_map<BufferedBlockMgr::Block*, int> pinned_block_map;
-    vector<pair<BufferedBlockMgr::Block*, int32_t>> pinned_blocks;
-    unordered_map<BufferedBlockMgr::Block*, int> unpinned_block_map;
-    vector<pair<BufferedBlockMgr::Block*, int32_t>> unpinned_blocks;
-
-    typedef enum { Pin, New, Unpin, Delete, Close } ApiFunction;
-    ApiFunction api_function;
-
-    BufferedBlockMgr::Client* client;
-    ASSERT_OK(
-        block_mgr->RegisterClient("", 0, false, NewClientTracker(state), state, &client));
-    ASSERT_TRUE(client != NULL);
-
-    pinned_blocks.reserve(num_buffers);
-    BufferedBlockMgr::Block* new_block;
-    for (int i = 0; i < num_iterations; ++i) {
-      if ((i % 20000) == 0) LOG (ERROR) << " Iteration " << i << endl;
-      if (i > iters_before_close && (rand() % 5 == 0)) {
-        api_function = Close;
-      } else if (pinned_blocks.size() == 0 && unpinned_blocks.size() == 0) {
-        api_function = New;
-      } else if (pinned_blocks.size() == 0) {
-        // Pin or New. Can't unpin or delete.
-        api_function = static_cast<ApiFunction>(rand() % 2);
-      } else if (pinned_blocks.size() >= num_buffers) {
-        // Unpin or delete. Can't pin or get new.
-        api_function = static_cast<ApiFunction>(2 + (rand() % 2));
-      } else if (unpinned_blocks.size() == 0) {
-        // Can't pin. Unpin, new or delete.
-        api_function = static_cast<ApiFunction>(1 + (rand() % 3));
-      } else {
-        // Any api function.
-        api_function = static_cast<ApiFunction>(rand() % 4);
-      }
-
-      pair<BufferedBlockMgr::Block*, int32_t> block_data;
-      int rand_pick = 0;
-      int32_t* data = NULL;
-      bool pinned = false;
-      Status status;
-      switch (api_function) {
-        case New:
-          status = block_mgr->GetNewBlock(client, NULL, &new_block);
-          if (close_called || (tid != SINGLE_THREADED_TID && status.IsCancelled())) {
-            ASSERT_TRUE(new_block == NULL);
-            ASSERT_TRUE(status.IsCancelled());
-            continue;
-          }
-          ASSERT_OK(status);
-          ASSERT_TRUE(new_block != NULL);
-          data = MakeRandomSizeData(new_block);
-          block_data = make_pair(new_block, *data);
-
-          pinned_blocks.push_back(block_data);
-          pinned_block_map.insert(make_pair(block_data.first, pinned_blocks.size() - 1));
-          break;
-        case Pin:
-          rand_pick = rand() % unpinned_blocks.size();
-          block_data = unpinned_blocks[rand_pick];
-          status = block_data.first->Pin(&pinned);
-          if (close_called || (tid != SINGLE_THREADED_TID && status.IsCancelled())) {
-            ASSERT_TRUE(status.IsCancelled());
-            // In single-threaded runs the block should not have been pinned.
-            // In multi-threaded runs Pin() may return the block pinned but the status to
-            // be cancelled. In this case we could move the block from unpinned_blocks
-            // to pinned_blocks. We do not do that because after IsCancelled() no actual
-            // block operations should take place.
-            if (tid == SINGLE_THREADED_TID) ASSERT_FALSE(pinned);
-            continue;
-          }
-          ASSERT_OK(status);
-          ASSERT_TRUE(pinned);
-          ValidateRandomSizeData(block_data.first, block_data.second);
-          unpinned_blocks[rand_pick] = unpinned_blocks.back();
-          unpinned_blocks.pop_back();
-          unpinned_block_map[unpinned_blocks[rand_pick].first] = rand_pick;
-
-          pinned_blocks.push_back(block_data);
-          pinned_block_map.insert(make_pair(block_data.first, pinned_blocks.size() - 1));
-          break;
-        case Unpin:
-          rand_pick = rand() % pinned_blocks.size();
-          block_data = pinned_blocks[rand_pick];
-          status = block_data.first->Unpin();
-          if (close_called || (tid != SINGLE_THREADED_TID && status.IsCancelled())) {
-            ASSERT_TRUE(status.IsCancelled());
-            continue;
-          }
-          ASSERT_OK(status);
-          pinned_blocks[rand_pick] = pinned_blocks.back();
-          pinned_blocks.pop_back();
-          pinned_block_map[pinned_blocks[rand_pick].first] = rand_pick;
-
-          unpinned_blocks.push_back(block_data);
-          unpinned_block_map.insert(make_pair(block_data.first,
-              unpinned_blocks.size() - 1));
-          break;
-        case Delete:
-          rand_pick = rand() % pinned_blocks.size();
-          block_data = pinned_blocks[rand_pick];
-          block_data.first->Delete();
-          pinned_blocks[rand_pick] = pinned_blocks.back();
-          pinned_blocks.pop_back();
-          pinned_block_map[pinned_blocks[rand_pick].first] = rand_pick;
-          break;
-        case Close:
-          block_mgr->Cancel();
-          close_called = true;
-          break;
-      }
-    }
-
-    // The client needs to delete all its blocks.
-    DeleteBlocks(pinned_blocks);
-    DeleteBlocks(unpinned_blocks);
-  }
-
-  // Single-threaded execution of the TestRandomInternalImpl.
-  void TestRandomInternalSingle(int block_size) {
-    ASSERT_GT(block_size, 0);
-    ASSERT_TRUE(test_env_.get() != NULL);
-    const int max_num_buffers = 100;
-    RuntimeState* state;
-    BufferedBlockMgr* block_mgr = CreateMgr(0, max_num_buffers, block_size, &state);
-    TestRandomInternalImpl(state, block_mgr, max_num_buffers, SINGLE_THREADED_TID);
-    TearDownMgrs();
-  }
-
-  // Multi-threaded execution of the TestRandomInternalImpl.
-  void TestRandomInternalMulti(int num_threads, int block_size) {
-    ASSERT_GT(num_threads, 0);
-    ASSERT_GT(block_size, 0);
-    ASSERT_TRUE(test_env_.get() != NULL);
-    const int max_num_buffers = 100;
-    RuntimeState* state;
-    BufferedBlockMgr* block_mgr = CreateMgr(0, num_threads * max_num_buffers, block_size,
-        &state);
-
-    thread_group workers;
-    for (int i = 0; i < num_threads; ++i) {
-      thread* t = new thread(bind(&BufferedBlockMgrTest::TestRandomInternalImpl, this,
-                                  state, block_mgr, max_num_buffers, i));
-      workers.add_thread(t);
-    }
-    workers.join_all();
-    TearDownMgrs();
-  }
-
-  // Repeatedly call BufferedBlockMgr::Create() and BufferedBlockMgr::~BufferedBlockMgr().
-  void CreateDestroyThread(RuntimeState* state) {
-    const int num_buffers = 10;
-    const int iters = 10000;
-    for (int i = 0; i < iters; ++i) {
-      shared_ptr<BufferedBlockMgr> mgr;
-      Status status = BufferedBlockMgr::Create(state, state->query_mem_tracker(),
-          state->runtime_profile(), test_env_->tmp_file_mgr(), block_size_ * num_buffers,
-          block_size_, &mgr);
-    }
-  }
-
-  // IMPALA-2286: Test for races between BufferedBlockMgr::Create() and
-  // BufferedBlockMgr::~BufferedBlockMgr().
-  void CreateDestroyMulti() {
-    const int num_threads = 8;
-    thread_group workers;
-    // Create a shared RuntimeState with no BufferedBlockMgr.
-    RuntimeState shared_state(TQueryCtx(), test_env_->exec_env());
-
-    for (int i = 0; i < num_threads; ++i) {
-      thread* t = new thread(
-          bind(&BufferedBlockMgrTest::CreateDestroyThread, this, &shared_state));
-      workers.add_thread(t);
-    }
-    workers.join_all();
-    shared_state.ReleaseResources();
-  }
-
-  // Test that in-flight IO operations are correctly handled on tear down.
-  // write: if true, tear down while write operations are in flight, otherwise tear down
-  //    during read operations.
-  void TestDestructDuringIO(bool write);
-
-  /// Test for IMPALA-2252: race when tearing down runtime state and block mgr after query
-  /// cancellation. Simulates query cancellation while writes are in flight. Forces the
-  /// block mgr to have a longer lifetime than the runtime state. If write_error is true,
-  /// force writes to hit errors. If wait_for_writes is true, wait for writes to complete
-  /// before destroying block mgr.
-  void TestRuntimeStateTeardown(bool write_error, bool wait_for_writes);
-
-  void TestWriteError(int write_delay_ms);
-
-  scoped_ptr<TestEnv> test_env_;
-  ObjectPool pool_;
-  vector<string> created_tmp_dirs_;
-};
-
-TEST_F(BufferedBlockMgrTest, GetNewBlock) {
-  TestGetNewBlockImpl(1024);
-  TestGetNewBlockImpl(8 * 1024);
-  TestGetNewBlockImpl(8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, GetNewBlockSmallBlocks) {
-  const int block_size = 1024;
-  int max_num_blocks = 3;
-  BufferedBlockMgr* block_mgr;
-  BufferedBlockMgr::Client* client;
-  block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, false, &client);
-  MemTracker* client_tracker = block_mgr->get_tracker(client);
-  ASSERT_EQ(0, test_env_->TotalQueryMemoryConsumption());
-
-  vector<BufferedBlockMgr::Block*> blocks;
-
-  // Allocate a small block.
-  BufferedBlockMgr::Block* new_block = NULL;
-  ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block, 128));
-  ASSERT_TRUE(new_block != NULL);
-  ASSERT_EQ(block_mgr->bytes_allocated(), 0);
-  ASSERT_EQ(block_mgr->mem_tracker()->consumption(), 0);
-  ASSERT_EQ(client_tracker->consumption(), 128);
-  ASSERT_TRUE(new_block->is_pinned());
-  ASSERT_EQ(new_block->BytesRemaining(), 128);
-  ASSERT_TRUE(new_block->buffer() != NULL);
-  blocks.push_back(new_block);
-
-  // Allocate a normal block
-  ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
-  ASSERT_TRUE(new_block != NULL);
-  ASSERT_EQ(block_mgr->bytes_allocated(), block_mgr->max_block_size());
-  ASSERT_EQ(block_mgr->mem_tracker()->consumption(), block_mgr->max_block_size());
-  ASSERT_EQ(client_tracker->consumption(), 128 + block_mgr->max_block_size());
-  ASSERT_TRUE(new_block->is_pinned());
-  ASSERT_EQ(new_block->BytesRemaining(), block_mgr->max_block_size());
-  ASSERT_TRUE(new_block->buffer() != NULL);
-  blocks.push_back(new_block);
-
-  // Allocate another small block.
-  ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block, 512));
-  ASSERT_TRUE(new_block != NULL);
-  ASSERT_EQ(block_mgr->bytes_allocated(), block_mgr->max_block_size());
-  ASSERT_EQ(block_mgr->mem_tracker()->consumption(), block_mgr->max_block_size());
-  ASSERT_EQ(client_tracker->consumption(), 128 + 512 + block_mgr->max_block_size());
-  ASSERT_TRUE(new_block->is_pinned());
-  ASSERT_EQ(new_block->BytesRemaining(), 512);
-  ASSERT_TRUE(new_block->buffer() != NULL);
-  blocks.push_back(new_block);
-
-  // Should be able to unpin and pin the middle block
-  ASSERT_OK(blocks[1]->Unpin());
-
-  bool pinned;
-  ASSERT_OK(blocks[1]->Pin(&pinned));
-  ASSERT_TRUE(pinned);
-
-  DeleteBlocks(blocks);
-  TearDownMgrs();
-}
-
-// Test that pinning more blocks than the max available buffers.
-TEST_F(BufferedBlockMgrTest, Pin) {
-  int max_num_blocks = 5;
-  const int block_size = 1024;
-  BufferedBlockMgr* block_mgr;
-  BufferedBlockMgr::Client* client;
-  block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, false, &client);
-
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr, client, max_num_blocks, &blocks);
-
-  // Unpin them all.
-  for (int i = 0; i < blocks.size(); ++i) {
-    ASSERT_OK(blocks[i]->Unpin());
-  }
-
-  // Allocate more, this should work since we just unpinned some blocks.
-  AllocateBlocks(block_mgr, client, max_num_blocks, &blocks);
-
-  // Try to pin a unpinned block, this should not be possible.
-  bool pinned;
-  ASSERT_OK(blocks[0]->Pin(&pinned));
-  ASSERT_FALSE(pinned);
-
-  // Unpin all blocks.
-  for (int i = 0; i < blocks.size(); ++i) {
-    ASSERT_OK(blocks[i]->Unpin());
-  }
-
-  // Should be able to pin max_num_blocks blocks.
-  for (int i = 0; i < max_num_blocks; ++i) {
-    ASSERT_OK(blocks[i]->Pin(&pinned));
-    ASSERT_TRUE(pinned);
-  }
-
-  // Can't pin any more though.
-  ASSERT_OK(blocks[max_num_blocks]->Pin(&pinned));
-  ASSERT_FALSE(pinned);
-
-  DeleteBlocks(blocks);
-  TearDownMgrs();
-}
-
-// Test the eviction policy of the block mgr. No writes issued until more than
-// the max available buffers are allocated. Writes must be issued in LIFO order.
-TEST_F(BufferedBlockMgrTest, Eviction) {
-  TestEvictionImpl(1024);
-  TestEvictionImpl(8 * 1024 * 1024);
-}
-
-// Test deletion and reuse of blocks.
-TEST_F(BufferedBlockMgrTest, Deletion) {
-  int max_num_buffers = 5;
-  const int block_size = 1024;
-  BufferedBlockMgr* block_mgr;
-  BufferedBlockMgr::Client* client;
-  block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, false, &client);
-
-  // Check counters.
-  RuntimeProfile* profile = block_mgr->profile();
-  RuntimeProfile::Counter* recycled_cnt = profile->GetCounter("BlocksRecycled");
-  RuntimeProfile::Counter* created_cnt = profile->GetCounter("BlocksCreated");
-
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-  ASSERT_EQ(created_cnt->value(), max_num_buffers);
-
-  DeleteBlocks(blocks);
-  blocks.clear();
-  AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-  ASSERT_EQ(created_cnt->value(), max_num_buffers);
-  ASSERT_EQ(recycled_cnt->value(), max_num_buffers);
-
-  DeleteBlocks(blocks);
-  TearDownMgrs();
-}
-
-// Delete blocks of various sizes and statuses to exercise the different code paths.
-// This relies on internal validation in block manager to detect many errors.
-TEST_F(BufferedBlockMgrTest, DeleteSingleBlocks) {
-  int max_num_buffers = 16;
-  BufferedBlockMgr::Client* client;
-  BufferedBlockMgr* block_mgr =
-      CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client);
-  MemTracker* client_tracker = block_mgr->get_tracker(client);
-
-  // Pinned I/O block.
-  BufferedBlockMgr::Block* new_block;
-  ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
-  ASSERT_TRUE(new_block != NULL);
-  ASSERT_TRUE(new_block->is_pinned());
-  ASSERT_TRUE(new_block->is_max_size());
-  new_block->Delete();
-  ASSERT_EQ(0, client_tracker->consumption());
-
-  // Pinned non-I/O block.
-  int small_block_size = 128;
-  ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block, small_block_size));
-  ASSERT_TRUE(new_block != NULL);
-  ASSERT_TRUE(new_block->is_pinned());
-  ASSERT_EQ(small_block_size, client_tracker->consumption());
-  new_block->Delete();
-  ASSERT_EQ(0, client_tracker->consumption());
-
-  // Unpinned I/O block - delete after written to disk.
-  ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
-  ASSERT_TRUE(new_block != NULL);
-  ASSERT_TRUE(new_block->is_pinned());
-  ASSERT_TRUE(new_block->is_max_size());
-  new_block->Unpin();
-  ASSERT_FALSE(new_block->is_pinned());
-  WaitForWrites(block_mgr);
-  new_block->Delete();
-  ASSERT_EQ(client_tracker->consumption(), 0);
-
-  // Unpinned I/O block - delete before written to disk.
-  ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
-  ASSERT_TRUE(new_block != NULL);
-  ASSERT_TRUE(new_block->is_pinned());
-  ASSERT_TRUE(new_block->is_max_size());
-  new_block->Unpin();
-  ASSERT_FALSE(new_block->is_pinned());
-  new_block->Delete();
-  WaitForWrites(block_mgr);
-  ASSERT_EQ(client_tracker->consumption(), 0);
-
-  TearDownMgrs();
-}
-
-// This exercises a code path where:
-// 1. A block A is unpinned.
-// 2. A block B is unpinned.
-// 3. A write for block A is initiated.
-// 4. Block A is pinned.
-// 5. Block B is pinned, with block A passed in to be deleted.
-//    Block A's buffer will be transferred to block B.
-// 6. The write for block A completes.
-// Previously there was a bug (IMPALA-3936) where the buffer transfer happened before the
-// write completed. There were also various hangs related to missing condition variable
-// notifications.
-TEST_F(BufferedBlockMgrTest, TransferBufferDuringWrite) {
-  const int trials = 5;
-  const int max_num_buffers = 2;
-  BufferedBlockMgr::Client* client;
-  RuntimeState* query_state;
-  BufferedBlockMgr* block_mgr = CreateMgrAndClient(
-      0, max_num_buffers, block_size_, 1, false, &client, &query_state);
-
-  for (int trial = 0; trial < trials; ++trial) {
-    for (int delay_ms = 0; delay_ms <= 10; delay_ms += 5) {
-      // Force writes to be delayed to enlarge window of opportunity for bug.
-      block_mgr->set_debug_write_delay_ms(delay_ms);
-      vector<BufferedBlockMgr::Block*> blocks;
-      AllocateBlocks(block_mgr, client, 2, &blocks);
-
-      // Force the second block to be written and have its buffer freed.
-      // We only have one buffer to share between the first and second blocks now.
-      ASSERT_OK(blocks[1]->Unpin());
-
-      // Create another client. Reserving different numbers of buffers can send it
-      // down different code paths because the original client is entitled to different
-      // number of buffers.
-      int reserved_buffers = trial % max_num_buffers;
-      BufferedBlockMgr::Client* tmp_client;
-      ASSERT_OK(block_mgr->RegisterClient("tmp_client", reserved_buffers, false,
-          NewClientTracker(query_state), query_state, &tmp_client));
-      BufferedBlockMgr::Block* tmp_block;
-      ASSERT_OK(block_mgr->GetNewBlock(tmp_client, NULL, &tmp_block));
-
-      // Initiate the write, repin the block, then immediately try to swap the buffer to
-      // the second block while the write is still in flight.
-      ASSERT_OK(blocks[0]->Unpin());
-      bool pinned;
-      ASSERT_OK(blocks[0]->Pin(&pinned));
-      ASSERT_TRUE(pinned);
-      ASSERT_OK(blocks[1]->Pin(&pinned, blocks[0], false));
-      ASSERT_TRUE(pinned);
-
-      blocks[1]->Delete();
-      tmp_block->Delete();
-      block_mgr->ClearReservations(tmp_client);
-    }
-  }
-}
-
-// Test that all APIs return cancelled after close.
-TEST_F(BufferedBlockMgrTest, Close) {
-  int max_num_buffers = 5;
-  const int block_size = 1024;
-  BufferedBlockMgr* block_mgr;
-  BufferedBlockMgr::Client* client;
-  block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, false, &client);
-
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
-  block_mgr->Cancel();
-
-  BufferedBlockMgr::Block* new_block;
-  Status status = block_mgr->GetNewBlock(client, NULL, &new_block);
-  ASSERT_TRUE(status.IsCancelled());
-  ASSERT_TRUE(new_block == NULL);
-  status = blocks[0]->Unpin();
-  ASSERT_TRUE(status.IsCancelled());
-  bool pinned;
-  status = blocks[0]->Pin(&pinned);
-  ASSERT_TRUE(status.IsCancelled());
-
-  DeleteBlocks(blocks);
-  TearDownMgrs();
-}
-
-TEST_F(BufferedBlockMgrTest, DestructDuringWrite) {
-  const int trials = 20;
-  const int max_num_buffers = 5;
-
-  for (int trial = 0; trial < trials; ++trial) {
-    BufferedBlockMgr::Client* client;
-    BufferedBlockMgr* block_mgr =
-        CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client);
-
-    vector<BufferedBlockMgr::Block*> blocks;
-    AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
-    // Unpin will initiate writes.
-    UnpinBlocks(blocks);
-
-    // Writes should still be in flight when blocks are deleted.
-    DeleteBlocks(blocks);
-
-    // Destruct block manager while blocks are deleted and writes are in flight.
-    TearDownMgrs();
-  }
-  // Destroying test environment will check that all writes have completed.
-}
-
-void BufferedBlockMgrTest::TestRuntimeStateTeardown(
-    bool write_error, bool wait_for_writes) {
-  const int max_num_buffers = 10;
-  RuntimeState* state;
-  BufferedBlockMgr::Client* client;
-  CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client, &state);
-
-  // Hold extra references to block mgr and query state so they outlive RuntimeState.
-  shared_ptr<BufferedBlockMgr> block_mgr;
-  QueryState::ScopedRef qs(state->query_id());
-  Status status = BufferedBlockMgr::Create(state, state->query_mem_tracker(),
-      state->runtime_profile(), test_env_->tmp_file_mgr(), 0, block_size_, &block_mgr);
-  ASSERT_TRUE(status.ok());
-  ASSERT_TRUE(block_mgr != NULL);
-
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr.get(), client, max_num_buffers, &blocks);
-
-  if (write_error) {
-    // Force flushing blocks to disk then remove temporary file to force writes to fail.
-    UnpinBlocks(blocks);
-    vector<BufferedBlockMgr::Block*> more_blocks;
-    AllocateBlocks(block_mgr.get(), client, max_num_buffers, &more_blocks);
-
-    const string& tmp_file_path = blocks[0]->TmpFilePath();
-    DeleteBlocks(more_blocks);
-    PinBlocks(blocks);
-    DisableBackingFile(tmp_file_path);
-  }
-
-  // Unpin will initiate writes. If the write error propagates fast enough, some Unpin()
-  // calls may see a cancelled block mgr.
-  vector<TErrorCode::type> cancelled_code = {TErrorCode::CANCELLED};
-  UnpinBlocks(blocks, write_error ? &cancelled_code : nullptr);
-
-  // Tear down while writes are in flight. The block mgr may outlive the runtime state
-  // because it may be referenced by other runtime states. This test simulates this
-  // scenario by holding onto a reference to the block mgr. This should be safe so
-  // long as blocks are properly deleted before the runtime state is torn down.
-  DeleteBlocks(blocks);
-  test_env_->TearDownQueries();
-
-  // Optionally wait for writes to complete after cancellation.
-  if (wait_for_writes) WaitForWrites(block_mgr.get());
-  block_mgr.reset();
-
-  ASSERT_EQ(test_env_->TotalQueryMemoryConsumption(), 0);
-}
-
-TEST_F(BufferedBlockMgrTest, RuntimeStateTeardown) {
-  TestRuntimeStateTeardown(false, false);
-}
-
-TEST_F(BufferedBlockMgrTest, RuntimeStateTeardownWait) {
-  TestRuntimeStateTeardown(false, true);
-}
-
-TEST_F(BufferedBlockMgrTest, RuntimeStateTeardownWriteError) {
-  TestRuntimeStateTeardown(true, true);
-}
-
-// Regression test for IMPALA-2927 write complete with cancelled runtime state
-TEST_F(BufferedBlockMgrTest, WriteCompleteWithCancelledRuntimeState) {
-  const int max_num_buffers = 10;
-  RuntimeState* state;
-  BufferedBlockMgr::Client* client;
-  BufferedBlockMgr* block_mgr =
-      CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client, &state);
-
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
-  // Force flushing blocks to disk so that more writes are in flight.
-  UnpinBlocks(blocks);
-
-  // Cancel the runtime state and re-pin the blocks while writes are in flight to check
-  // that WriteComplete() handles the case ok.
-  state->set_is_cancelled();
-  PinBlocks(blocks);
-
-  WaitForWrites(block_mgr);
-  DeleteBlocks(blocks);
-}
-
-// Remove write permissions on scratch files. Return # of scratch files.
-static int remove_scratch_perms() {
-  int num_files = 0;
-  directory_iterator dir_it(SCRATCH_DIR);
-  for (; dir_it != directory_iterator(); ++dir_it) {
-    ++num_files;
-    chmod(dir_it->path().c_str(), 0);
-  }
-
-  return num_files;
-}
-
-// Test that the block manager behaves correctly after a write error.  Delete the scratch
-// directory before an operation that would cause a write and test that subsequent API
-// calls return 'CANCELLED' correctly.
-void BufferedBlockMgrTest::TestWriteError(int write_delay_ms) {
-  int max_num_buffers = 2;
-  const int block_size = 1024;
-  BufferedBlockMgr* block_mgr;
-  BufferedBlockMgr::Client* client;
-  block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, false, &client);
-  block_mgr->set_debug_write_delay_ms(write_delay_ms);
-
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-  // Unpin two blocks here, to ensure that backing storage is allocated in tmp file.
-  UnpinBlocks(blocks);
-  WaitForWrites(block_mgr);
-  // Repin the blocks
-  PinBlocks(blocks);
-  // Remove the backing storage so that future writes will fail
-  int num_files = remove_scratch_perms();
-  ASSERT_GT(num_files, 0);
-  vector<TErrorCode::type> expected_error_codes = {TErrorCode::CANCELLED,
-      TErrorCode::SCRATCH_ALLOCATION_FAILED};
-  // Give the first write a chance to fail before the second write starts.
-  int interval_ms = 10;
-  UnpinBlocks(blocks, &expected_error_codes, interval_ms);
-  WaitForWrites(block_mgr);
-  // Subsequent calls should fail.
-  DeleteBlocks(blocks);
-  BufferedBlockMgr::Block* new_block;
-  ASSERT_TRUE(block_mgr->GetNewBlock(client, NULL, &new_block).IsCancelled());
-  ASSERT_TRUE(new_block == NULL);
-
-  TearDownMgrs();
-}
-
-TEST_F(BufferedBlockMgrTest, WriteError) {
-  TestWriteError(0);
-}
-
-// Regression test for IMPALA-4842 - inject a delay in the write to
-// reproduce the issue.
-TEST_F(BufferedBlockMgrTest, WriteErrorWriteDelay) {
-  TestWriteError(100);
-}
-
-// Test block manager error handling when temporary file space cannot be allocated to
-// back an unpinned buffer.
-TEST_F(BufferedBlockMgrTest, TmpFileAllocateError) {
-  int max_num_buffers = 2;
-  BufferedBlockMgr::Client* client;
-  BufferedBlockMgr* block_mgr =
-      CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client);
-
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-  // Unpin a block, forcing a write.
-  ASSERT_OK(blocks[0]->Unpin());
-  WaitForWrites(block_mgr);
-  // Remove temporary files - subsequent operations will fail.
-  int num_files = remove_scratch_perms();
-  ASSERT_TRUE(num_files > 0);
-  // Current implementation will not fail here until it attempts to write the file.
-  // This behavior is not contractual but we want to know if it changes accidentally.
-  ASSERT_OK(blocks[1]->Unpin());
-
-  // Write failure should cancel query
-  WaitForWrites(block_mgr);
-  ASSERT_TRUE(block_mgr->IsCancelled());
-
-  DeleteBlocks(blocks);
-  TearDownMgrs();
-}
-
-// Test that the block manager is able to blacklist a temporary device correctly after a
-// write error. The query that encountered the write error should not allocate more
-// blocks on that device, but existing blocks on the device will remain in use and future
-// queries will use the device.
-TEST_F(BufferedBlockMgrTest, WriteErrorBlacklist) {
-  // Set up two buffered block managers with two temporary dirs.
-  vector<string> tmp_dirs = InitMultipleTmpDirs(2);
-  // Simulate two concurrent queries.
-  const int NUM_BLOCK_MGRS = 2;
-  const int MAX_NUM_BLOCKS = 4;
-  int blocks_per_mgr = MAX_NUM_BLOCKS / NUM_BLOCK_MGRS;
-  vector<BufferedBlockMgr*> block_mgrs;
-  vector<BufferedBlockMgr::Client*> clients;
-  CreateMgrsAndClients(
-      0, NUM_BLOCK_MGRS, blocks_per_mgr, block_size_, 0, false, &block_mgrs, &clients);
-
-  // Allocate files for all 2x2 combinations by unpinning blocks.
-  vector<vector<BufferedBlockMgr::Block*>> blocks;
-  vector<BufferedBlockMgr::Block*> all_blocks;
-  for (int i = 0; i < NUM_BLOCK_MGRS; ++i) {
-    vector<BufferedBlockMgr::Block*> mgr_blocks;
-    AllocateBlocks(block_mgrs[i], clients[i], blocks_per_mgr, &mgr_blocks);
-    UnpinBlocks(mgr_blocks);
-    for (int j = 0; j < blocks_per_mgr; ++j) {
-      LOG(INFO) << "Manager " << i << " Block " << j << " backed by file "
-                << mgr_blocks[j]->TmpFilePath();
-    }
-    blocks.push_back(mgr_blocks);
-    all_blocks.insert(all_blocks.end(), mgr_blocks.begin(), mgr_blocks.end());
-  }
-  WaitForWrites(block_mgrs);
-  int error_mgr = 0;
-  int no_error_mgr = 1;
-  const string& error_dir = tmp_dirs[0];
-  const string& good_dir = tmp_dirs[1];
-  // Delete one file from first scratch dir for first block manager.
-  BufferedBlockMgr::Block* error_block = FindBlockForDir(blocks[error_mgr], error_dir);
-  ASSERT_TRUE(error_block != NULL) << "Expected a tmp file in dir " << error_dir;
-  const string& error_file_path = error_block->TmpFilePath();
-  PinBlocks(all_blocks);
-  DisableBackingFile(error_file_path);
-  UnpinBlocks(all_blocks); // Should succeed since writes occur asynchronously
-  WaitForWrites(block_mgrs);
-  // Both block managers have a usable tmp directory so should still be usable.
-  ASSERT_FALSE(block_mgrs[error_mgr]->IsCancelled());
-  ASSERT_FALSE(block_mgrs[no_error_mgr]->IsCancelled());
-  // Temporary device with error should still be active.
-  vector<TmpFileMgr::DeviceId> active_tmp_devices =
-      test_env_->tmp_file_mgr()->ActiveTmpDevices();
-  ASSERT_EQ(tmp_dirs.size(), active_tmp_devices.size());
-  for (int i = 0; i < active_tmp_devices.size(); ++i) {
-    const string& device_path =
-        test_env_->tmp_file_mgr()->GetTmpDirPath(active_tmp_devices[i]);
-    ASSERT_EQ(string::npos, error_dir.find(device_path));
-  }
-
-  // The error block manager should only allocate from the device that had no error.
-  // The non-error block manager should continue using both devices, since it didn't
-  // encounter a write error itself.
-  vector<BufferedBlockMgr::Block*> error_new_blocks;
-  AllocateBlocks(
-      block_mgrs[error_mgr], clients[error_mgr], blocks_per_mgr, &error_new_blocks);
-  UnpinBlocks(error_new_blocks);
-  WaitForWrites(block_mgrs);
-  EXPECT_TRUE(FindBlockForDir(error_new_blocks, good_dir) != NULL);
-  EXPECT_TRUE(FindBlockForDir(error_new_blocks, error_dir) == NULL);
-  for (int i = 0; i < error_new_blocks.size(); ++i) {
-    LOG(INFO) << "Newly created block backed by file "
-              << error_new_blocks[i]->TmpFilePath();
-    EXPECT_TRUE(BlockInDir(error_new_blocks[i], good_dir));
-  }
-  DeleteBlocks(error_new_blocks);
-
-  PinBlocks(blocks[no_error_mgr]);
-  UnpinBlocks(blocks[no_error_mgr]);
-  WaitForWrites(block_mgrs);
-  EXPECT_TRUE(FindBlockForDir(blocks[no_error_mgr], good_dir) != NULL);
-  EXPECT_TRUE(FindBlockForDir(blocks[no_error_mgr], error_dir) != NULL);
-
-  // The second block manager should use the bad directory for new blocks since
-  // blacklisting is per-manager, not global.
-  vector<BufferedBlockMgr::Block*> no_error_new_blocks;
-  AllocateBlocks(block_mgrs[no_error_mgr], clients[no_error_mgr], blocks_per_mgr,
-      &no_error_new_blocks);
-  UnpinBlocks(no_error_new_blocks);
-  WaitForWrites(block_mgrs);
-  EXPECT_TRUE(FindBlockForDir(no_error_new_blocks, good_dir) != NULL);
-  EXPECT_TRUE(FindBlockForDir(no_error_new_blocks, error_dir) != NULL);
-  DeleteBlocks(no_error_new_blocks);
-
-  // A new block manager should use the both dirs for backing storage.
-  BufferedBlockMgr::Client* new_client;
-  BufferedBlockMgr* new_block_mgr =
-      CreateMgrAndClient(9999, blocks_per_mgr, block_size_, 0, false, &new_client);
-  vector<BufferedBlockMgr::Block*> new_mgr_blocks;
-  AllocateBlocks(new_block_mgr, new_client, blocks_per_mgr, &new_mgr_blocks);
-  UnpinBlocks(new_mgr_blocks);
-  WaitForWrites(block_mgrs);
-  EXPECT_TRUE(FindBlockForDir(new_mgr_blocks, good_dir) != NULL);
-  EXPECT_TRUE(FindBlockForDir(new_mgr_blocks, error_dir) != NULL);
-  DeleteBlocks(new_mgr_blocks);
-
-  DeleteBlocks(all_blocks);
-}
-
-// Check that allocation error resulting from removal of directory results in blocks
-/// being allocated in other directories.
-TEST_F(BufferedBlockMgrTest, AllocationErrorHandling) {
-  // Set up two buffered block managers with two temporary dirs.
-  vector<string> tmp_dirs = InitMultipleTmpDirs(2);
-  // Simulate two concurrent queries.
-  int num_block_mgrs = 2;
-  int max_num_blocks = 4;
-  int blocks_per_mgr = max_num_blocks / num_block_mgrs;
-  vector<RuntimeState*> runtime_states;
-  vector<BufferedBlockMgr*> block_mgrs;
-  vector<BufferedBlockMgr::Client*> clients;
-  CreateMgrsAndClients(
-      0, num_block_mgrs, blocks_per_mgr, block_size_, 0, false, &block_mgrs, &clients);
-
-  // Allocate files for all 2x2 combinations by unpinning blocks.
-  vector<vector<BufferedBlockMgr::Block*>> blocks;
-  for (int i = 0; i < num_block_mgrs; ++i) {
-    vector<BufferedBlockMgr::Block*> mgr_blocks;
-    LOG(INFO) << "Iter " << i;
-    AllocateBlocks(block_mgrs[i], clients[i], blocks_per_mgr, &mgr_blocks);
-    blocks.push_back(mgr_blocks);
-  }
-  const string& bad_dir = tmp_dirs[0];
-  const string& bad_scratch_subdir = bad_dir + SCRATCH_SUFFIX;
-  chmod(bad_scratch_subdir.c_str(), 0);
-  // The block mgr should attempt to allocate space in bad dir for one block, which will
-  // cause an error when it tries to create/expand the file. It should recover and just
-  // use the good dir.
-  UnpinBlocks(blocks[0]);
-  // Directories remain on active list even when they experience errors.
-  ASSERT_EQ(2, test_env_->tmp_file_mgr()->NumActiveTmpDevices());
-  // Blocks should not be written to bad dir even if it remains non-writable.
-  UnpinBlocks(blocks[1]);
-  // All writes should succeed.
-  WaitForWrites(block_mgrs);
-  for (int i = 0; i < blocks.size(); ++i) {
-    DeleteBlocks(blocks[i]);
-  }
-}
-
-// Test that block manager fails cleanly when all directories are inaccessible at runtime.
-TEST_F(BufferedBlockMgrTest, NoDirsAllocationError) {
-  vector<string> tmp_dirs = InitMultipleTmpDirs(2);
-  int max_num_buffers = 2;
-  RuntimeState* runtime_state;
-  BufferedBlockMgr::Client* client;
-  BufferedBlockMgr* block_mgr = CreateMgrAndClient(
-      0, max_num_buffers, block_size_, 0, false, &client, &runtime_state);
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-  for (int i = 0; i < tmp_dirs.size(); ++i) {
-    const string& tmp_scratch_subdir = tmp_dirs[i] + SCRATCH_SUFFIX;
-    chmod(tmp_scratch_subdir.c_str(), 0);
-  }
-  ErrorLogMap error_log;
-  runtime_state->GetErrors(&error_log);
-  ASSERT_TRUE(error_log.empty());
-  // Unpin the blocks. Unpinning may fail if it hits a write error before this thread is
-  // done unpinning.
-  vector<TErrorCode::type> cancelled_code = {TErrorCode::CANCELLED};
-  UnpinBlocks(blocks, &cancelled_code);
-
-  LOG(INFO) << "Waiting for writes.";
-  // Write failure should cancel query.
-  WaitForWrites(block_mgr);
-  LOG(INFO) << "writes done.";
-  ASSERT_TRUE(block_mgr->IsCancelled());
-  runtime_state->GetErrors(&error_log);
-  ASSERT_FALSE(error_log.empty());
-  stringstream error_string;
-  PrintErrorMap(&error_string, error_log);
-  LOG(INFO) << "Errors: " << error_string.str();
-  // SCRATCH_ALLOCATION_FAILED error should exist in the error log.
-  ErrorLogMap::const_iterator it = error_log.find(TErrorCode::SCRATCH_ALLOCATION_FAILED);
-  ASSERT_NE(it, error_log.end());
-  ASSERT_GT(it->second.count, 0);
-  DeleteBlocks(blocks);
-}
-
-// Test that block manager can still allocate buffers when spilling is disabled.
-TEST_F(BufferedBlockMgrTest, NoTmpDirs) {
-  InitMultipleTmpDirs(0);
-  int max_num_buffers = 3;
-  BufferedBlockMgr::Client* client;
-  BufferedBlockMgr* block_mgr =
-      CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client);
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-  DeleteBlocks(blocks);
-}
-
-// Test that block manager can still allocate buffers when spilling is disabled by
-// setting scratch_limit = 0.
-TEST_F(BufferedBlockMgrTest, ScratchLimitZero) {
-  int max_num_buffers = 3;
-  BufferedBlockMgr::Client* client;
-  TQueryOptions query_options;
-  query_options.scratch_limit = 0;
-  BufferedBlockMgr* block_mgr = CreateMgrAndClient(
-      0, max_num_buffers, block_size_, 0, false, &client, NULL, &query_options);
-  vector<BufferedBlockMgr::Block*> blocks;
-  AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-  DeleteBlocks(blocks);
-}
-
-// Create two clients with different number of reserved buffers.
-TEST_F(BufferedBlockMgrTest, MultipleClients) {
-  int client1_buffers = 3;
-  int client2_buffers = 5;
-  int max_num_buffers = client1_buffers + client2_buffers;
-  const int block_size = 1024;
-  RuntimeState* runtime_state;
-  BufferedBlockMgr* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
-  BufferedBlockMgr::Client* client1 = NULL;
-  BufferedBlockMgr::Client* client2 = NULL;
-  ASSERT_OK(block_mgr->RegisterClient("", client1_buffers, false,
-      NewClientTracker(runtime_state), runtime_state, &client1));
-  ASSERT_TRUE(client1 != NULL);
-  ASSERT_OK(block_mgr->RegisterClient("", client2_buffers, false,
-      NewClientTracker(runtime_state), runtime_state, &client2));
-  ASSERT_TRUE(client2 != NULL);
-
-  // Reserve client 1's and 2's buffers. They should succeed.
-  bool reserved = block_mgr->TryAcquireTmpReservation(client1, 1);
-  ASSERT_TRUE(reserved);
-  reserved = block_mgr->TryAcquireTmpReservation(client2, 1);
-  ASSERT_TRUE(reserved);
-
-  vector<BufferedBlockMgr::Block*> client1_blocks;
-  // Allocate all of client1's reserved blocks, they should all succeed.
-  AllocateBlocks(block_mgr, client1, client1_buffers, &client1_blocks);
-
-  // Try allocating one more, that should fail.
-  BufferedBlockMgr::Block* block;
-  ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
-  ASSERT_TRUE(block == NULL);
-
-  // Trying to reserve should also fail.
-  reserved = block_mgr->TryAcquireTmpReservation(client1, 1);
-  ASSERT_FALSE(reserved);
-
-  // Allocate all of client2's reserved blocks, these should succeed.
-  vector<BufferedBlockMgr::Block*> client2_blocks;
-  AllocateBlocks(block_mgr, client2, client2_buffers, &client2_blocks);
-
-  // Try allocating one more from client 2, that should fail.
-  ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
-  ASSERT_TRUE(block == NULL);
-
-  // Unpin one block from client 1.
-  ASSERT_OK(client1_blocks[0]->Unpin());
-
-  // Client 2 should still not be able to allocate.
-  ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
-  ASSERT_TRUE(block == NULL);
-
-  // Client 2 should still not be able to reserve.
-  reserved = block_mgr->TryAcquireTmpReservation(client2, 1);
-  ASSERT_FALSE(reserved);
-
-  // Client 1 should be able to though.
-  ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
-  ASSERT_TRUE(block != NULL);
-  client1_blocks.push_back(block);
-
-  // Unpin two of client 1's blocks (client 1 should have 3 unpinned blocks now).
-  ASSERT_OK(client1_blocks[1]->Unpin());
-  ASSERT_OK(client1_blocks[2]->Unpin());
-
-  // Clear client 1's reservation
-  block_mgr->ClearReservations(client1);
-
-  // Client 2 should be able to reserve 1 buffers now (there are 2 left);
-  reserved = block_mgr->TryAcquireTmpReservation(client2, 1);
-  ASSERT_TRUE(reserved);
-
-  // Client one can only pin 1.
-  bool pinned;
-  ASSERT_OK(client1_blocks[0]->Pin(&pinned));
-  ASSERT_TRUE(pinned);
-  // Can't get this one.
-  ASSERT_OK(client1_blocks[1]->Pin(&pinned));
-  ASSERT_FALSE(pinned);
-
-  // Client 2 can pick up the one reserved buffer
-  ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
-  ASSERT_TRUE(block != NULL);
-  client2_blocks.push_back(block);
-
-  // But not a second
-  BufferedBlockMgr::Block* block2;
-  ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block2));
-  ASSERT_TRUE(block2 == NULL);
-
-  // Unpin client 2's block it got from the reservation. Sine this is a tmp
-  // reservation, client 1 can pick it up again (it is not longer reserved).
-  ASSERT_OK(block->Unpin());
-  ASSERT_OK(client1_blocks[1]->Pin(&pinned));
-  ASSERT_TRUE(pinned);
-
-  DeleteBlocks(client1_blocks);
-  DeleteBlocks(client2_blocks);
-  TearDownMgrs();
-}
-
-// Create two clients with different number of reserved buffers and some additional.
-TEST_F(BufferedBlockMgrTest, MultipleClientsExtraBuffers) {
-  int client1_buffers = 1;
-  int client2_buffers = 1;
-  int max_num_buffers = client1_buffers + client2_buffers + 2;
-  const int block_size = 1024;
-  RuntimeState* runtime_state;
-  BufferedBlockMgr* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
-  BufferedBlockMgr::Client* client1 = NULL;
-  BufferedBlockMgr::Client* client2 = NULL;
-  BufferedBlockMgr::Block* block = NULL;
-  ASSERT_OK(block_mgr->RegisterClient("", client1_buffers, false,
-      NewClientTracker(runtime_state), runtime_state, &client1));
-  ASSERT_TRUE(client1 != NULL);
-  ASSERT_OK(block_mgr->RegisterClient("", client2_buffers, false,
-      NewClientTracker(runtime_state), runtime_state, &client2));
-  ASSERT_TRUE(client2 != NULL);
-
-  vector<BufferedBlockMgr::Block*> client1_blocks;
-  // Allocate all of client1's reserved blocks, they should all succeed.
-  AllocateBlocks(block_mgr, client1, client1_buffers, &client1_blocks);
-
-  // Allocate all of client2's reserved blocks, these should succeed.
-  vector<BufferedBlockMgr::Block*> client2_blocks;
-  AllocateBlocks(block_mgr, client2, client2_buffers, &client2_blocks);
-
-  // We have two spare buffers now. Each client should be able to allocate it.
-  ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
-  ASSERT_TRUE(block != NULL);
-  client1_blocks.push_back(block);
-  ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
-  ASSERT_TRUE(block != NULL);
-  client2_blocks.push_back(block);
-
-  // Now we are completely full, no one should be able to allocate a new block.
-  ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
-  ASSERT_TRUE(block == NULL);
-  ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
-  ASSERT_TRUE(block == NULL);
-
-  DeleteBlocks(client1_blocks);
-  DeleteBlocks(client2_blocks);
-  TearDownMgrs();
-}
-
-// Create multiple clients causing oversubscription.
-TEST_F(BufferedBlockMgrTest, ClientOversubscription) {
-  Status status;
-  int client1_buffers = 1;
-  int client2_buffers = 2;
-  int client3_buffers = 2;
-  int max_num_buffers = 2;
-  const int block_size = 1024;
-  RuntimeState* runtime_state;
-  BufferedBlockMgr* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-  vector<BufferedBlockMgr::Block*> blocks;
-
-  BufferedBlockMgr::Client* client1 = NULL;
-  BufferedBlockMgr::Client* client2 = NULL;
-  BufferedBlockMgr::Client* client3 = NULL;
-  BufferedBlockMgr::Block* block = NULL;
-  ASSERT_OK(block_mgr->RegisterClient("", client1_buffers, false,
-      NewClientTracker(runtime_state), runtime_state, &client1));
-  ASSERT_TRUE(client1 != NULL);
-  ASSERT_OK(block_mgr->RegisterClient("", client2_buffers, false,
-      NewClientTracker(runtime_state), runtime_state, &client2));
-  ASSERT_TRUE(client2 != NULL);
-  ASSERT_OK(block_mgr->RegisterClient("", client3_buffers, true,
-      NewClientTracker(runtime_state), runtime_state, &client3));
-  ASSERT_TRUE(client3 != NULL);
-
-  // Client one allocates first block, should work.
-  ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
-  ASSERT_TRUE(block != NULL);
-  blocks.push_back(block);
-
-  // Client two allocates first block, should work.
-  ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
-  ASSERT_TRUE(block != NULL);
-  blocks.push_back(block);
-
-  // At this point we've used both buffers. Client one reserved one so subsequent
-  // calls should fail with no error (but returns no block).
-  ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
-  ASSERT_TRUE(block == NULL);
-
-  // Allocate with client two. Since client two reserved 2 buffers, this should fail
-  // with MEM_LIMIT_EXCEEDED.
-  ASSERT_TRUE(block_mgr->GetNewBlock(client2, NULL, &block).IsMemLimitExceeded());
-
-  // Allocate with client three. Since client three can tolerate oversubscription,
-  // this should fail with no error even though it was a reserved request.
-  ASSERT_OK(block_mgr->GetNewBlock(client3, NULL, &block));
-  ASSERT_TRUE(block == NULL);
-
-  DeleteBlocks(blocks);
-  TearDownMgrs();
-}
-
-TEST_F(BufferedBlockMgrTest, SingleRandom_plain) {
-  FLAGS_disk_spill_encryption = false;
-  TestRandomInternalSingle(1024);
-  TestRandomInternalSingle(8 * 1024);
-  TestRandomInternalSingle(8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi2Random_plain) {
-  FLAGS_disk_spill_encryption = false;
-  TestRandomInternalMulti(2, 1024);
-  TestRandomInternalMulti(2, 8 * 1024);
-  TestRandomInternalMulti(2, 8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi4Random_plain) {
-  FLAGS_disk_spill_encryption = false;
-  TestRandomInternalMulti(4, 1024);
-  TestRandomInternalMulti(4, 8 * 1024);
-  TestRandomInternalMulti(4, 8 * 1024 * 1024);
-}
-
-// TODO: Enable when we improve concurrency/scalability of block mgr.
-// TEST_F(BufferedBlockMgrTest, Multi8Random_plain) {
-//   FLAGS_disk_spill_encryption = false;
-//   TestRandomInternalMulti(8);
-// }
-
-TEST_F(BufferedBlockMgrTest, SingleRandom_encryption) {
-  FLAGS_disk_spill_encryption = true;
-  TestRandomInternalSingle(8 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi2Random_encryption) {
-  FLAGS_disk_spill_encryption = true;
-  TestRandomInternalMulti(2, 8 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi4Random_encryption) {
-  FLAGS_disk_spill_encryption = true;
-  TestRandomInternalMulti(4, 8 * 1024);
-}
-
-// TODO: Enable when we improve concurrency/scalability of block mgr.
-// TEST_F(BufferedBlockMgrTest, Multi8Random_encryption) {
-//   FLAGS_disk_spill_encryption = true;
-//   TestRandomInternalMulti(8);
-// }
-
-
-TEST_F(BufferedBlockMgrTest, CreateDestroyMulti) {
-  CreateDestroyMulti();
-}
-
-}
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
-  impala::InitFeSupport();
-  impala::LlvmCodeGen::InitializeLlvm();
-  return RUN_ALL_TESTS();
-}