You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/08/05 03:18:12 UTC
[01/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Repository: incubator-impala
Updated Branches:
refs/heads/master d5b0c6b93 -> a98b90bd3
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test b/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
index 58fe1bf..920195b 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/spillable-buffer-sizing.test
@@ -21,7 +21,7 @@ Per-Host Resources: mem-estimate=24.00MB mem-reservation=1.06MB
| hash predicates: c_nationkey = n_nationkey
| fk/pk conjuncts: c_nationkey = n_nationkey
| runtime filters: RF000 <- n_nationkey
-| mem-estimate=3.15KB mem-reservation=1.06MB
+| mem-estimate=3.15KB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=0,1 row-size=355B cardinality=150000
|
|--03:EXCHANGE [BROADCAST]
@@ -66,7 +66,7 @@ Per-Host Resources: mem-estimate=48.01MB mem-reservation=2.12MB
| hash predicates: c_nationkey = n_nationkey
| fk/pk conjuncts: c_nationkey = n_nationkey
| runtime filters: RF000 <- n_nationkey
-| mem-estimate=3.15KB mem-reservation=1.06MB
+| mem-estimate=3.15KB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=0,1 row-size=355B cardinality=150000
|
|--F03:PLAN FRAGMENT [RANDOM] hosts=1 instances=2
@@ -104,7 +104,7 @@ select straight_join *
from tpch_parquet.lineitem
left join tpch_parquet.orders on l_orderkey = o_orderkey
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=420.41MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -117,11 +117,11 @@ PLAN-ROOT SINK
| tuple-ids=0,1N row-size=454B cardinality=6001215
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=380.41MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=380.41MB mem-reservation=34.00MB
02:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
-| mem-estimate=300.41MB mem-reservation=136.00MB
+| mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1N row-size=454B cardinality=6001215
|
|--03:EXCHANGE [BROADCAST]
@@ -146,7 +146,7 @@ Per-Host Resources: mem-estimate=380.41MB mem-reservation=136.00MB
mem-estimate=80.00MB mem-reservation=0B
tuple-ids=0 row-size=263B cardinality=6001215
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
Per-Host Resource Estimates: Memory=840.83MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -159,12 +159,12 @@ PLAN-ROOT SINK
| tuple-ids=0,1N row-size=454B cardinality=6001215
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=760.83MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=760.83MB mem-reservation=68.00MB
02:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
| hash-table-id=00
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
-| mem-estimate=300.41MB mem-reservation=136.00MB
+| mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1N row-size=454B cardinality=6001215
|
|--F03:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -219,7 +219,7 @@ Per-Host Resources: mem-estimate=18.69MB mem-reservation=34.00MB
| hash predicates: o_custkey = c_custkey
| fk/pk conjuncts: o_custkey = c_custkey
| runtime filters: RF000 <- c_custkey
-| mem-estimate=18.69MB mem-reservation=34.00MB
+| mem-estimate=18.69MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=428B cardinality=1500000
|
|--04:EXCHANGE [HASH(c_custkey)]
@@ -270,7 +270,7 @@ Per-Host Resources: mem-estimate=18.69MB mem-reservation=34.00MB
| hash predicates: o_custkey = c_custkey
| fk/pk conjuncts: o_custkey = c_custkey
| runtime filters: RF000 <- c_custkey
-| mem-estimate=9.35MB mem-reservation=17.00MB
+| mem-estimate=9.35MB mem-reservation=17.00MB spill-buffer=1.00MB
| tuple-ids=0,1 row-size=428B cardinality=1500000
|
|--F04:PLAN FRAGMENT [HASH(o_custkey)] hosts=1 instances=2
@@ -314,7 +314,7 @@ select straight_join *
from tpch_parquet.orders
join /*+broadcast*/ tpch_parquet.customer on o_custkey = c_custkey
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=68.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=101.38MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -327,12 +327,12 @@ PLAN-ROOT SINK
| tuple-ids=0,1 row-size=428B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=2
-Per-Host Resources: mem-estimate=77.38MB mem-reservation=68.00MB
+Per-Host Resources: mem-estimate=77.38MB mem-reservation=34.00MB
02:HASH JOIN [INNER JOIN, BROADCAST]
| hash predicates: o_custkey = c_custkey
| fk/pk conjuncts: o_custkey = c_custkey
| runtime filters: RF000 <- c_custkey
-| mem-estimate=37.38MB mem-reservation=68.00MB
+| mem-estimate=37.38MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=428B cardinality=1500000
|
|--03:EXCHANGE [BROADCAST]
@@ -358,7 +358,7 @@ Per-Host Resources: mem-estimate=77.38MB mem-reservation=68.00MB
mem-estimate=40.00MB mem-reservation=0B
tuple-ids=0 row-size=191B cardinality=1500000
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=68.00MB
Per-Host Resource Estimates: Memory=202.76MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -371,13 +371,13 @@ PLAN-ROOT SINK
| tuple-ids=0,1 row-size=428B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
-Per-Host Resources: mem-estimate=154.76MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=154.76MB mem-reservation=68.00MB
02:HASH JOIN [INNER JOIN, BROADCAST]
| hash-table-id=00
| hash predicates: o_custkey = c_custkey
| fk/pk conjuncts: o_custkey = c_custkey
| runtime filters: RF000 <- c_custkey
-| mem-estimate=37.38MB mem-reservation=68.00MB
+| mem-estimate=37.38MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=428B cardinality=1500000
|
|--F03:PLAN FRAGMENT [RANDOM] hosts=1 instances=2
@@ -415,7 +415,7 @@ select straight_join *
from functional_parquet.alltypes
left join functional_parquet.alltypestiny on alltypes.id = alltypestiny.id
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=2.03GB
WARNING: The following tables are missing relevant table and/or column statistics.
functional_parquet.alltypes, functional_parquet.alltypestiny
@@ -430,11 +430,11 @@ PLAN-ROOT SINK
| tuple-ids=0,1N row-size=176B cardinality=unavailable
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=2.02GB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=2.02GB mem-reservation=34.00MB
02:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
| hash predicates: alltypes.id = alltypestiny.id
| fk/pk conjuncts: assumed fk/pk
-| mem-estimate=2.00GB mem-reservation=136.00MB
+| mem-estimate=2.00GB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1N row-size=176B cardinality=unavailable
|
|--03:EXCHANGE [BROADCAST]
@@ -459,7 +459,7 @@ Per-Host Resources: mem-estimate=2.02GB mem-reservation=136.00MB
mem-estimate=16.00MB mem-reservation=0B
tuple-ids=0 row-size=88B cardinality=unavailable
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
Per-Host Resource Estimates: Memory=4.06GB
WARNING: The following tables are missing relevant table and/or column statistics.
functional_parquet.alltypestiny
@@ -474,12 +474,12 @@ PLAN-ROOT SINK
| tuple-ids=0,1N row-size=176B cardinality=unavailable
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=4.03GB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=4.03GB mem-reservation=68.00MB
02:HASH JOIN [LEFT OUTER JOIN, BROADCAST]
| hash-table-id=00
| hash predicates: alltypes.id = alltypestiny.id
| fk/pk conjuncts: assumed fk/pk
-| mem-estimate=2.00GB mem-reservation=136.00MB
+| mem-estimate=2.00GB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1N row-size=176B cardinality=unavailable
|
|--F03:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
@@ -516,7 +516,7 @@ select c_nationkey, avg(c_acctbal)
from tpch_parquet.customer
group by c_nationkey
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=2.12MB
+Per-Host Resource Reservation: Memory=1.12MB
Per-Host Resource Estimates: Memory=44.00MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -529,11 +529,11 @@ PLAN-ROOT SINK
| tuple-ids=2 row-size=10B cardinality=25
|
F01:PLAN FRAGMENT [HASH(c_nationkey)] hosts=1 instances=1
-Per-Host Resources: mem-estimate=10.00MB mem-reservation=2.12MB
+Per-Host Resources: mem-estimate=10.00MB mem-reservation=1.12MB
03:AGGREGATE [FINALIZE]
| output: avg:merge(c_acctbal)
| group by: c_nationkey
-| mem-estimate=10.00MB mem-reservation=2.12MB
+| mem-estimate=10.00MB mem-reservation=1.12MB spill-buffer=64.00KB
| tuple-ids=2 row-size=10B cardinality=25
|
02:EXCHANGE [HASH(c_nationkey)]
@@ -545,7 +545,7 @@ Per-Host Resources: mem-estimate=34.00MB mem-reservation=0B
01:AGGREGATE [STREAMING]
| output: avg(c_acctbal)
| group by: c_nationkey
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=10B cardinality=25
|
00:SCAN HDFS [tpch_parquet.customer, RANDOM]
@@ -556,7 +556,7 @@ Per-Host Resources: mem-estimate=34.00MB mem-reservation=0B
mem-estimate=24.00MB mem-reservation=0B
tuple-ids=0 row-size=10B cardinality=150000
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=4.25MB
+Per-Host Resource Reservation: Memory=2.25MB
Per-Host Resource Estimates: Memory=88.00MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -569,11 +569,11 @@ PLAN-ROOT SINK
| tuple-ids=2 row-size=10B cardinality=25
|
F01:PLAN FRAGMENT [HASH(c_nationkey)] hosts=1 instances=2
-Per-Host Resources: mem-estimate=20.00MB mem-reservation=4.25MB
+Per-Host Resources: mem-estimate=20.00MB mem-reservation=2.25MB
03:AGGREGATE [FINALIZE]
| output: avg:merge(c_acctbal)
| group by: c_nationkey
-| mem-estimate=10.00MB mem-reservation=2.12MB
+| mem-estimate=10.00MB mem-reservation=1.12MB spill-buffer=64.00KB
| tuple-ids=2 row-size=10B cardinality=25
|
02:EXCHANGE [HASH(c_nationkey)]
@@ -585,7 +585,7 @@ Per-Host Resources: mem-estimate=68.00MB mem-reservation=0B
01:AGGREGATE [STREAMING]
| output: avg(c_acctbal)
| group by: c_nationkey
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=10B cardinality=25
|
00:SCAN HDFS [tpch_parquet.customer, RANDOM]
@@ -603,7 +603,7 @@ from tpch_parquet.lineitem
group by 1, 2
having count(*) = 1
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=83.00MB
+Per-Host Resource Reservation: Memory=51.00MB
Per-Host Resource Estimates: Memory=205.28MB
F04:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -616,12 +616,12 @@ PLAN-ROOT SINK
| tuple-ids=2 row-size=33B cardinality=4690314
|
F03:PLAN FRAGMENT [HASH(l_orderkey,o_orderstatus)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=18.04MB mem-reservation=66.00MB
+Per-Host Resources: mem-estimate=18.04MB mem-reservation=34.00MB
07:AGGREGATE [FINALIZE]
| output: count:merge(*)
| group by: l_orderkey, o_orderstatus
| having: count(*) = 1
-| mem-estimate=18.04MB mem-reservation=66.00MB
+| mem-estimate=18.04MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=2 row-size=33B cardinality=4690314
|
06:EXCHANGE [HASH(l_orderkey,o_orderstatus)]
@@ -633,14 +633,14 @@ Per-Host Resources: mem-estimate=67.24MB mem-reservation=17.00MB
03:AGGREGATE [STREAMING]
| output: count(*)
| group by: l_orderkey, o_orderstatus
-| mem-estimate=54.12MB mem-reservation=0B
+| mem-estimate=54.12MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=2 row-size=33B cardinality=4690314
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=13.11MB mem-reservation=17.00MB
+| mem-estimate=13.11MB mem-reservation=17.00MB spill-buffer=1.00MB
| tuple-ids=0,1 row-size=33B cardinality=5757710
|
|--05:EXCHANGE [HASH(o_orderkey)]
@@ -672,7 +672,7 @@ Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
mem-estimate=80.00MB mem-reservation=0B
tuple-ids=0 row-size=8B cardinality=6001215
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=83.00MB
+Per-Host Resource Reservation: Memory=51.00MB
Per-Host Resource Estimates: Memory=327.24MB
F04:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -685,12 +685,12 @@ PLAN-ROOT SINK
| tuple-ids=2 row-size=33B cardinality=4690314
|
F03:PLAN FRAGMENT [HASH(l_orderkey,o_orderstatus)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=20.00MB mem-reservation=66.00MB
+Per-Host Resources: mem-estimate=20.00MB mem-reservation=34.00MB
07:AGGREGATE [FINALIZE]
| output: count:merge(*)
| group by: l_orderkey, o_orderstatus
| having: count(*) = 1
-| mem-estimate=10.00MB mem-reservation=33.00MB
+| mem-estimate=10.00MB mem-reservation=17.00MB spill-buffer=1.00MB
| tuple-ids=2 row-size=33B cardinality=4690314
|
06:EXCHANGE [HASH(l_orderkey,o_orderstatus)]
@@ -702,7 +702,7 @@ Per-Host Resources: mem-estimate=67.24MB mem-reservation=17.00MB
03:AGGREGATE [STREAMING]
| output: count(*)
| group by: l_orderkey, o_orderstatus
-| mem-estimate=27.06MB mem-reservation=0B
+| mem-estimate=27.06MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=2 row-size=33B cardinality=4690314
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
@@ -710,7 +710,7 @@ Per-Host Resources: mem-estimate=67.24MB mem-reservation=17.00MB
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=6.56MB mem-reservation=8.50MB
+| mem-estimate=6.56MB mem-reservation=8.50MB spill-buffer=512.00KB
| tuple-ids=0,1 row-size=33B cardinality=5757710
|
|--F05:PLAN FRAGMENT [HASH(l_orderkey)] hosts=2 instances=4
@@ -753,7 +753,7 @@ Per-Host Resources: mem-estimate=160.00MB mem-reservation=0B
select distinct *
from tpch_parquet.lineitem
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=264.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=3.31GB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -766,10 +766,10 @@ PLAN-ROOT SINK
| tuple-ids=1 row-size=263B cardinality=6001215
|
F01:PLAN FRAGMENT [HASH(tpch_parquet.lineitem.l_orderkey,tpch_parquet.lineitem.l_partkey,tpch_parquet.lineitem.l_suppkey,tpch_parquet.lineitem.l_linenumber,tpch_parquet.lineitem.l_quantity,tpch_parquet.lineitem.l_extendedprice,tpch_parquet.lineitem.l_discount,tpch_parquet.lineitem.l_tax,tpch_parquet.lineitem.l_returnflag,tpch_parquet.lineitem.l_linestatus,tpch_parquet.lineitem.l_shipdate,tpch_parquet.lineitem.l_commitdate,tpch_parquet.lineitem.l_receiptdate,tpch_parquet.lineitem.l_shipinstruct,tpch_parquet.lineitem.l_shipmode,tpch_parquet.lineitem.l_comment)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=1.62GB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=1.62GB mem-reservation=34.00MB
03:AGGREGATE [FINALIZE]
| group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_partkey, tpch_parquet.lineitem.l_suppkey, tpch_parquet.lineitem.l_linenumber, tpch_parquet.lineitem.l_quantity, tpch_parquet.lineitem.l_extendedprice, tpch_parquet.lineitem.l_discount, tpch_parquet.lineitem.l_tax, tpch_parquet.lineitem.l_returnflag, tpch_parquet.lineitem.l_linestatus, tpch_parquet.lineitem.l_shipdate, tpch_parquet.lineitem.l_commitdate, tpch_parquet.lineitem.l_receiptdate, tpch_parquet.lineitem.l_shipinstruct, tpch_parquet.lineitem.l_shipmode, tpch_parquet.lineitem.l_comment
-| mem-estimate=1.62GB mem-reservation=264.00MB
+| mem-estimate=1.62GB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=263B cardinality=6001215
|
02:EXCHANGE [HASH(tpch_parquet.lineitem.l_orderkey,tpch_parquet.lineitem.l_partkey,tpch_parquet.lineitem.l_suppkey,tpch_parquet.lineitem.l_linenumber,tpch_parquet.lineitem.l_quantity,tpch_parquet.lineitem.l_extendedprice,tpch_parquet.lineitem.l_discount,tpch_parquet.lineitem.l_tax,tpch_parquet.lineitem.l_returnflag,tpch_parquet.lineitem.l_linestatus,tpch_parquet.lineitem.l_shipdate,tpch_parquet.lineitem.l_commitdate,tpch_parquet.lineitem.l_receiptdate,tpch_parquet.lineitem.l_shipinstruct,tpch_parquet.lineitem.l_shipmode,tpch_parquet.lineitem.l_comment)]
@@ -780,7 +780,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
Per-Host Resources: mem-estimate=1.69GB mem-reservation=0B
01:AGGREGATE [STREAMING]
| group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_partkey, tpch_parquet.lineitem.l_suppkey, tpch_parquet.lineitem.l_linenumber, tpch_parquet.lineitem.l_quantity, tpch_parquet.lineitem.l_extendedprice, tpch_parquet.lineitem.l_discount, tpch_parquet.lineitem.l_tax, tpch_parquet.lineitem.l_returnflag, tpch_parquet.lineitem.l_linestatus, tpch_parquet.lineitem.l_shipdate, tpch_parquet.lineitem.l_commitdate, tpch_parquet.lineitem.l_receiptdate, tpch_parquet.lineitem.l_shipinstruct, tpch_parquet.lineitem.l_shipmode, tpch_parquet.lineitem.l_comment
-| mem-estimate=1.62GB mem-reservation=0B
+| mem-estimate=1.62GB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=263B cardinality=6001215
|
00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -791,7 +791,7 @@ Per-Host Resources: mem-estimate=1.69GB mem-reservation=0B
mem-estimate=80.00MB mem-reservation=0B
tuple-ids=0 row-size=263B cardinality=6001215
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=528.00MB
+Per-Host Resource Reservation: Memory=68.00MB
Per-Host Resource Estimates: Memory=6.62GB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -804,10 +804,10 @@ PLAN-ROOT SINK
| tuple-ids=1 row-size=263B cardinality=6001215
|
F01:PLAN FRAGMENT [HASH(tpch_parquet.lineitem.l_orderkey,tpch_parquet.lineitem.l_partkey,tpch_parquet.lineitem.l_suppkey,tpch_parquet.lineitem.l_linenumber,tpch_parquet.lineitem.l_quantity,tpch_parquet.lineitem.l_extendedprice,tpch_parquet.lineitem.l_discount,tpch_parquet.lineitem.l_tax,tpch_parquet.lineitem.l_returnflag,tpch_parquet.lineitem.l_linestatus,tpch_parquet.lineitem.l_shipdate,tpch_parquet.lineitem.l_commitdate,tpch_parquet.lineitem.l_receiptdate,tpch_parquet.lineitem.l_shipinstruct,tpch_parquet.lineitem.l_shipmode,tpch_parquet.lineitem.l_comment)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=3.23GB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=3.23GB mem-reservation=68.00MB
03:AGGREGATE [FINALIZE]
| group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_partkey, tpch_parquet.lineitem.l_suppkey, tpch_parquet.lineitem.l_linenumber, tpch_parquet.lineitem.l_quantity, tpch_parquet.lineitem.l_extendedprice, tpch_parquet.lineitem.l_discount, tpch_parquet.lineitem.l_tax, tpch_parquet.lineitem.l_returnflag, tpch_parquet.lineitem.l_linestatus, tpch_parquet.lineitem.l_shipdate, tpch_parquet.lineitem.l_commitdate, tpch_parquet.lineitem.l_receiptdate, tpch_parquet.lineitem.l_shipinstruct, tpch_parquet.lineitem.l_shipmode, tpch_parquet.lineitem.l_comment
-| mem-estimate=1.62GB mem-reservation=264.00MB
+| mem-estimate=1.62GB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=263B cardinality=6001215
|
02:EXCHANGE [HASH(tpch_parquet.lineitem.l_orderkey,tpch_parquet.lineitem.l_partkey,tpch_parquet.lineitem.l_suppkey,tpch_parquet.lineitem.l_linenumber,tpch_parquet.lineitem.l_quantity,tpch_parquet.lineitem.l_extendedprice,tpch_parquet.lineitem.l_discount,tpch_parquet.lineitem.l_tax,tpch_parquet.lineitem.l_returnflag,tpch_parquet.lineitem.l_linestatus,tpch_parquet.lineitem.l_shipdate,tpch_parquet.lineitem.l_commitdate,tpch_parquet.lineitem.l_receiptdate,tpch_parquet.lineitem.l_shipinstruct,tpch_parquet.lineitem.l_shipmode,tpch_parquet.lineitem.l_comment)]
@@ -818,7 +818,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
Per-Host Resources: mem-estimate=3.39GB mem-reservation=0B
01:AGGREGATE [STREAMING]
| group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_partkey, tpch_parquet.lineitem.l_suppkey, tpch_parquet.lineitem.l_linenumber, tpch_parquet.lineitem.l_quantity, tpch_parquet.lineitem.l_extendedprice, tpch_parquet.lineitem.l_discount, tpch_parquet.lineitem.l_tax, tpch_parquet.lineitem.l_returnflag, tpch_parquet.lineitem.l_linestatus, tpch_parquet.lineitem.l_shipdate, tpch_parquet.lineitem.l_commitdate, tpch_parquet.lineitem.l_receiptdate, tpch_parquet.lineitem.l_shipinstruct, tpch_parquet.lineitem.l_shipmode, tpch_parquet.lineitem.l_comment
-| mem-estimate=1.62GB mem-reservation=0B
+| mem-estimate=1.62GB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=263B cardinality=6001215
|
00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -834,7 +834,7 @@ select string_col, count(*)
from functional_parquet.alltypestiny
group by string_col
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=264.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=272.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
functional_parquet.alltypestiny
@@ -849,11 +849,11 @@ PLAN-ROOT SINK
| tuple-ids=1 row-size=24B cardinality=unavailable
|
F01:PLAN FRAGMENT [HASH(string_col)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=128.00MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=128.00MB mem-reservation=34.00MB
03:AGGREGATE [FINALIZE]
| output: count:merge(*)
| group by: string_col
-| mem-estimate=128.00MB mem-reservation=264.00MB
+| mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=24B cardinality=unavailable
|
02:EXCHANGE [HASH(string_col)]
@@ -865,7 +865,7 @@ Per-Host Resources: mem-estimate=144.00MB mem-reservation=0B
01:AGGREGATE [STREAMING]
| output: count(*)
| group by: string_col
-| mem-estimate=128.00MB mem-reservation=0B
+| mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=24B cardinality=unavailable
|
00:SCAN HDFS [functional_parquet.alltypestiny, RANDOM]
@@ -876,7 +876,7 @@ Per-Host Resources: mem-estimate=144.00MB mem-reservation=0B
mem-estimate=16.00MB mem-reservation=0B
tuple-ids=0 row-size=16B cardinality=unavailable
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=528.00MB
+Per-Host Resource Reservation: Memory=68.00MB
Per-Host Resource Estimates: Memory=544.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
functional_parquet.alltypestiny
@@ -891,11 +891,11 @@ PLAN-ROOT SINK
| tuple-ids=1 row-size=24B cardinality=unavailable
|
F01:PLAN FRAGMENT [HASH(string_col)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=256.00MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=256.00MB mem-reservation=68.00MB
03:AGGREGATE [FINALIZE]
| output: count:merge(*)
| group by: string_col
-| mem-estimate=128.00MB mem-reservation=264.00MB
+| mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=24B cardinality=unavailable
|
02:EXCHANGE [HASH(string_col)]
@@ -907,7 +907,7 @@ Per-Host Resources: mem-estimate=288.00MB mem-reservation=0B
01:AGGREGATE [STREAMING]
| output: count(*)
| group by: string_col
-| mem-estimate=128.00MB mem-reservation=0B
+| mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=24B cardinality=unavailable
|
00:SCAN HDFS [functional_parquet.alltypestiny, RANDOM]
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test b/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
index 868d6ca..4c208a4 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
@@ -154,14 +154,14 @@ select id from functional.alltypes t1 where exists (
where t1.id = t2.id)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=160.00MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=160.00MB mem-reservation=1.06MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
02:HASH JOIN [LEFT SEMI JOIN]
| hash predicates: t1.id = t2.id
| runtime filters: RF000 <- t2.id
-| mem-estimate=44B mem-reservation=136.00MB
+| mem-estimate=44B mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=0 row-size=4B cardinality=10
|
|--01:SCAN HDFS [functional.alltypessmall t2]
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test b/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
index e697914..27459ef 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
@@ -1583,18 +1583,20 @@ from functional.alltypestiny order by id
int, bigint, bigint, double
====
---- QUERY
-# Regression test for IMPALA-2265, IMPALA-2559. The max_block_mgr_memory is tuned to
+# Regression test for IMPALA-2265, IMPALA-2559. The buffer_pool_limit is tuned to
# reproduce the issue when running this query against functional_parquet.
-SET max_block_mgr_memory=16m;
+SET default_spillable_buffer_size=8m;
+SET buffer_pool_limit=16m;
SELECT lag(-180, 13) over (ORDER BY t1.int_col ASC, t2.int_col ASC) AS int_col
FROM functional_parquet.alltypes t1 CROSS JOIN functional_parquet.alltypes t2 LIMIT 10;
---- CATCH
-Memory limit exceeded
+Failed to get minimum memory reservation
====
---- QUERY
# Check that the above query can succeed with the minimum buffers (3 buffers for sort,
-# 1 buffer for analytic).
-SET max_block_mgr_memory=32m;
+# 2 buffers for analytic).
+SET default_spillable_buffer_size=8m;
+SET buffer_pool_limit=40m;
SELECT lag(-180, 13) over (ORDER BY t1.int_col ASC, t2.int_col ASC) AS int_col
FROM functional_parquet.alltypes t1 CROSS JOIN functional_parquet.alltypes t2 LIMIT 10;
---- TYPES
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
index 64f9b45..122d928 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level0.test
@@ -5,7 +5,7 @@ explain
select *
from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
---- RESULTS: VERIFY_IS_EQUAL
-'Per-Host Resource Reservation: Memory=136.00MB'
+'Per-Host Resource Reservation: Memory=34.00MB'
'Per-Host Resource Estimates: Memory=476.41MB'
''
'PLAN-ROOT SINK'
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
index f59962c..475758d 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level1.test
@@ -5,7 +5,7 @@ explain
select *
from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
---- RESULTS: VERIFY_IS_EQUAL
-'Per-Host Resource Reservation: Memory=136.00MB'
+'Per-Host Resource Reservation: Memory=34.00MB'
'Per-Host Resource Estimates: Memory=476.41MB'
''
'PLAN-ROOT SINK'
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
index 2736543..2fa7576 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
@@ -5,7 +5,7 @@ explain
select *
from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
---- RESULTS: VERIFY_IS_EQUAL
-'Per-Host Resource Reservation: Memory=136.00MB'
+'Per-Host Resource Reservation: Memory=34.00MB'
'Per-Host Resource Estimates: Memory=476.41MB'
''
'F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
@@ -18,12 +18,12 @@ from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
'| tuple-ids=0,1 row-size=454B cardinality=5757710'
'|'
'F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3'
-'Per-Host Resources: mem-estimate=388.41MB mem-reservation=136.00MB'
+'Per-Host Resources: mem-estimate=388.41MB mem-reservation=34.00MB'
'02:HASH JOIN [INNER JOIN, BROADCAST]'
'| hash predicates: l_orderkey = o_orderkey'
'| fk/pk conjuncts: l_orderkey = o_orderkey'
'| runtime filters: RF000 <- o_orderkey'
-'| mem-estimate=300.41MB mem-reservation=136.00MB'
+'| mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB'
'| tuple-ids=0,1 row-size=454B cardinality=5757710'
'|'
'|--03:EXCHANGE [BROADCAST]'
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test b/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
index 31f4f5b..76d74ce 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/explain-level3.test
@@ -5,7 +5,7 @@ explain
select *
from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
---- RESULTS: VERIFY_IS_EQUAL
-'Per-Host Resource Reservation: Memory=136.00MB'
+'Per-Host Resource Reservation: Memory=34.00MB'
'Per-Host Resource Estimates: Memory=476.41MB'
''
'F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
@@ -18,14 +18,14 @@ from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
' tuple-ids=0,1 row-size=454B cardinality=5757710'
''
'F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3'
-'Per-Host Resources: mem-estimate=388.41MB mem-reservation=136.00MB'
+'Per-Host Resources: mem-estimate=388.41MB mem-reservation=34.00MB'
' DATASTREAM SINK [FRAGMENT=F02, EXCHANGE=04, UNPARTITIONED]'
' | mem-estimate=0B mem-reservation=0B'
' 02:HASH JOIN [INNER JOIN, BROADCAST]'
' | hash predicates: l_orderkey = o_orderkey'
' | fk/pk conjuncts: l_orderkey = o_orderkey'
' | runtime filters: RF000 <- o_orderkey'
-' | mem-estimate=300.41MB mem-reservation=136.00MB'
+' | mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB'
' | tuple-ids=0,1 row-size=454B cardinality=5757710'
' |'
' |--03:EXCHANGE [BROADCAST]'
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test b/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test
index 626b315..c8a80b2 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/nested-types-tpch.test
@@ -234,11 +234,10 @@ order by c_custkey
bigint, bigint
====
---- QUERY
-# IMPALA-5446: dropped status from Sorter::Reset() when sort cannot get reserved buffer.
-# This query is designed to allow the initial subplan iterations to succeed, but have
-# later iterations fail because the aggregation outside the subplan has accumulated all
-# the memory.
-set max_block_mgr_memory=100m;
+# This was originally a regression test for IMPALA-5446: dropped status from
+# Sorter::Reset() when sort cannot get reserved buffer. However with the
+# IMPALA-3200 changes it now succeeds.
+set buffer_pool_limit=100m;
select c_custkey, c_name, c_address, c_phone, c_acctbal, c_mktsegment, c_comment,
o_orderdate, sum(o_totalprice), min(rnum)
from customer c,
@@ -247,6 +246,17 @@ from customer c,
group by 1, 2, 3, 4, 5, 6, 7, 8
order by 9, 10 desc
limit 10
----- CATCH
-Memory limit exceeded: Query did not have enough memory to get the minimum required buffers in the block manager.
+---- RESULTS
+3115,'Customer#000003115','oB 75yHls7ptt5zCheWJLQ','22-291-864-7521',8889.56,'BUILDING','ts are quickly across the bold deposits. carefully spe','1998-04-23',857.71,3
+53551,'Customer#000053551','e,fT3URuJDH,tE6a6Z3Pjg0DZMFSqWbtYgd','15-429-275-5686',1137.38,'FURNITURE',' detect evenly along the blithely pending asymptotes. furiously even notornis detect carefu','1992-04-18',866.90,25
+64043,'Customer#000064043','Snyi GOB00','22-446-332-2750',4627.24,'FURNITURE','the quickly express asymptotes are around the pe','1992-01-31',870.88,11
+107698,'Customer#000107698','stUoykCwpTBAO3OC3lw','33-686-199-1188',698.89,'AUTOMOBILE',' accounts eat carefully express packages. slyly even id','1993-11-21',875.52,15
+1351,'Customer#000001351','NYMFfkNlCGoTeaDrNO9nn','11-916-210-6616',3106.00,'FURNITURE',' accounts after the final deposits sleep fluffily ironic accoun','1994-01-14',877.30,13
+85468,'Customer#000085468','EuFCX4qk4k0O4bV3UHoNVBTP','23-876-106-3120',8926.31,'AUTOMOBILE','kages. slyly even requests according to the ironic, ironic accounts cajole furiou','1997-04-12',884.52,4
+148522,'Customer#000148522','PIDMm8ulW4oam3VsoZL4f ,dpAf3LEV','16-597-824-4946',-133.27,'BUILDING','ly quickly express deposits. regularly regular requests cajole carefully slyly even noto','1995-03-20',885.75,12
+83222,'Customer#000083222','vI3tUuqtUYGPfrXAYeonVD9','27-599-263-5978',289.66,'BUILDING','ost quietly idle foxes. packages at the slyly pending pa','1993-05-02',891.74,5
+25090,'Customer#000025090','92GyVjZZiCBUmn','23-396-651-8663',8497.56,'BUILDING','osits. slyly final pinto beans sleep carefully fluffily express deposits. packages affix. carefully spe','1995-08-12',895.39,15
+27490,'Customer#000027490','jRzZQ1z7T,nrX5F58P,ZH','26-121-240-6744',7512.30,'AUTOMOBILE','slyly quickly even pinto beans: pend','1995-07-25',896.59,14
+---- TYPES
+bigint,string,string,string,decimal,string,string,string,decimal,bigint
====
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test b/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test
index 8c8f770..66391a5 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/runtime_row_filters_phj.test
@@ -8,7 +8,7 @@
# consumption / spilling behaviour.
####################################################
-SET MAX_BLOCK_MGR_MEMORY=275m;
+SET BUFFER_POOL_LIMIT=40m;
SET RUNTIME_FILTER_MODE=GLOBAL;
SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
SET RUNTIME_BLOOM_FILTER_SIZE=16M;
@@ -82,7 +82,8 @@ SET RUNTIME_FILTER_MODE=GLOBAL;
SET RUNTIME_FILTER_WAIT_TIME_MS=30000;
SET RUNTIME_FILTER_MIN_SIZE=128MB;
SET RUNTIME_FILTER_MAX_SIZE=500MB;
-SET MEM_LIMIT=140MB;
+# Allocate enough memory for the join + filter + scan
+SET MEM_LIMIT=170MB;
select STRAIGHT_JOIN * from alltypes a join [BROADCAST] alltypes b
on a.month = b.id and b.int_col = -3
---- RESULTS
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test b/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test
index d0ac79d..14ad2ed 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/single-node-joins-with-limits-exhaustive.test
@@ -16,7 +16,7 @@ row_regex: .*RowsProduced: 10.99..\W10995\W
# Test to verify that is limit_ is correctly enforced when
# output_batch is at AtCapacity.
set batch_size=6;
-set max_block_mgr_memory=180m;
+set buffer_pool_limit=180m;
select * from tpch.lineitem t1 full outer join tpch.lineitem t2 on
t1.l_orderkey = t2.l_orderkey limit 10;
---- RUNTIME_PROFILE
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test b/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test
index 74b7eee..93ed510 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test
@@ -36,7 +36,7 @@ row_regex: .* SpilledRuns: [^0].*
# Regression test for IMPALA-5554: first string column in sort tuple is null
# on boundary of spilled block. Test does two sorts with a NULL and non-NULL
# string column in both potential orders.
-set max_block_mgr_memory=50m;
+set buffer_pool_limit=50m;
select *
from (
select *, first_value(col) over (order by sort_col) fv
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-query/queries/QueryTest/spilling.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/spilling.test b/testdata/workloads/functional-query/queries/QueryTest/spilling.test
index 0f0e2ca..d8335c6 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/spilling.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/spilling.test
@@ -1,6 +1,6 @@
====
---- QUERY
-set max_block_mgr_memory=25m;
+set buffer_pool_limit=10m;
select l_orderkey, count(*)
from lineitem
group by 1
@@ -21,15 +21,12 @@ BIGINT, BIGINT
---- RUNTIME_PROFILE
# Verify that spilling and passthrough were activated.
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
-row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
====
---- QUERY
# Test query with string grouping column and string agg columns
-# Could only get it to spill reliably with num_nodes=1.
-# TODO: revisit with new buffer pool.
+set buffer_pool_limit=10m;
set num_nodes=1;
-set max_block_mgr_memory=25m;
select l_returnflag, l_orderkey, avg(l_tax), min(l_shipmode)
from lineitem
group by 1,2
@@ -45,7 +42,7 @@ row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
====
---- QUERY
-set max_block_mgr_memory=25m;
+set buffer_pool_limit=10m;
select l_orderkey, count(*)
from lineitem
group by 1
@@ -65,15 +62,12 @@ order by 1 limit 10;
BIGINT, BIGINT
---- RUNTIME_PROFILE
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
-row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
====
---- QUERY
# Test query with string grouping column
-# Could only get it to spill reliably with num_nodes=1.
-# TODO: revisit with new buffer pool.
+set buffer_pool_limit=10m;
set num_nodes=1;
-set max_block_mgr_memory=25m;
select l_comment, count(*)
from lineitem
group by 1
@@ -92,10 +86,8 @@ row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
====
---- QUERY
# Test query with string grouping column and string agg columns
-# Could only get it to spill reliably with num_nodes=1.
-# TODO: revisit with new buffer pool.
+set buffer_pool_limit=10m;
set num_nodes=1;
-set max_block_mgr_memory=25m;
select l_returnflag, l_orderkey, round(avg(l_tax),2), min(l_shipmode)
from lineitem
group by 1,2
@@ -113,7 +105,7 @@ row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
====
---- QUERY
# Test with string intermediate state (avg() uses string intermediate value).
-set max_block_mgr_memory=25m;
+set buffer_pool_limit=10m;
select l_orderkey, avg(l_orderkey)
from lineitem
group by 1
@@ -129,12 +121,10 @@ BIGINT, DOUBLE
---- RUNTIME_PROFILE
# Verify that passthrough and spilling happened in the pre and merge agg.
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
-row_regex: .*NumRepartitions: .* \([1-9][0-9]*\)
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
====
---- QUERY
-set num_nodes=0;
-set max_block_mgr_memory=100m;
+set buffer_pool_limit=15m;
select count(l1.l_tax)
from
lineitem l1,
@@ -156,8 +146,7 @@ BIGINT
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
====
---- QUERY
-set num_nodes=0;
-set max_block_mgr_memory=40m;
+set buffer_pool_limit=2m;
select max(t1.total_count), max(t1.l_shipinstruct), max(t1.l_comment) from
(select l_shipinstruct, l_comment, count(*) over () total_count from lineitem) t1
---- RESULTS
@@ -165,13 +154,12 @@ select max(t1.total_count), max(t1.l_shipinstruct), max(t1.l_comment) from
---- TYPES
BIGINT, STRING, STRING
---- RUNTIME_PROFILE
-# Indirectly verify that the analytic spilled: if it spills a block, it must repin it.
-row_regex: .*PinTime: [1-9][0-9]*.*
+# Verify that the analytic spilled
+row_regex: .*PeakUnpinnedBytes: [1-9][0-9]*.*
====
---- QUERY
-# Run this query with very low memory. Since the tables are small, the PA/PHJ should be
-# using buffers much smaller than the io buffer.
-set max_block_mgr_memory=10m;
+# Run this query with very low memory, but enough not to spill.
+set buffer_pool_limit=20m;
select a.int_col, count(*)
from functional.alltypessmall a, functional.alltypessmall b, functional.alltypessmall c
where a.id = b.id and b.id = c.id group by a.int_col
@@ -192,12 +180,11 @@ INT, BIGINT
# This query is not meant to spill.
row_regex: .*SpilledPartitions: 0 .*
====
----- QUERY: TPCH-Q21
+---- QUERY: TPCH-Q22
# Adding TPCH-Q21 in the spilling test to check for IMPALA-1471 (spilling left anti
# and left outer joins were returning wrong results).
# Q21 - Suppliers Who Kept Orders Waiting Query
-set num_nodes=0;
-set max_block_mgr_memory=65m;
+set buffer_pool_limit=20m;
select
s_name,
count(*) as numwait
@@ -347,8 +334,7 @@ row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
====
---- QUERY
# Test aggregation spill with group_concat distinct
-set num_nodes=1;
-set max_block_mgr_memory=100m;
+set buffer_pool_limit=50m;
select l_orderkey, count(*), group_concat(distinct l_linestatus, '|')
from lineitem
group by 1
@@ -376,7 +362,6 @@ row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
# nodes. CastToChar will do "local" memory allocation. Without the fix of
# IMPALA-2612, the peak memory consumption will be higher.
set mem_limit=800m;
-set num_nodes=1;
set num_scanner_threads=1;
select count(distinct concat(cast(l_comment as char(120)), cast(l_comment as char(120)),
cast(l_comment as char(120)), cast(l_comment as char(120)),
@@ -394,8 +379,7 @@ row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
# Test sort with small char column materialized by exprs.
# Set low memory limit to force spilling.
# IMPALA-3332: comparator makes local allocations that cause runaway memory consumption.
-set num_nodes=0;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
set mem_limit=200m;
set disable_outermost_topn=1;
select cast(l_comment as char(50))
@@ -432,9 +416,8 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
====
---- QUERY
# Test sort with small input char column materialized before sort.
-set num_nodes=0;
set mem_limit=200m;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
set disable_outermost_topn=1;
select char_col
from (select cast(l_comment as char(50)) char_col
@@ -472,9 +455,8 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
---- QUERY
# Test sort with large input char column materialized before sort.
# Set low memory limit to force spilling.
-set num_nodes=0;
set mem_limit=200m;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
set disable_outermost_topn=1;
select char_col
from (select cast(l_comment as char(200)) char_col
@@ -512,8 +494,7 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
---- QUERY
# Test sort with varchar column materialized by exprs.
# Set low memory limit to force spilling.
-set num_nodes=0;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
# IMPALA-3332: comparator makes local allocations that cause runaway memory consumption.
set mem_limit=200m;
set disable_outermost_topn=1;
@@ -552,9 +533,8 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
---- QUERY
# Test sort with input varchar column materialized before sort.
# Set low memory limit to force spilling.
-set num_nodes=0;
set mem_limit=200m;
-set max_block_mgr_memory=4m;
+set buffer_pool_limit=4m;
set disable_outermost_topn=1;
select char_col
from (select cast(l_comment as varchar(50)) char_col
@@ -592,9 +572,7 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
---- QUERY
# IMPALA-1346/IMPALA-1546: fix sorter memory management so that it can complete
# successfully when in same pipeline as a spilling join.
-set num_nodes=0;
-set mem_limit=200m;
-set max_block_mgr_memory=50m;
+set buffer_pool_limit=50m;
set disable_outermost_topn=1;
select * from lineitem
inner join orders on l_orderkey = o_orderkey
@@ -632,7 +610,7 @@ row_regex: .*TotalMergesPerformed: .* \([1-9][0-9]*\)
# Tests for the case where a spilled partition has 0 probe rows and so we don't build the
# hash table in a partitioned hash join.
# INNER JOIN
-set max_block_mgr_memory=10m;
+set buffer_pool_limit=10m;
select straight_join count(*)
from
lineitem a, lineitem b
@@ -648,7 +626,7 @@ row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
====
---- QUERY
# spilled partition with 0 probe rows, NULL AWARE LEFT ANTI JOIN
-set max_block_mgr_memory=10m;
+set buffer_pool_limit=10m;
select straight_join count(*)
from
lineitem a
@@ -664,7 +642,7 @@ row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
====
---- QUERY
# spilled partition with 0 probe rows, RIGHT OUTER JOIN
-set max_block_mgr_memory=10m;
+set buffer_pool_limit=10m;
select straight_join count(*)
from
supplier right outer join lineitem on s_suppkey = l_suppkey
@@ -678,7 +656,7 @@ row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
====
---- QUERY
# spilled partition with 0 probe rows, RIGHT ANTI JOIN
-set max_block_mgr_memory=30m;
+set buffer_pool_limit=30m;
with x as (select * from supplier limit 10)
select straight_join count(*)
from
@@ -698,7 +676,7 @@ row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
# where l1.l_quantity = 31.0 and l1.l_tax = 0.03 and l1.l_orderkey <= 100000
# order by l_orderkey, l_partkey, l_suppkey, l_linenumber
# limit 5
-set max_block_mgr_memory=7m;
+set buffer_pool_limit=7m;
set num_nodes=1;
select straight_join l.*
from
@@ -726,3 +704,16 @@ bigint,bigint,bigint,int,decimal,decimal,decimal,decimal,string,string,string,st
1382,156162,6163,5,31.00,37762.96,0.07,0.03,'R','F','1993-10-26','1993-10-15','1993-11-09','TAKE BACK RETURN','FOB','hely regular dependencies. f'
1509,186349,3904,6,31.00,44495.54,0.04,0.03,'A','F','1993-07-14','1993-08-21','1993-08-06','COLLECT COD','SHIP','ic deposits cajole carefully. quickly bold '
====
+---- QUERY
+# Test aggregation with minimum required reservation to exercise IMPALA-2708.
+# Merge aggregation requires 17 buffers. The buffer size is 256k for this test.
+set buffer_pool_limit=4352k;
+select count(*)
+from (select distinct * from orders) t
+---- TYPES
+BIGINT
+---- RESULTS
+1500000
+---- RUNTIME_PROFILE
+row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
+====
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/targeted-stress/queries/agg_stress.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/targeted-stress/queries/agg_stress.test b/testdata/workloads/targeted-stress/queries/agg_stress.test
index b2d45a9..a6657ba 100644
--- a/testdata/workloads/targeted-stress/queries/agg_stress.test
+++ b/testdata/workloads/targeted-stress/queries/agg_stress.test
@@ -1,7 +1,7 @@
====
---- QUERY
# This memory limit causes a spill to happen for this query
-set max_block_mgr_memory=250m;
+set buffer_pool_limit=250m;
# This query forces many joins and aggregations with spilling
# and can expose race conditions in the spilling code if run in parallel
select
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/tpch/queries/insert_parquet.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/queries/insert_parquet.test b/testdata/workloads/tpch/queries/insert_parquet.test
index 4707b7b..862548e 100644
--- a/testdata/workloads/tpch/queries/insert_parquet.test
+++ b/testdata/workloads/tpch/queries/insert_parquet.test
@@ -67,6 +67,8 @@ insert overwrite table test_insert_huge_vals
---- QUERY
# Verify the values written to test_insert_huge_vals were as expected by counting
# the results of an inner join of that table with the same query used in the insert.
+# Increase spillable buffer size to fit the large values on right side of hash join.
+set min_spillable_buffer_size=1m;
select count(*) from
(select cast(l_orderkey as string) s from tpch.lineitem union
select group_concat(concat(s_name, s_address, s_phone)) from tpch.supplier union
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/comparison/discrepancy_searcher.py
----------------------------------------------------------------------
diff --git a/tests/comparison/discrepancy_searcher.py b/tests/comparison/discrepancy_searcher.py
index ccbdd66..e0e1725 100755
--- a/tests/comparison/discrepancy_searcher.py
+++ b/tests/comparison/discrepancy_searcher.py
@@ -315,7 +315,7 @@ class QueryExecutor(object):
SET DISABLE_STREAMING_PREAGGREGATIONS={disable_streaming_preaggregations};
SET DISABLE_UNSAFE_SPILLS={disable_unsafe_spills};
SET EXEC_SINGLE_NODE_ROWS_THRESHOLD={exec_single_node_rows_threshold};
- SET MAX_BLOCK_MGR_MEMORY={max_block_mgr_memory};
+ SET BUFFER_POOL_LIMIT={buffer_pool_limit};
SET MAX_IO_BUFFERS={max_io_buffers};
SET MAX_SCAN_RANGE_LENGTH={max_scan_range_length};
SET NUM_NODES={num_nodes};
@@ -333,7 +333,7 @@ class QueryExecutor(object):
disable_streaming_preaggregations=choice((0, 1)),
disable_unsafe_spills=choice((0, 1)),
exec_single_node_rows_threshold=randint(1, 100000000),
- max_block_mgr_memory=randint(1, 100000000),
+ buffer_pool_limit=randint(1, 100000000),
max_io_buffers=randint(1, 100000000),
max_scan_range_length=randint(1, 100000000),
num_nodes=randint(3, 3),
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/custom_cluster/test_scratch_disk.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_scratch_disk.py b/tests/custom_cluster/test_scratch_disk.py
index 7e02de5..579ca1e 100644
--- a/tests/custom_cluster/test_scratch_disk.py
+++ b/tests/custom_cluster/test_scratch_disk.py
@@ -40,7 +40,7 @@ class TestScratchDir(CustomClusterTestSuite):
# Block manager memory limit that is low enough to force Impala to spill to disk when
# executing spill_query and high enough that we can execute in_mem_query without
# spilling.
- max_block_mgr_memory = "64m"
+ buffer_pool_limit = "64m"
def count_nonempty_dirs(self, dirs):
count = 0
@@ -87,7 +87,7 @@ class TestScratchDir(CustomClusterTestSuite):
self.assert_impalad_log_contains("INFO", "Using scratch directory ",
expected_count=1)
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
impalad = self.cluster.get_any_impalad()
client = impalad.service.create_beeswax_client()
self.execute_query_expect_success(client, self.spill_query, exec_option)
@@ -100,7 +100,7 @@ class TestScratchDir(CustomClusterTestSuite):
self.assert_impalad_log_contains("WARNING",
"Running without spill to disk: no scratch directories provided\.")
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
impalad = self.cluster.get_any_impalad()
client = impalad.service.create_beeswax_client()
# Expect spill to disk to fail
@@ -121,7 +121,7 @@ class TestScratchDir(CustomClusterTestSuite):
self.assert_impalad_log_contains("WARNING", "Could not remove and recreate directory "
+ ".*: cannot use it for scratch\. Error was: .*", expected_count=5)
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
impalad = self.cluster.get_any_impalad()
client = impalad.service.create_beeswax_client()
# Expect spill to disk to fail
@@ -144,7 +144,7 @@ class TestScratchDir(CustomClusterTestSuite):
+ "Encountered exception while verifying existence of directory path",
expected_count=5)
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
impalad = self.cluster.get_any_impalad()
client = impalad.service.create_beeswax_client()
# Expect spill to disk to fail
@@ -164,7 +164,7 @@ class TestScratchDir(CustomClusterTestSuite):
self.assert_impalad_log_contains("INFO", "Using scratch directory ",
expected_count=len(dirs))
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
# Trigger errors when writing the first two directories.
shutil.rmtree(dirs[0]) # Remove the first directory.
# Make all subdirectories in the second directory non-writable.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/custom_cluster/test_spilling.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_spilling.py b/tests/custom_cluster/test_spilling.py
deleted file mode 100644
index 774e83f..0000000
--- a/tests/custom_cluster/test_spilling.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pytest
-from copy import deepcopy
-
-from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
-from tests.common.test_dimensions import (
- create_single_exec_option_dimension,
- create_parquet_dimension)
-
-class TestSpilling(CustomClusterTestSuite):
- @classmethod
- def get_workload(self):
- return 'functional-query'
-
- @classmethod
- def add_test_dimensions(cls):
- super(TestSpilling, cls).add_test_dimensions()
- cls.ImpalaTestMatrix.clear_constraints()
- cls.ImpalaTestMatrix.add_dimension(create_parquet_dimension('tpch'))
- cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
-
- # Reduce the IO read size. This reduces the memory required to trigger spilling.
- @pytest.mark.execute_serially
- @CustomClusterTestSuite.with_args(
- impalad_args="--read_size=200000",
- catalogd_args="--load_catalog_in_background=false")
- def test_spilling(self, vector):
- new_vector = deepcopy(vector)
- # remove this. the test cases set this explicitly.
- del new_vector.get_value('exec_option')['num_nodes']
- self.run_test_case('QueryTest/spilling', new_vector)
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_cancellation.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_cancellation.py b/tests/query_test/test_cancellation.py
index bb1fc0d..91be5d4 100644
--- a/tests/query_test/test_cancellation.py
+++ b/tests/query_test/test_cancellation.py
@@ -52,7 +52,7 @@ DEBUG_ACTIONS = [None, 'WAIT']
# Extra dimensions to test order by without limit
SORT_QUERY = 'select * from lineitem order by l_orderkey'
SORT_CANCEL_DELAY = range(6, 10)
-SORT_BLOCK_MGR_LIMIT = ['0', '300m'] # Test spilling and non-spilling sorts.
+SORT_BUFFER_POOL_LIMIT = ['0', '300m'] # Test spilling and non-spilling sorts.
class TestCancellation(ImpalaTestSuite):
@classmethod
@@ -71,7 +71,7 @@ class TestCancellation(ImpalaTestSuite):
cls.ImpalaTestMatrix.add_dimension(
ImpalaTestDimension('action', *DEBUG_ACTIONS))
cls.ImpalaTestMatrix.add_dimension(
- ImpalaTestDimension('max_block_mgr_memory', 0))
+ ImpalaTestDimension('buffer_pool_limit', 0))
cls.ImpalaTestMatrix.add_constraint(
lambda v: v.get_value('query_type') != 'CTAS' or (\
@@ -121,8 +121,8 @@ class TestCancellation(ImpalaTestSuite):
debug_action = '0:GETNEXT:' + action if action != None else ''
vector.get_value('exec_option')['debug_action'] = debug_action
- vector.get_value('exec_option')['max_block_mgr_memory'] =\
- vector.get_value('max_block_mgr_memory')
+ vector.get_value('exec_option')['buffer_pool_limit'] =\
+ vector.get_value('buffer_pool_limit')
# Execute the query multiple times, cancelling it each time.
for i in xrange(NUM_CANCELATION_ITERATIONS):
@@ -216,7 +216,7 @@ class TestCancellationFullSort(TestCancellation):
cls.ImpalaTestMatrix.add_dimension(
ImpalaTestDimension('cancel_delay', *SORT_CANCEL_DELAY))
cls.ImpalaTestMatrix.add_dimension(
- ImpalaTestDimension('max_block_mgr_memory', *SORT_BLOCK_MGR_LIMIT))
+ ImpalaTestDimension('buffer_pool_limit', *SORT_BUFFER_POOL_LIMIT))
cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('action', None))
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('table_format').file_format =='parquet' and\
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_mem_usage_scaling.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_mem_usage_scaling.py b/tests/query_test/test_mem_usage_scaling.py
index e6eccf9..bbdc771 100644
--- a/tests/query_test/test_mem_usage_scaling.py
+++ b/tests/query_test/test_mem_usage_scaling.py
@@ -82,7 +82,8 @@ class TestExprMemUsage(ImpalaTestSuite):
class TestLowMemoryLimits(ImpalaTestSuite):
'''Super class for the memory limit tests with the TPC-H and TPC-DS queries'''
- EXPECTED_ERROR_MSG = "Memory limit exceeded"
+ EXPECTED_ERROR_MSGS = ["Memory limit exceeded",
+ "Failed to get minimum memory reservation"]
def low_memory_limit_test(self, vector, tpch_query, limit, xfail_mem_limit=None):
mem = vector.get_value('mem_limit')
@@ -93,28 +94,36 @@ class TestLowMemoryLimits(ImpalaTestSuite):
# If memory limit larger than the minimum threshold, then it is not expected to fail.
expects_error = mem < limit
new_vector = copy(vector)
- new_vector.get_value('exec_option')['mem_limit'] = str(mem) + "m"
+ exec_options = new_vector.get_value('exec_option')
+ exec_options['mem_limit'] = str(mem) + "m"
+
+ # Reduce the page size to better exercise page boundary logic.
+ exec_options['default_spillable_buffer_size'] = "256k"
try:
self.run_test_case(tpch_query, new_vector)
except ImpalaBeeswaxException as e:
if not expects_error and not xfail_mem_limit: raise
- assert TestLowMemoryLimits.EXPECTED_ERROR_MSG in str(e)
+ found_expected_error = False
+ for error_msg in TestLowMemoryLimits.EXPECTED_ERROR_MSGS:
+ if error_msg in str(e): found_expected_error = True
+ assert found_expected_error, str(e)
if not expects_error and xfail_mem_limit:
pytest.xfail(xfail_mem_limit)
class TestTpchMemLimitError(TestLowMemoryLimits):
- # TODO: After we stabilize the mem usage test, we should move this test to exhaustive.
+ # TODO: consider moving this test to exhaustive.
# The mem limits that will be used.
- MEM_IN_MB = [20, 140, 180, 275, 450, 700, 980]
+ MEM_IN_MB = [20, 140, 180, 220, 275, 450, 700]
# Different values of mem limits and minimum mem limit (in MBs) each query is expected
- # to run without problem. Those values were determined by manual testing.
- MIN_MEM_FOR_TPCH = { 'Q1' : 140, 'Q2' : 120, 'Q3' : 240, 'Q4' : 125, 'Q5' : 235,\
- 'Q6' : 25, 'Q7' : 265, 'Q8' : 250, 'Q9' : 400, 'Q10' : 240,\
- 'Q11' : 110, 'Q12' : 125, 'Q13' : 110, 'Q14' : 229, 'Q15' : 125,\
- 'Q16' : 125, 'Q17' : 130, 'Q18' : 475, 'Q19' : 240, 'Q20' : 250,\
- 'Q21' : 620, 'Q22' : 125}
+ # to run without problem. These were determined using the query_runtime_info.json file
+ # produced by the stress test (i.e. concurrent_select.py).
+ MIN_MEM_FOR_TPCH = { 'Q1' : 125, 'Q2' : 125, 'Q3' : 112, 'Q4' : 137, 'Q5' : 137,\
+ 'Q6' : 25, 'Q7' : 200, 'Q8' : 125, 'Q9' : 200, 'Q10' : 162,\
+ 'Q11' : 112, 'Q12' : 150, 'Q13' : 125, 'Q14' : 125, 'Q15' : 125,\
+ 'Q16' : 137, 'Q17' : 137, 'Q18' : 196, 'Q19' : 112, 'Q20' : 162,\
+ 'Q21' : 187, 'Q22' : 125}
@classmethod
def get_workload(self):
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_nested_types.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_nested_types.py b/tests/query_test/test_nested_types.py
index 96a170b..bb74faa 100644
--- a/tests/query_test/test_nested_types.py
+++ b/tests/query_test/test_nested_types.py
@@ -27,7 +27,6 @@ from tests.common.skip import (
SkipIfS3,
SkipIfADLS,
SkipIfLocal)
-
from tests.util.filesystem_utils import WAREHOUSE, get_fs_path
class TestNestedTypes(ImpalaTestSuite):
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_scratch_limit.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scratch_limit.py b/tests/query_test/test_scratch_limit.py
index 6a13318..6e19bb5 100644
--- a/tests/query_test/test_scratch_limit.py
+++ b/tests/query_test/test_scratch_limit.py
@@ -28,7 +28,7 @@ class TestScratchLimit(ImpalaTestSuite):
# Block manager memory limit that is low enough to
# force Impala to spill to disk when executing 'spill_query'
- max_block_mgr_memory = "64m"
+ buffer_pool_limit = "64m"
@classmethod
def get_workload(self):
@@ -48,7 +48,7 @@ class TestScratchLimit(ImpalaTestSuite):
its required scratch space which in this case is 128m.
"""
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
exec_option['scratch_limit'] = '500m'
self.execute_query_expect_success(self.client, self.spill_query, exec_option)
@@ -58,7 +58,7 @@ class TestScratchLimit(ImpalaTestSuite):
its required scratch space which in this case is 128m.
"""
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
exec_option['scratch_limit'] = '24m'
expected_error = 'Scratch space limit of %s bytes exceeded'
scratch_limit_in_bytes = 24 * 1024 * 1024
@@ -74,7 +74,7 @@ class TestScratchLimit(ImpalaTestSuite):
zero which means no scratch space can be allocated.
"""
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
exec_option['scratch_limit'] = '0'
self.execute_query_expect_failure(self.spill_query, exec_option)
@@ -83,7 +83,7 @@ class TestScratchLimit(ImpalaTestSuite):
Query runs to completion with a scratch Limit of -1 means default/no limit.
"""
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
exec_option['scratch_limit'] = '-1'
self.execute_query_expect_success(self.client, self.spill_query, exec_option)
@@ -92,7 +92,7 @@ class TestScratchLimit(ImpalaTestSuite):
Query runs to completion with the default setting of no scratch limit.
"""
exec_option = vector.get_value('exec_option')
- exec_option['max_block_mgr_memory'] = self.max_block_mgr_memory
+ exec_option['buffer_pool_limit'] = self.buffer_pool_limit
self.execute_query_expect_success(self.client, self.spill_query, exec_option)
def test_with_zero_scratch_limit_no_memory_limit(self, vector):
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_sort.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_sort.py b/tests/query_test/test_sort.py
index b048c9f..df95ddd 100644
--- a/tests/query_test/test_sort.py
+++ b/tests/query_test/test_sort.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+from copy import copy
+
from tests.common.impala_test_suite import ImpalaTestSuite
def transpose_results(result, map_fn=lambda x: x):
@@ -46,7 +48,7 @@ class TestQueryFullSort(ImpalaTestSuite):
takes about a minute"""
query = """select l_comment, l_partkey, l_orderkey, l_suppkey, l_commitdate
from lineitem order by l_comment limit 100000"""
- exec_option = vector.get_value('exec_option')
+ exec_option = copy(vector.get_value('exec_option'))
exec_option['disable_outermost_topn'] = 1
table_format = vector.get_value('table_format')
@@ -63,16 +65,18 @@ class TestQueryFullSort(ImpalaTestSuite):
query = """select o_orderdate, o_custkey, o_comment
from orders
order by o_orderdate"""
- exec_option = vector.get_value('exec_option')
+ exec_option = copy(vector.get_value('exec_option'))
table_format = vector.get_value('table_format')
- max_block_mgr_memory_values = ['-1', '48M'] # Unlimited and minimum memory.
+ # The below memory value assume 8M pages.
+ exec_option['default_spillable_buffer_size'] = '8M'
+ buffer_pool_limit_values = ['-1', '48M'] # Unlimited and minimum memory.
if self.exploration_strategy() == 'exhaustive' and \
table_format.file_format == 'parquet':
# Test some intermediate values for parquet on exhaustive.
- max_block_mgr_memory_values += ['64M', '128M', '256M']
- for max_block_mgr_memory in max_block_mgr_memory_values:
- exec_option['max_block_mgr_memory'] = max_block_mgr_memory
+ buffer_pool_limit_values += ['64M', '128M', '256M']
+ for buffer_pool_limit in buffer_pool_limit_values:
+ exec_option['buffer_pool_limit'] = buffer_pool_limit
result = transpose_results(self.execute_query(
query, exec_option, table_format=table_format).data)
assert(result[0] == sorted(result[0]))
@@ -83,7 +87,7 @@ class TestQueryFullSort(ImpalaTestSuite):
query = """select o1.o_orderdate, o2.o_custkey, o1.o_comment from orders o1 join
orders o2 on (o1.o_orderkey = o2.o_orderkey) order by o1.o_orderdate limit 100000"""
- exec_option = vector.get_value('exec_option')
+ exec_option = copy(vector.get_value('exec_option'))
exec_option['disable_outermost_topn'] = 1
exec_option['mem_limit'] = "1200m"
table_format = vector.get_value('table_format')
@@ -97,7 +101,7 @@ class TestQueryFullSort(ImpalaTestSuite):
select * from orders union all select * from orders) as i
order by o_orderdate limit 100000"""
- exec_option = vector.get_value('exec_option')
+ exec_option = copy(vector.get_value('exec_option'))
exec_option['disable_outermost_topn'] = 1
exec_option['mem_limit'] = "3000m"
table_format = vector.get_value('table_format')
@@ -120,7 +124,7 @@ class TestQueryFullSort(ImpalaTestSuite):
select * from lineitem limit 300000) t
order by l_orderkey"""
- exec_option = vector.get_value('exec_option')
+ exec_option = copy(vector.get_value('exec_option'))
exec_option['disable_outermost_topn'] = 1
# Run with a single scanner thread so that the input doesn't get reordered.
exec_option['num_nodes'] = "1"
@@ -145,9 +149,9 @@ class TestQueryFullSort(ImpalaTestSuite):
limit 100000
"""
- exec_option = vector.get_value('exec_option')
+ exec_option = copy(vector.get_value('exec_option'))
exec_option['disable_outermost_topn'] = 1
- exec_option['max_block_mgr_memory'] = "256m"
+ exec_option['buffer_pool_limit'] = "256m"
exec_option['num_nodes'] = "1"
table_format = vector.get_value('table_format')
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/tests/query_test/test_spilling.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_spilling.py b/tests/query_test/test_spilling.py
new file mode 100644
index 0000000..e2d5141
--- /dev/null
+++ b/tests/query_test/test_spilling.py
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+from tests.common.impala_test_suite import ImpalaTestSuite
+from tests.common.test_dimensions import (create_exec_option_dimension_from_dict,
+ create_parquet_dimension)
+
+class TestSpilling(ImpalaTestSuite):
+ @classmethod
+ def get_workload(self):
+ return 'functional-query'
+
+ @classmethod
+ def add_test_dimensions(cls):
+ super(TestSpilling, cls).add_test_dimensions()
+ cls.ImpalaTestMatrix.clear_constraints()
+ cls.ImpalaTestMatrix.add_dimension(create_parquet_dimension('tpch'))
+ # Tests are calibrated so that they can execute and spill with this page size.
+ cls.ImpalaTestMatrix.add_dimension(
+ create_exec_option_dimension_from_dict({'default_spillable_buffer_size' : ['256k']}))
+
+ def test_spilling(self, vector):
+ self.run_test_case('QueryTest/spilling', vector)
[05/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-tuple-stream.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream.h b/be/src/runtime/buffered-tuple-stream.h
deleted file mode 100644
index 41d63bf..0000000
--- a/be/src/runtime/buffered-tuple-stream.h
+++ /dev/null
@@ -1,561 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_H
-#define IMPALA_RUNTIME_BUFFERED_TUPLE_STREAM_H
-
-#include <vector>
-#include <set>
-
-#include "common/status.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/row-batch.h"
-
-namespace impala {
-
-class BufferedBlockMgr;
-class RuntimeProfile;
-class RuntimeState;
-class RowDescriptor;
-class SlotDescriptor;
-class TupleRow;
-
-/// Class that provides an abstraction for a stream of tuple rows. Rows can be
-/// added to the stream and returned. Rows are returned in the order they are added.
-///
-/// The underlying memory management is done by the BufferedBlockMgr.
-///
-/// The tuple stream consists of a number of small (less than IO-sized blocks) before
-/// an arbitrary number of IO-sized blocks. The smaller blocks do not spill and are
-/// there to lower the minimum buffering requirements. For example, an operator that
-/// needs to maintain 64 streams (1 buffer per partition) would need, by default,
-/// 64 * 8MB = 512MB of buffering. A query with 5 of these operators would require
-/// 2.56GB just to run, regardless of how much of that is used. This is
-/// problematic for small queries. Instead we will start with a fixed number of small
-/// buffers (currently 2 small buffers: one 64KB and one 512KB) and only start using IO
-/// sized buffers when those fill up. The small buffers never spill.
-/// The stream will *not* automatically switch from using small buffers to IO-sized
-/// buffers when all the small buffers for this stream have been used.
-///
-/// The BufferedTupleStream is *not* thread safe from the caller's point of view. It is
-/// expected that all the APIs are called from a single thread. Internally, the
-/// object is thread safe wrt to the underlying block mgr.
-///
-/// Buffer management:
-/// The stream is either pinned or unpinned, set via PinStream() and UnpinStream().
-/// Blocks are optionally deleted as they are read, set with the delete_on_read argument
-/// to PrepareForRead().
-///
-/// Block layout:
-/// If the stream's tuples are nullable (i.e. has_nullable_tuple_ is true), there is a
-/// bitstring at the start of each block with null indicators for all tuples in each row
-/// in the block. The length of the bitstring is a function of the block size. Row data
-/// is stored after the null indicators if present, or at the start of the block
-/// otherwise. Rows are stored back to back in the stream, with no interleaving of data
-/// from different rows. There is no padding or alignment between rows.
-///
-/// Null tuples:
-/// The order of bits in the null indicators bitstring corresponds to the order of
-/// tuples in the block. The NULL tuples are not stored in the row iself, only as set
-/// bits in the null indicators bitstring.
-///
-/// Tuple row layout:
-/// The fixed length parts of the row's tuples are stored first, followed by var len data
-/// for inlined_string_slots_ and inlined_coll_slots_. Other "external" var len slots can
-/// point to var len data outside the stream. When reading the stream, the length of each
-/// row's var len data in the stream must be computed to find the next row's start.
-///
-/// The tuple stream supports reading from the stream into RowBatches without copying
-/// out any data: the RowBatches' Tuple pointers will point directly into the stream's
-/// blocks. The fixed length parts follow Impala's internal tuple format, so for the
-/// tuple to be valid, we only need to update pointers to point to the var len data
-/// in the stream. These pointers need to be updated by the stream because a spilled
-/// block may be relocated to a different location in memory. The pointers are updated
-/// lazily upon reading the stream via GetNext() or GetRows().
-///
-/// Example layout for a row with two tuples ((1, "hello"), (2, "world")) with all var
-/// len data stored in the stream:
-/// <---- tuple 1 -----> <------ tuple 2 ------> <- var len -> <- next row ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// | IntVal | StringVal | BigIntVal | StringVal | | ...
-/// +--------+-----------+-----------+-----------++------------+
-/// | val: 1 | len: 5 | val: 2 | len: 5 | helloworld | ...
-/// | | ptr: 0x.. | | ptr: 0x.. | | ...
-/// +--------+-----------+-----------+-----------+-------------+
-/// <--4b--> <---12b---> <----8b---> <---12b---> <----10b---->
-//
-/// Example layout for a row with a single tuple (("hello", "world")) with the second
-/// string slot stored externally to the stream:
-/// <------ tuple 1 ------> <- var len -> <- next row ...
-/// +-----------+-----------+-------------+
-/// | StringVal | StringVal | | ...
-/// +-----------+-----------+-------------+
-/// | len: 5 | len: 5 | hello | ...
-/// | ptr: 0x.. | ptr: 0x.. | | ...
-/// +-----------+-----------+-------------+
-/// <---12b---> <---12b---> <-----5b---->
-///
-/// The behavior of reads and writes is as follows:
-/// Read:
-/// 1. Delete on read (delete_on_read_): Blocks are deleted as we go through the stream.
-/// The data returned by the tuple stream is valid until the next read call so the
-/// caller does not need to copy if it is streaming.
-/// 2. Unpinned: Blocks remain in blocks_ and are unpinned after reading.
-/// 3. Pinned: Blocks remain in blocks_ and are left pinned after reading. If the next
-/// block in the stream cannot be pinned, the read call will fail and the caller needs
-/// to free memory from the underlying block mgr.
-/// Write:
-/// 1. Unpinned: Unpin blocks as they fill up. This means only a single (i.e. the
-/// current) block needs to be in memory regardless of the input size (if read_write is
-/// true, then two blocks need to be in memory).
-/// 2. Pinned: Blocks are left pinned. If we run out of blocks, the write will fail and
-/// the caller needs to free memory from the underlying block mgr.
-///
-/// Memory lifetime of rows read from stream:
-/// If the stream is pinned, it is valid to access any tuples returned via
-/// GetNext() or GetRows() until the stream is unpinned. If the stream is unpinned, and
-/// the batch returned from GetNext() has the needs_deep_copy flag set, any tuple memory
-/// returned so far from the stream may be freed on the next call to GetNext().
-///
-/// Manual construction of rows with AllocateRow():
-/// The BufferedTupleStream supports allocation of uninitialized rows with AllocateRow().
-/// The caller of AllocateRow() is responsible for writing the row with exactly the
-/// layout described above.
-///
-/// If a caller constructs a tuple in this way, the caller can set the pointers and they
-/// will not be modified until the stream is read via GetNext() or GetRows().
-///
-/// TODO: we need to be able to do read ahead in the BufferedBlockMgr. It currently
-/// only has PinAllBlocks() which is blocking. We need a non-blocking version of this or
-/// some way to indicate a block will need to be pinned soon.
-/// TODO: see if this can be merged with Sorter::Run. The key difference is that this
-/// does not need to return rows in the order they were added, which allows it to be
-/// simpler.
-/// TODO: we could compact the small buffers when we need to spill but they use very
-/// little memory so ths might not be very useful.
-/// TODO: improvements:
-/// - It would be good to allocate the null indicators at the end of each block and grow
-/// this array as new rows are inserted in the block. If we do so, then there will be
-/// fewer gaps in case of many rows with NULL tuples.
-/// - We will want to multithread this. Add a AddBlock() call so the synchronization
-/// happens at the block level. This is a natural extension.
-/// - Instead of allocating all blocks from the block_mgr, allocate some blocks that
-/// are much smaller (e.g. 16K and doubling up to the block size). This way, very
-/// small streams (a common case) will use very little memory. This small blocks
-/// are always in memory since spilling them frees up negligible memory.
-/// - Return row batches in GetNext() instead of filling one in
-class BufferedTupleStream {
- public:
- /// Ordinal index into the stream to retrieve a row in O(1) time. This index can
- /// only be used if the stream is pinned.
- /// To read a row from a stream we need three pieces of information that we squeeze in
- /// 64 bits:
- /// - The index of the block. The block id is stored in 16 bits. We can have up to
- /// 64K blocks per tuple stream. With 8MB blocks that is 512GB per stream.
- /// - The offset of the start of the row (data) within the block. Since blocks are 8MB
- /// we use 24 bits for the offsets. (In theory we could use 23 bits.)
- /// - The idx of the row in the block. We need this for retrieving the null indicators.
- /// We use 24 bits for this index as well.
- struct RowIdx {
- static const uint64_t BLOCK_MASK = 0xFFFF;
- static const uint64_t BLOCK_SHIFT = 0;
- static const uint64_t OFFSET_MASK = 0xFFFFFF0000;
- static const uint64_t OFFSET_SHIFT = 16;
- static const uint64_t IDX_MASK = 0xFFFFFF0000000000;
- static const uint64_t IDX_SHIFT = 40;
-
- uint64_t block() const {
- return (data & BLOCK_MASK);
- }
-
- uint64_t offset() const {
- return (data & OFFSET_MASK) >> OFFSET_SHIFT;
- }
-
- uint64_t idx() const {
- return (data & IDX_MASK) >> IDX_SHIFT;
- }
-
- uint64_t set(uint64_t block, uint64_t offset, uint64_t idx) {
- DCHECK_LE(block, BLOCK_MASK)
- << "Cannot have more than 2^16 = 64K blocks in a tuple stream.";
- DCHECK_LE(offset, OFFSET_MASK >> OFFSET_SHIFT)
- << "Cannot have blocks larger than 2^24 = 16MB";
- DCHECK_LE(idx, IDX_MASK >> IDX_SHIFT)
- << "Cannot have more than 2^24 = 16M rows in a block.";
- data = block | (offset << OFFSET_SHIFT) | (idx << IDX_SHIFT);
- return data;
- }
-
- std::string DebugString() const;
-
- uint64_t data;
- };
-
- /// row_desc: description of rows stored in the stream. This is the desc for rows
- /// that are added and the rows being returned.
- /// block_mgr: Underlying block mgr that owns the data blocks.
- /// use_initial_small_buffers: If true, the initial N buffers allocated for the
- /// tuple stream use smaller than IO-sized buffers.
- /// read_write: Stream allows interchanging read and write operations. Requires at
- /// least two blocks may be pinned.
- /// ext_varlen_slots: set of varlen slots with data stored externally to the stream
- BufferedTupleStream(RuntimeState* state, const RowDescriptor* row_desc,
- BufferedBlockMgr* block_mgr, BufferedBlockMgr::Client* client,
- bool use_initial_small_buffers, bool read_write,
- const std::set<SlotId>& ext_varlen_slots = std::set<SlotId>());
-
- ~BufferedTupleStream();
-
- /// Initializes the tuple stream object on behalf of node 'node_id'. Must be called
- /// once before any of the other APIs.
- /// If 'pinned' is true, the tuple stream starts of pinned, otherwise it is unpinned.
- /// If 'profile' is non-NULL, counters are created.
- /// 'node_id' is only used for error reporting.
- Status Init(int node_id, RuntimeProfile* profile, bool pinned);
-
- /// Prepares the stream for writing by attempting to allocate a write block.
- /// Called after Init() and before the first AddRow() call.
- /// 'got_buffer': set to true if the first write block was successfully pinned, or
- /// false if the block could not be pinned and no error was encountered. Undefined
- /// if an error status is returned.
- Status PrepareForWrite(bool* got_buffer);
-
- /// Must be called for streams using small buffers to switch to IO-sized buffers.
- /// If it fails to get a buffer (i.e. the switch fails) it resets the use_small_buffers_
- /// back to false.
- /// TODO: IMPALA-3200: remove this when small buffers are removed.
- Status SwitchToIoBuffers(bool* got_buffer);
-
- /// Adds a single row to the stream. Returns true if the append succeeded, returns false
- /// and sets 'status' to OK if appending failed but can be retried or returns false and
- /// sets 'status' to an error if an error occurred.
- /// BufferedTupleStream will do a deep copy of the memory in the row. After AddRow()
- /// returns an error, it should not be called again. If appending failed without an
- /// error and the stream is using small buffers, it is valid to call
- /// SwitchToIoBuffers() then AddRow() again.
- bool AddRow(TupleRow* row, Status* status) noexcept;
-
- /// Allocates space to store a row of with fixed length 'fixed_size' and variable
- /// length data 'varlen_size'. If successful, returns the pointer where fixed length
- /// data should be stored and assigns 'varlen_data' to where var-len data should
- /// be stored. Returns NULL if there is not enough memory or an error occurred.
- /// Sets *status if an error occurred. The returned memory is guaranteed to all
- /// be allocated in the same block. AllocateRow does not currently support nullable
- /// tuples.
- uint8_t* AllocateRow(int fixed_size, int varlen_size, uint8_t** varlen_data,
- Status* status);
-
- /// Populates 'row' with the row at 'idx'. The stream must be pinned. The row must have
- /// been allocated with the stream's row desc.
- void GetTupleRow(const RowIdx& idx, TupleRow* row) const;
-
- /// Prepares the stream for reading. If read_write_, this can be called at any time to
- /// begin reading. Otherwise this must be called after the last AddRow() and
- /// before GetNext().
- /// delete_on_read: Blocks are deleted after they are read.
- /// got_buffer: set to true if the first read block was successfully pinned, or
- /// false if the block could not be pinned and no error was encountered.
- Status PrepareForRead(bool delete_on_read, bool* got_buffer);
-
- /// Pins all blocks in this stream and switches to pinned mode.
- /// If there is not enough memory, *pinned is set to false and the stream is unmodified.
- /// If already_reserved is true, the caller has already made a reservation on
- /// block_mgr_client_ to pin the stream.
- Status PinStream(bool already_reserved, bool* pinned);
-
- /// Modes for UnpinStream().
- enum UnpinMode {
- /// All blocks in the stream are unpinned and the read/write positions in the stream
- /// are reset. No more rows can be written to the stream after this. The stream can
- /// be re-read from the beginning by calling PrepareForRead().
- UNPIN_ALL,
- /// All blocks are unpinned aside from the current read and write blocks (if any),
- /// which is left in the same state. The unpinned stream can continue being read
- /// or written from the current read or write positions.
- UNPIN_ALL_EXCEPT_CURRENT,
- };
-
- /// Unpins stream with the given 'mode' as described above.
- Status UnpinStream(UnpinMode mode);
-
- /// Get the next batch of output rows. Memory is still owned by the BufferedTupleStream
- /// and must be copied out by the caller.
- Status GetNext(RowBatch* batch, bool* eos);
-
- /// Same as above, but also populate 'indices' with the index of each returned row.
- Status GetNext(RowBatch* batch, bool* eos, std::vector<RowIdx>* indices);
-
- /// Returns all the rows in the stream in batch. This pins the entire stream in the
- /// process.
- /// *got_rows is false if the stream could not be pinned.
- Status GetRows(boost::scoped_ptr<RowBatch>* batch, bool* got_rows);
-
- /// Must be called once at the end to cleanup all resources. If 'batch' is non-NULL,
- /// attaches any pinned blocks to the batch and deletes unpinned blocks. Otherwise
- /// deletes all blocks. Does nothing if the stream was already closed. The 'flush'
- /// mode is forwarded to RowBatch::AddBlock() when attaching blocks.
- void Close(RowBatch* batch, RowBatch::FlushMode flush);
-
- /// Number of rows in the stream.
- int64_t num_rows() const { return num_rows_; }
-
- /// Number of rows returned via GetNext().
- int64_t rows_returned() const { return rows_returned_; }
-
- /// Returns the byte size necessary to store the entire stream in memory.
- int64_t byte_size() const { return total_byte_size_; }
-
- /// Returns the byte size of the stream that is currently pinned in memory.
- /// If ignore_current is true, the write_block_ memory is not included.
- int64_t bytes_in_mem(bool ignore_current) const;
-
- bool is_closed() const { return closed_; }
- bool is_pinned() const { return pinned_; }
- int blocks_pinned() const { return num_pinned_; }
- int blocks_unpinned() const { return blocks_.size() - num_pinned_ - num_small_blocks_; }
- bool has_read_block() const { return read_block_ != blocks_.end(); }
- bool has_write_block() const { return write_block_ != NULL; }
- bool using_small_buffers() const { return use_small_buffers_; }
-
- /// Returns true if the row consumes any memory. If false, the stream only needs to
- /// store the count of rows.
- bool RowConsumesMemory() const {
- return fixed_tuple_row_size_ > 0 || has_nullable_tuple_;
- }
-
- std::string DebugString() const;
-
- private:
- friend class ArrayTupleStreamTest_TestArrayDeepCopy_Test;
- friend class ArrayTupleStreamTest_TestComputeRowSize_Test;
- friend class MultiNullableTupleStreamTest_TestComputeRowSize_Test;
- friend class SimpleTupleStreamTest_TestGetRowsOverflow_Test;
-
- /// Runtime state instance used to check for cancellation. Not owned.
- RuntimeState* const state_;
-
- /// Description of rows stored in the stream.
- const RowDescriptor* desc_;
-
- /// Sum of the fixed length portion of all the tuples in desc_.
- int fixed_tuple_row_size_;
-
- /// The size of the fixed length portion for each tuple in the row.
- std::vector<int> fixed_tuple_sizes_;
-
- /// Max size (in bytes) of null indicators bitmap in the current read and write
- /// blocks. If 0, it means that there is no need to store null indicators for this
- /// RowDesc. We calculate this value based on the block's size and the
- /// fixed_tuple_row_size_. When not 0, this value is also an upper bound for the number
- /// of (rows * tuples_per_row) in this block.
- int read_block_null_indicators_size_;
- int write_block_null_indicators_size_;
-
- /// Size (in bytes) of the null indicators bitmap reserved in a block of maximum
- /// size (i.e. IO block size). 0 if no tuple is nullable.
- int max_null_indicators_size_;
-
- /// Vectors of all the strings slots that have their varlen data stored in stream
- /// grouped by tuple_idx.
- std::vector<std::pair<int, std::vector<SlotDescriptor*>>> inlined_string_slots_;
-
- /// Vectors of all the collection slots that have their varlen data stored in the
- /// stream, grouped by tuple_idx.
- std::vector<std::pair<int, std::vector<SlotDescriptor*>>> inlined_coll_slots_;
-
- /// Block manager and client used to allocate, pin and release blocks. Not owned.
- BufferedBlockMgr* block_mgr_;
- BufferedBlockMgr::Client* block_mgr_client_;
-
- /// List of blocks in the stream.
- std::list<BufferedBlockMgr::Block*> blocks_;
-
- /// Total size of blocks_, including small blocks.
- int64_t total_byte_size_;
-
- /// Iterator pointing to the current block for read. Equal to list.end() until
- /// PrepareForRead() is called.
- std::list<BufferedBlockMgr::Block*>::iterator read_block_;
-
- /// For each block in the stream, the buffer of the start of the block. This is only
- /// valid when the stream is pinned, giving random access to data in the stream.
- /// This is not maintained for delete_on_read_.
- std::vector<uint8_t*> block_start_idx_;
-
- /// Current idx of the tuple read from the read_block_ buffer.
- uint32_t read_tuple_idx_;
-
- /// Current offset in read_block_ of the end of the last data read.
- uint8_t* read_ptr_;
-
- /// Pointer to one byte past the end of read_block_.
- uint8_t* read_end_ptr_;
-
- /// Current idx of the tuple written at the write_block_ buffer.
- uint32_t write_tuple_idx_;
-
- /// Pointer into write_block_ of the end of the last data written.
- uint8_t* write_ptr_;
-
- /// Pointer to one byte past the end of write_block_.
- uint8_t* write_end_ptr_;
-
- /// Number of rows returned to the caller from GetNext().
- int64_t rows_returned_;
-
- /// The block index of the current read block in blocks_.
- int read_block_idx_;
-
- /// The current block for writing. NULL if there is no available block to write to.
- /// The entire write_block_ buffer is marked as allocated, so any data written into
- /// the buffer will be spilled without having to allocate additional space.
- BufferedBlockMgr::Block* write_block_;
-
- /// Number of pinned blocks in blocks_, stored to avoid iterating over the list
- /// to compute bytes_in_mem and bytes_unpinned.
- /// This does not include small blocks.
- int num_pinned_;
-
- /// The total number of small blocks in blocks_;
- int num_small_blocks_;
-
- /// Number of rows stored in the stream.
- int64_t num_rows_;
-
- /// Counters added by this object to the parent runtime profile.
- RuntimeProfile::Counter* pin_timer_;
- RuntimeProfile::Counter* unpin_timer_;
- RuntimeProfile::Counter* get_new_block_timer_;
-
- /// If true, read and write operations may be interleaved. Otherwise all calls
- /// to AddRow() must occur before calling PrepareForRead() and subsequent calls to
- /// GetNext().
- const bool read_write_;
-
- /// Whether any tuple in the rows is nullable.
- const bool has_nullable_tuple_;
-
- /// If true, this stream is still using small buffers.
- bool use_small_buffers_;
-
- /// If true, blocks are deleted after they are read.
- bool delete_on_read_;
-
- bool closed_; // Used for debugging.
-
- /// If true, this stream has been explicitly pinned by the caller. This changes the
- /// memory management of the stream. The blocks are not unpinned until the caller calls
- /// UnpinAllBlocks(). If false, only the write_block_ and/or read_block_ are pinned
- /// (both are if read_write_ is true).
- bool pinned_;
-
- /// The slow path for AddRow() that is called if there is not sufficient space in
- /// the current block.
- bool AddRowSlow(TupleRow* row, Status* status) noexcept;
-
- /// Copies 'row' into write_block_. Returns false if there is not enough space in
- /// 'write_block_'. After returning false, write_ptr_ may be left pointing to the
- /// partially-written row, and no more data can be written to write_block_.
- template <bool HAS_NULLABLE_TUPLE>
- bool DeepCopyInternal(TupleRow* row) noexcept;
-
- /// Helper function to copy strings in string_slots from tuple into write_block_.
- /// Updates write_ptr_ to the end of the string data added. Returns false if the data
- /// does not fit in the current write block. After returning false, write_ptr_ is left
- /// pointing to the partially-written row, and no more data can be written to
- /// write_block_.
- bool CopyStrings(const Tuple* tuple, const std::vector<SlotDescriptor*>& string_slots);
-
- /// Helper function to deep copy collections in collection_slots from tuple into
- /// write_block_. Updates write_ptr_ to the end of the collection data added. Returns
- /// false if the data does not fit in the current write block.. After returning false,
- /// write_ptr_ is left pointing to the partially-written row, and no more data can be
- /// written to write_block_.
- bool CopyCollections(const Tuple* tuple,
- const std::vector<SlotDescriptor*>& collection_slots);
-
- /// Wrapper of the templated DeepCopyInternal() function.
- bool DeepCopy(TupleRow* row) noexcept;
-
- /// Gets a new block of 'block_len' bytes from the block_mgr_, updating write_block_,
- /// write_tuple_idx_, write_ptr_ and write_end_ptr_. 'null_indicators_size' is the
- /// number of bytes that will be reserved in the block for the null indicators bitmap.
- /// *got_block is set to true if a block was successfully acquired. Null indicators
- /// (if any) will also be reserved and initialized. If there are no blocks available,
- /// *got_block is set to false and write_block_ is unchanged.
- Status NewWriteBlock(
- int64_t block_len, int64_t null_indicators_size, bool* got_block) noexcept;
-
- /// A wrapper around NewWriteBlock(). 'row_size' is the size of the tuple row to be
- /// appended to this block. This function determines the block size required in order
- /// to fit the row and null indicators.
- Status NewWriteBlockForRow(int64_t row_size, bool* got_block) noexcept;
-
- /// Reads the next block from the block_mgr_. This blocks if necessary.
- /// Updates read_block_, read_ptr_, read_tuple_idx_ and read_end_ptr_.
- Status NextReadBlock();
-
- /// Returns the total additional bytes that this row will consume in write_block_ if
- /// appended to the block. This includes the fixed length part of the row and the
- /// data for inlined_string_slots_ and inlined_coll_slots_.
- int64_t ComputeRowSize(TupleRow* row) const noexcept;
-
- /// Unpins block if it is an IO-sized block and updates tracking stats.
- Status UnpinBlock(BufferedBlockMgr::Block* block);
-
- /// Templated GetNext implementations.
- template <bool FILL_INDICES>
- Status GetNextInternal(RowBatch* batch, bool* eos, std::vector<RowIdx>* indices);
- template <bool FILL_INDICES, bool HAS_NULLABLE_TUPLE>
- Status GetNextInternal(RowBatch* batch, bool* eos, std::vector<RowIdx>* indices);
-
- /// Helper function for GetNextInternal(). For each string slot in string_slots,
- /// update StringValue's ptr field to point to the corresponding string data stored
- /// inline in the stream (at the current value of read_ptr_) advance read_ptr_ by the
- /// StringValue's length field.
- void FixUpStringsForRead(const vector<SlotDescriptor*>& string_slots, Tuple* tuple);
-
- /// Helper function for GetNextInternal(). For each collection slot in collection_slots,
- /// recursively update any pointers in the CollectionValue to point to the corresponding
- /// var len data stored inline in the stream, advancing read_ptr_ as data is read.
- /// Assumes that the collection was serialized to the stream in DeepCopy()'s format.
- void FixUpCollectionsForRead(const vector<SlotDescriptor*>& collection_slots,
- Tuple* tuple);
-
- /// Computes the number of bytes needed for null indicators for a block of 'block_size'.
- /// Return 0 if no tuple is nullable. Return -1 if a single row of fixed-size tuples
- /// plus its null indicator (if any) cannot fit in the block.
- int ComputeNumNullIndicatorBytes(int block_size) const;
-
- uint32_t read_block_bytes_remaining() const {
- DCHECK_GE(read_end_ptr_, read_ptr_);
- DCHECK_LE(read_end_ptr_ - read_ptr_, (*read_block_)->buffer_len());
- return read_end_ptr_ - read_ptr_;
- }
-
- uint32_t write_block_bytes_remaining() const {
- DCHECK_GE(write_end_ptr_, write_ptr_);
- DCHECK_LE(write_end_ptr_ - write_ptr_, write_block_->buffer_len());
- return write_end_ptr_ - write_ptr_;
- }
-
-};
-
-}
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-tuple-stream.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream.inline.h b/be/src/runtime/buffered-tuple-stream.inline.h
deleted file mode 100644
index ba6bb8c..0000000
--- a/be/src/runtime/buffered-tuple-stream.inline.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_TUPLE_BUFFERED_STREAM_INLINE_H
-#define IMPALA_RUNTIME_TUPLE_BUFFERED_STREAM_INLINE_H
-
-#include "runtime/buffered-tuple-stream.h"
-
-#include "runtime/descriptors.h"
-#include "runtime/tuple-row.h"
-
-namespace impala {
-
-inline bool BufferedTupleStream::AddRow(TupleRow* row, Status* status) noexcept {
- DCHECK(!closed_);
- if (LIKELY(DeepCopy(row))) return true;
- return AddRowSlow(row, status);
-}
-
-inline uint8_t* BufferedTupleStream::AllocateRow(int fixed_size, int varlen_size,
- uint8_t** varlen_data, Status* status) {
- DCHECK(!closed_);
- DCHECK(!has_nullable_tuple_) << "AllocateRow does not support nullable tuples";
- const int total_size = fixed_size + varlen_size;
- if (UNLIKELY(write_block_ == NULL || write_block_bytes_remaining() < total_size)) {
- bool got_block;
- *status = NewWriteBlockForRow(total_size, &got_block);
- if (!status->ok() || !got_block) return NULL;
- }
- DCHECK(write_block_ != NULL);
- DCHECK(write_block_->is_pinned());
- DCHECK_GE(write_block_bytes_remaining(), total_size);
- ++num_rows_;
- write_block_->AddRow();
-
- uint8_t* fixed_data = write_ptr_;
- write_ptr_ += fixed_size;
- *varlen_data = write_ptr_;
- write_ptr_ += varlen_size;
- return fixed_data;
-}
-
-}
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/bufferpool/buffer-pool.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-pool.cc b/be/src/runtime/bufferpool/buffer-pool.cc
index 9b16112..83f2e6a 100644
--- a/be/src/runtime/bufferpool/buffer-pool.cc
+++ b/be/src/runtime/bufferpool/buffer-pool.cc
@@ -308,6 +308,16 @@ int64_t BufferPool::ClientHandle::GetUnusedReservation() const {
return impl_->reservation()->GetUnusedReservation();
}
+bool BufferPool::ClientHandle::TransferReservationFrom(
+ ReservationTracker* src, int64_t bytes) {
+ return src->TransferReservationTo(impl_->reservation(), bytes);
+}
+
+bool BufferPool::ClientHandle::TransferReservationTo(
+ ReservationTracker* dst, int64_t bytes) {
+ return impl_->reservation()->TransferReservationTo(dst, bytes);
+}
+
void BufferPool::ClientHandle::SaveReservation(SubReservation* dst, int64_t bytes) {
DCHECK_EQ(dst->tracker_->parent(), impl_->reservation());
bool success = impl_->reservation()->TransferReservationTo(dst->tracker_.get(), bytes);
@@ -355,7 +365,7 @@ BufferPool::Client::Client(BufferPool* pool, TmpFileMgr::FileGroup* file_group,
RuntimeProfile* child_profile = profile->CreateChild("Buffer pool", true, true);
reservation_.InitChildTracker(
child_profile, parent_reservation, mem_tracker, reservation_limit);
- counters_.alloc_time = ADD_TIMER(profile, "AllocTime");
+ counters_.alloc_time = ADD_TIMER(child_profile, "AllocTime");
counters_.cumulative_allocations =
ADD_COUNTER(child_profile, "CumulativeAllocations", TUnit::UNIT);
counters_.cumulative_bytes_alloced =
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/bufferpool/buffer-pool.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-pool.h b/be/src/runtime/bufferpool/buffer-pool.h
index f2ff99b..e3df8df 100644
--- a/be/src/runtime/bufferpool/buffer-pool.h
+++ b/be/src/runtime/bufferpool/buffer-pool.h
@@ -338,6 +338,14 @@ class BufferPool::ClientHandle {
int64_t GetUsedReservation() const;
int64_t GetUnusedReservation() const;
+ /// Try to transfer 'bytes' of reservation from 'src' to this client using
+ /// ReservationTracker::TransferReservationTo().
+ bool TransferReservationFrom(ReservationTracker* src, int64_t bytes);
+
+ /// Transfer 'bytes' of reservation from this client to 'dst' using
+ /// ReservationTracker::TransferReservationTo().
+ bool TransferReservationTo(ReservationTracker* dst, int64_t bytes);
+
bool is_registered() const { return impl_ != NULL; }
std::string DebugString() const;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/bufferpool/reservation-tracker.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/reservation-tracker.h b/be/src/runtime/bufferpool/reservation-tracker.h
index 4d525c0..80084bc 100644
--- a/be/src/runtime/bufferpool/reservation-tracker.h
+++ b/be/src/runtime/bufferpool/reservation-tracker.h
@@ -127,6 +127,10 @@ class ReservationTracker {
/// Returns true if the reservation increase was successful or not necessary.
bool IncreaseReservationToFit(int64_t bytes) WARN_UNUSED_RESULT;
+ /// Decrease reservation by 'bytes' on this tracker and all ancestors. This tracker's
+ /// reservation must be at least 'bytes' before calling this method.
+ void DecreaseReservation(int64_t bytes) { DecreaseReservation(bytes, false); }
+
/// Transfer reservation from this tracker to 'other'. Both trackers must be in the
/// same query subtree of the hierarchy. One tracker can be the ancestor of the other,
/// or they can share a common ancestor. The subtree root must be at the query level
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr.cc b/be/src/runtime/disk-io-mgr.cc
index 3393ab3..55042d8 100644
--- a/be/src/runtime/disk-io-mgr.cc
+++ b/be/src/runtime/disk-io-mgr.cc
@@ -83,7 +83,7 @@ DEFINE_int32(num_adls_io_threads, 16, "Number of ADLS I/O threads");
// not introduce seeks. The literature seems to agree that with 8 MB reads, random
// io and sequential io perform similarly.
DEFINE_int32(read_size, 8 * 1024 * 1024, "Read Size (in bytes)");
-DEFINE_int32(min_buffer_size, 1024, "The minimum read buffer size (in bytes)");
+DECLARE_int64(min_buffer_size);
// With 1024B through 8MB buffers, this is up to ~2GB of buffers.
DEFINE_int32(max_free_io_buffers, 128,
@@ -937,9 +937,8 @@ void DiskIoMgr::HandleWriteFinished(
int disk_id = write_range->disk_id_;
// Execute the callback before decrementing the thread count. Otherwise CancelContext()
- // that waits for the disk ref count to be 0 will return, creating a race, e.g.
- // between BufferedBlockMgr::WriteComplete() and BufferedBlockMgr::~BufferedBlockMgr().
- // See IMPALA-1890.
+ // that waits for the disk ref count to be 0 will return, creating a race, e.g. see
+ // IMPALA-1890.
// The status of the write does not affect the status of the writer context.
write_range->callback_(write_status);
{
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/exec-env.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/exec-env.cc b/be/src/runtime/exec-env.cc
index 960e3c9..f2ee6f0 100644
--- a/be/src/runtime/exec-env.cc
+++ b/be/src/runtime/exec-env.cc
@@ -75,6 +75,8 @@ DEFINE_int32(state_store_subscriber_port, 23000,
DEFINE_int32(num_hdfs_worker_threads, 16,
"(Advanced) The number of threads in the global HDFS operation pool");
DEFINE_bool(disable_admission_control, false, "Disables admission control.");
+DEFINE_int64(min_buffer_size, 64 * 1024,
+ "(Advanced) The minimum buffer size to use in the buffer pool");
DECLARE_int32(state_store_port);
DECLARE_int32(num_threads_per_core);
@@ -204,13 +206,14 @@ Status ExecEnv::StartServices() {
// memory limit either based on the available physical memory, or if overcommitting
// is turned off, we use the memory commit limit from /proc/meminfo (see
// IMPALA-1690).
- // --mem_limit="" means no memory limit
+ // --mem_limit="" means no memory limit. TODO: IMPALA-5652: deprecate this mode
int64_t bytes_limit = 0;
bool is_percent;
+ int64_t system_mem;
if (MemInfo::vm_overcommit() == 2 &&
MemInfo::commit_limit() < MemInfo::physical_mem()) {
- bytes_limit = ParseUtil::ParseMemSpec(FLAGS_mem_limit, &is_percent,
- MemInfo::commit_limit());
+ system_mem = MemInfo::commit_limit();
+ bytes_limit = ParseUtil::ParseMemSpec(FLAGS_mem_limit, &is_percent, system_mem);
// There might be the case of misconfiguration, when on a system swap is disabled
// and overcommitting is turned off the actual usable memory is less than the
// available physical memory.
@@ -225,14 +228,23 @@ Status ExecEnv::StartServices() {
<< "/proc/sys/vm/overcommit_memory and "
<< "/proc/sys/vm/overcommit_ratio.";
} else {
- bytes_limit = ParseUtil::ParseMemSpec(FLAGS_mem_limit, &is_percent,
- MemInfo::physical_mem());
+ system_mem = MemInfo::physical_mem();
+ bytes_limit = ParseUtil::ParseMemSpec(FLAGS_mem_limit, &is_percent, system_mem);
}
-
+ // ParseMemSpec returns 0 to mean unlimited. TODO: IMPALA-5652: deprecate this mode.
+ bool no_process_mem_limit = bytes_limit == 0;
if (bytes_limit < 0) {
return Status("Failed to parse mem limit from '" + FLAGS_mem_limit + "'.");
}
+ if (!BitUtil::IsPowerOf2(FLAGS_min_buffer_size)) {
+ return Status(Substitute(
+ "--min_buffer_size must be a power-of-two: $0", FLAGS_min_buffer_size));
+ }
+ int64_t buffer_pool_capacity = BitUtil::RoundDown(
+ no_process_mem_limit ? system_mem : bytes_limit * 4 / 5, FLAGS_min_buffer_size);
+ InitBufferPool(FLAGS_min_buffer_size, buffer_pool_capacity);
+
metrics_->Init(enable_webserver_ ? webserver_.get() : nullptr);
impalad_client_cache_->InitMetrics(metrics_.get(), "impala-server.backends");
catalogd_client_cache_->InitMetrics(metrics_.get(), "catalog.server");
@@ -240,8 +252,8 @@ Status ExecEnv::StartServices() {
metrics_.get(), true, buffer_reservation_.get(), buffer_pool_.get()));
// Limit of -1 means no memory limit.
- mem_tracker_.reset(new MemTracker(
- AggregateMemoryMetrics::TOTAL_USED, bytes_limit > 0 ? bytes_limit : -1, "Process"));
+ mem_tracker_.reset(new MemTracker(AggregateMemoryMetrics::TOTAL_USED,
+ no_process_mem_limit ? -1 : bytes_limit, "Process"));
if (buffer_pool_ != nullptr) {
// Add BufferPool MemTrackers for cached memory that is not tracked against queries
// but is included in process memory consumption.
@@ -270,6 +282,8 @@ Status ExecEnv::StartServices() {
}
LOG(INFO) << "Using global memory limit: "
<< PrettyPrinter::Print(bytes_limit, TUnit::BYTES);
+ LOG(INFO) << "Buffer pool capacity: "
+ << PrettyPrinter::Print(buffer_pool_capacity, TUnit::BYTES);
RETURN_IF_ERROR(disk_io_mgr_->Init(mem_tracker_.get()));
@@ -310,9 +324,8 @@ Status ExecEnv::StartServices() {
return Status::OK();
}
-void ExecEnv::InitBufferPool(int64_t min_page_size, int64_t capacity) {
- DCHECK(buffer_pool_ == nullptr);
- buffer_pool_.reset(new BufferPool(min_page_size, capacity));
+void ExecEnv::InitBufferPool(int64_t min_buffer_size, int64_t capacity) {
+ buffer_pool_.reset(new BufferPool(min_buffer_size, capacity));
buffer_reservation_.reset(new ReservationTracker());
buffer_reservation_->InitRootTracker(nullptr, capacity);
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/exec-env.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/exec-env.h b/be/src/runtime/exec-env.h
index 4674072..63d2e0b 100644
--- a/be/src/runtime/exec-env.h
+++ b/be/src/runtime/exec-env.h
@@ -159,8 +159,8 @@ class ExecEnv {
boost::scoped_ptr<QueryExecMgr> query_exec_mgr_;
/// Query-wide buffer pool and the root reservation tracker for the pool. The
- /// reservation limit is equal to the maximum capacity of the pool.
- /// For now this is only used by backend tests that create them via InitBufferPool();
+ /// reservation limit is equal to the maximum capacity of the pool. Created in
+ /// InitBufferPool().
boost::scoped_ptr<ReservationTracker> buffer_reservation_;
boost::scoped_ptr<BufferPool> buffer_pool_;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/fragment-instance-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/fragment-instance-state.cc b/be/src/runtime/fragment-instance-state.cc
index 2385eab..07b3f1c 100644
--- a/be/src/runtime/fragment-instance-state.cc
+++ b/be/src/runtime/fragment-instance-state.cc
@@ -126,8 +126,6 @@ Status FragmentInstanceState::Prepare() {
profile()->AddChild(timings_profile_);
SCOPED_TIMER(ADD_TIMER(timings_profile_, PREPARE_TIMER_NAME));
- // TODO: move this into a RuntimeState::Init()
- RETURN_IF_ERROR(runtime_state_->CreateBlockMgr());
runtime_state_->InitFilterBank();
// Reserve one main thread from the pool
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/initial-reservations.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/initial-reservations.cc b/be/src/runtime/initial-reservations.cc
new file mode 100644
index 0000000..4987ec3
--- /dev/null
+++ b/be/src/runtime/initial-reservations.cc
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/initial-reservations.h"
+
+#include <limits>
+
+#include <boost/thread/mutex.hpp>
+#include <gflags/gflags.h>
+
+#include "common/logging.h"
+#include "common/object-pool.h"
+#include "runtime/exec-env.h"
+#include "runtime/mem-tracker.h"
+#include "util/debug-util.h"
+
+#include "common/names.h"
+
+using std::numeric_limits;
+
+DECLARE_int32(be_port);
+DECLARE_string(hostname);
+
+namespace impala {
+
+InitialReservations::InitialReservations(ObjectPool* obj_pool,
+ ReservationTracker* query_reservation, MemTracker* query_mem_tracker,
+ int64_t initial_reservation_total_claims)
+ : remaining_initial_reservation_claims_(initial_reservation_total_claims) {
+ MemTracker* initial_reservation_tracker = obj_pool->Add(
+ new MemTracker(-1, "Unclaimed reservations", query_mem_tracker, false));
+ initial_reservations_.InitChildTracker(nullptr, query_reservation,
+ initial_reservation_tracker, numeric_limits<int64_t>::max());
+}
+
+Status InitialReservations::Init(
+ const TUniqueId& query_id, int64_t query_min_reservation) {
+ DCHECK_EQ(0, initial_reservations_.GetReservation()) << "Already inited";
+ if (!initial_reservations_.IncreaseReservation(query_min_reservation)) {
+ return Status(TErrorCode::MINIMUM_RESERVATION_UNAVAILABLE,
+ PrettyPrinter::Print(query_min_reservation, TUnit::BYTES), FLAGS_hostname,
+ FLAGS_be_port, PrintId(query_id),
+ ExecEnv::GetInstance()->process_mem_tracker()->LogUsage());
+ }
+ VLOG_QUERY << "Successfully claimed initial reservations ("
+ << PrettyPrinter::Print(query_min_reservation, TUnit::BYTES) << ") for"
+ << " query " << PrintId(query_id);
+ return Status::OK();
+}
+
+void InitialReservations::Claim(BufferPool::ClientHandle* dst, int64_t bytes) {
+ DCHECK_GE(bytes, 0);
+ lock_guard<SpinLock> l(lock_);
+ DCHECK_LE(bytes, remaining_initial_reservation_claims_);
+ bool success = dst->TransferReservationFrom(&initial_reservations_, bytes);
+ DCHECK(success) << "Planner computation should ensure enough initial reservations";
+ remaining_initial_reservation_claims_ -= bytes;
+}
+
+void InitialReservations::Return(BufferPool::ClientHandle* src, int64_t bytes) {
+ lock_guard<SpinLock> l(lock_);
+ bool success = src->TransferReservationTo(&initial_reservations_, bytes);
+ // No limits on our tracker - no way this should fail.
+ DCHECK(success);
+ // Check to see if we can release any reservation.
+ int64_t excess_reservation =
+ initial_reservations_.GetReservation() - remaining_initial_reservation_claims_;
+ if (excess_reservation > 0) {
+ initial_reservations_.DecreaseReservation(excess_reservation);
+ }
+}
+
+void InitialReservations::ReleaseResources() {
+ initial_reservations_.Close();
+}
+}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/initial-reservations.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/initial-reservations.h b/be/src/runtime/initial-reservations.h
new file mode 100644
index 0000000..dfcb114
--- /dev/null
+++ b/be/src/runtime/initial-reservations.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_RUNTIME_INITIAL_RESERVATIONS_H
+#define IMPALA_RUNTIME_INITIAL_RESERVATIONS_H
+
+#include "common/status.h"
+#include "gen-cpp/Types_types.h" // for TUniqueId
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/reservation-tracker.h"
+#include "util/spinlock.h"
+
+namespace impala {
+
+class ObjectPool;
+
+/**
+ * Manages the pool of initial reservations for different nodes in the plan tree.
+ * Each plan node and sink claims its initial reservation from here, then returns it when
+ * it is done executing. The frontend is responsible for making sure that enough initial
+ * reservation is in this pool for all of the concurrent claims.
+ */
+class InitialReservations {
+ public:
+ /// 'query_reservation' and 'query_mem_tracker' are the top-level trackers for the
+ /// query. This creates trackers for initial reservations under those.
+ /// 'initial_reservation_total_claims' is the total of initial reservations that will be
+ /// claimed over the lifetime of the query. The total bytes claimed via Claim()
+ /// cannot exceed this. Allocated objects are stored in 'obj_pool'.
+ InitialReservations(ObjectPool* obj_pool, ReservationTracker* query_reservation,
+ MemTracker* query_mem_tracker, int64_t initial_reservation_total_claims);
+
+ /// Initialize the query's pool of initial reservations by acquiring the minimum
+ /// reservation required for the query on this host. Fails if the reservation could
+ /// not be acquired, e.g. because it would exceed a pool or process limit.
+ Status Init(
+ const TUniqueId& query_id, int64_t query_min_reservation) WARN_UNUSED_RESULT;
+
+ /// Claim the initial reservation of 'bytes' for 'dst'. Assumes that the transfer will
+ /// not violate any reservation limits on 'dst'.
+ void Claim(BufferPool::ClientHandle* dst, int64_t bytes);
+
+ /// Return the initial reservation of 'bytes' from 'src'. The reservation is returned
+ /// to the pool of reservations if it may be needed to satisfy a subsequent claim or
+ /// otherwise is released.
+ void Return(BufferPool::ClientHandle* src, int64_t bytes);
+
+ /// Release any reservations held onto by this object.
+ void ReleaseResources();
+
+ private:
+ // Protects all below members to ensure that the internal state is consistent.
+ SpinLock lock_;
+
+ // The pool of initial reservations that Claim() returns reservations from and
+ // Return() returns reservations to.
+ ReservationTracker initial_reservations_;
+
+ /// The total bytes of additional reservations that we expect to be claimed.
+ /// initial_reservations_.GetReservation() <= remaining_initial_reservation_claims_.
+ int64_t remaining_initial_reservation_claims_;
+};
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/query-exec-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-exec-mgr.cc b/be/src/runtime/query-exec-mgr.cc
index 6057b52..22c2826 100644
--- a/be/src/runtime/query-exec-mgr.cc
+++ b/be/src/runtime/query-exec-mgr.cc
@@ -124,6 +124,8 @@ void QueryExecMgr::StartQueryHelper(QueryState* qs) {
}
#endif
+ // decrement refcount taken in QueryState::Init();
+ qs->ReleaseInitialReservationRefcount();
// decrement refcount taken in StartQuery()
ReleaseQueryState(qs);
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/query-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-state.cc b/be/src/runtime/query-state.cc
index 21f35fb..64a8c5a 100644
--- a/be/src/runtime/query-state.cc
+++ b/be/src/runtime/query-state.cc
@@ -21,11 +21,12 @@
#include <boost/thread/locks.hpp>
#include "exprs/expr.h"
+#include "runtime/backend-client.h"
#include "runtime/bufferpool/buffer-pool.h"
#include "runtime/bufferpool/reservation-tracker.h"
-#include "runtime/backend-client.h"
#include "runtime/exec-env.h"
#include "runtime/fragment-instance-state.h"
+#include "runtime/initial-reservations.h"
#include "runtime/mem-tracker.h"
#include "runtime/query-exec-mgr.h"
#include "runtime/runtime-state.h"
@@ -37,6 +38,20 @@
using namespace impala;
+// The fraction of the query mem limit that is used for buffer reservations. Most
+// operators that accumulate memory use reservations, so the majority of memory should
+// be allocated to buffer reservations, as a heuristic.
+// TODO: this will go away once all operators use buffer reservations.
+static const double RESERVATION_MEM_FRACTION = 0.8;
+
+// The minimum amount of memory that should be left after buffer reservations.
+// The limit on reservations is computed as:
+// min(query_limit * RESERVATION_MEM_FRACTION,
+// query_limit - RESERVATION_MEM_MIN_REMAINING)
+// TODO: this will go away once all operators use buffer reservations and we have accurate
+// minimum requirements.
+static const int64_t RESERVATION_MEM_MIN_REMAINING = 100 * 1024 * 1024;
+
QueryState::ScopedRef::ScopedRef(const TUniqueId& query_id) {
DCHECK(ExecEnv::GetInstance()->query_exec_mgr() != nullptr);
query_state_ = ExecEnv::GetInstance()->query_exec_mgr()->GetQueryState(query_id);
@@ -49,8 +64,10 @@ QueryState::ScopedRef::~ScopedRef() {
QueryState::QueryState(const TQueryCtx& query_ctx, const string& request_pool)
: query_ctx_(query_ctx),
+ initial_reservation_refcnt_(0),
refcnt_(0),
- is_cancelled_(0) {
+ is_cancelled_(0),
+ query_spilled_(0) {
if (query_ctx_.request_pool.empty()) {
// fix up pool name for tests
DCHECK(!request_pool.empty());
@@ -75,6 +92,7 @@ void QueryState::ReleaseResources() {
// Clean up temporary files.
if (file_group_ != nullptr) file_group_->Close();
// Release any remaining reservation.
+ if (initial_reservations_ != nullptr) initial_reservations_->ReleaseResources();
if (buffer_reservation_ != nullptr) buffer_reservation_->Close();
// Avoid dangling reference from the parent of 'query_mem_tracker_'.
if (query_mem_tracker_ != nullptr) query_mem_tracker_->UnregisterFromParent();
@@ -85,6 +103,7 @@ void QueryState::ReleaseResources() {
QueryState::~QueryState() {
DCHECK(released_resources_);
DCHECK_EQ(refcnt_.Load(), 0);
+ DCHECK_EQ(initial_reservation_refcnt_.Load(), 0);
}
Status QueryState::Init(const TExecQueryFInstancesParams& rpc_params) {
@@ -99,9 +118,8 @@ Status QueryState::Init(const TExecQueryFInstancesParams& rpc_params) {
"is over its memory limit", PrintId(query_id()));
RETURN_IF_ERROR(process_mem_tracker->MemLimitExceeded(NULL, msg, 0));
}
- // Do buffer-pool-related setup if running in a backend test that explicitly created
- // the pool.
- if (exec_env->buffer_pool() != nullptr) RETURN_IF_ERROR(InitBufferPoolState());
+
+ RETURN_IF_ERROR(InitBufferPoolState());
// don't copy query_ctx, it's large and we already did that in the c'tor
rpc_params_.__set_coord_state_idx(rpc_params.coord_state_idx);
@@ -112,6 +130,15 @@ Status QueryState::Init(const TExecQueryFInstancesParams& rpc_params) {
rpc_params_.fragment_instance_ctxs.swap(non_const_params.fragment_instance_ctxs);
rpc_params_.__isset.fragment_instance_ctxs = true;
+ // Claim the query-wide minimum reservation. Do this last so that we don't need
+ // to handle releasing it if a later step fails.
+ initial_reservations_ = obj_pool_.Add(new InitialReservations(&obj_pool_,
+ buffer_reservation_, query_mem_tracker_,
+ query_ctx_.per_host_initial_reservation_total_claims));
+ RETURN_IF_ERROR(
+ initial_reservations_->Init(query_id(), query_ctx_.per_host_min_reservation));
+ DCHECK_EQ(0, initial_reservation_refcnt_.Load());
+ initial_reservation_refcnt_.Add(1); // Decremented in QueryExecMgr::StartQueryHelper().
return Status::OK();
}
@@ -129,19 +156,23 @@ void QueryState::InitMemTrackers() {
Status QueryState::InitBufferPoolState() {
ExecEnv* exec_env = ExecEnv::GetInstance();
- int64_t query_mem_limit = query_mem_tracker_->limit();
- if (query_mem_limit == -1) query_mem_limit = numeric_limits<int64_t>::max();
-
- // TODO: IMPALA-3200: add a default upper bound to buffer pool memory derived from
- // query_mem_limit.
- int64_t max_reservation = numeric_limits<int64_t>::max();
- if (query_options().__isset.max_block_mgr_memory
- && query_options().max_block_mgr_memory > 0) {
- max_reservation = query_options().max_block_mgr_memory;
+ int64_t mem_limit = query_mem_tracker_->lowest_limit();
+ int64_t max_reservation;
+ if (query_options().__isset.buffer_pool_limit
+ && query_options().buffer_pool_limit > 0) {
+ max_reservation = query_options().buffer_pool_limit;
+ } else if (mem_limit == -1) {
+ // No query mem limit. The process-wide reservation limit is the only limit on
+ // reservations.
+ max_reservation = numeric_limits<int64_t>::max();
+ } else {
+ DCHECK_GE(mem_limit, 0);
+ max_reservation = min<int64_t>(
+ mem_limit * RESERVATION_MEM_FRACTION, mem_limit - RESERVATION_MEM_MIN_REMAINING);
+ max_reservation = max<int64_t>(0, max_reservation);
}
+ VLOG_QUERY << "Buffer pool limit for " << PrintId(query_id()) << ": " << max_reservation;
- // TODO: IMPALA-3748: claim the query-wide minimum reservation.
- // For now, rely on exec nodes to grab their minimum reservation during Prepare().
buffer_reservation_ = obj_pool_.Add(new ReservationTracker);
buffer_reservation_->InitChildTracker(
NULL, exec_env->buffer_reservation(), query_mem_tracker_, max_reservation);
@@ -256,6 +287,7 @@ void QueryState::StartFInstances() {
VLOG_QUERY << "StartFInstances(): query_id=" << PrintId(query_id())
<< " #instances=" << rpc_params_.fragment_instance_ctxs.size();
DCHECK_GT(refcnt_.Load(), 0);
+ DCHECK_GT(initial_reservation_refcnt_.Load(), 0) << "Should have been taken in Init()";
// set up desc tbl
DCHECK(query_ctx().__isset.desc_tbl);
@@ -290,6 +322,7 @@ void QueryState::StartFInstances() {
// start new thread to execute instance
refcnt_.Add(1); // decremented in ExecFInstance()
+ initial_reservation_refcnt_.Add(1); // decremented in ExecFInstance()
string thread_name = Substitute(
"exec-finstance (finst:$0)", PrintId(instance_ctx.fragment_instance_id));
Thread t(FragmentInstanceState::FINST_THREAD_GROUP_NAME, thread_name,
@@ -311,6 +344,12 @@ void QueryState::StartFInstances() {
instances_prepared_promise_.Set(prepare_status);
}
+void QueryState::ReleaseInitialReservationRefcount() {
+ int32_t new_val = initial_reservation_refcnt_.Add(-1);
+ DCHECK_GE(new_val, 0);
+ if (new_val == 0) initial_reservations_->ReleaseResources();
+}
+
void QueryState::ExecFInstance(FragmentInstanceState* fis) {
ImpaladMetrics::IMPALA_SERVER_NUM_FRAGMENTS_IN_FLIGHT->Increment(1L);
ImpaladMetrics::IMPALA_SERVER_NUM_FRAGMENTS->Increment(1L);
@@ -327,6 +366,8 @@ void QueryState::ExecFInstance(FragmentInstanceState* fis) {
// initiate cancellation if nobody has done so yet
if (!status.ok()) Cancel();
// decrement refcount taken in StartFInstances()
+ ReleaseInitialReservationRefcount();
+ // decrement refcount taken in StartFInstances()
ExecEnv::GetInstance()->query_exec_mgr()->ReleaseQueryState(this);
}
@@ -345,3 +386,21 @@ void QueryState::PublishFilter(int32_t filter_id, int fragment_idx,
fis->PublishFilter(filter_id, thrift_bloom_filter);
}
}
+
+Status QueryState::StartSpilling(RuntimeState* runtime_state, MemTracker* mem_tracker) {
+ // Return an error message with the root cause of why spilling is disabled.
+ if (query_options().scratch_limit == 0) {
+ return mem_tracker->MemLimitExceeded(
+ runtime_state, "Could not free memory by spilling to disk: scratch_limit is 0");
+ } else if (query_ctx_.disable_spilling) {
+ return mem_tracker->MemLimitExceeded(runtime_state,
+ "Could not free memory by spilling to disk: spilling was disabled by planner. "
+ "Re-enable spilling by setting the query option DISABLE_UNSAFE_SPILLS=false");
+ }
+ // 'file_group_' must be non-NULL for spilling to be enabled.
+ DCHECK(file_group_ != nullptr);
+ if (query_spilled_.CompareAndSwap(0, 1)) {
+ ImpaladMetrics::NUM_QUERIES_SPILLED->Increment(1);
+ }
+ return Status::OK();
+}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/query-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-state.h b/be/src/runtime/query-state.h
index 9ce4316..fc71772 100644
--- a/be/src/runtime/query-state.h
+++ b/be/src/runtime/query-state.h
@@ -34,8 +34,10 @@
namespace impala {
class FragmentInstanceState;
+class InitialReservations;
class MemTracker;
class ReservationTracker;
+class RuntimeState;
/// Central class for all backend execution state (example: the FragmentInstanceStates
/// of the individual fragment instances) created for a particular query.
@@ -110,6 +112,7 @@ class QueryState {
// the following getters are only valid after Prepare()
ReservationTracker* buffer_reservation() const { return buffer_reservation_; }
+ InitialReservations* initial_reservations() const { return initial_reservations_; }
TmpFileMgr::FileGroup* file_group() const { return file_group_; }
const TExecQueryFInstancesParams& rpc_params() const { return rpc_params_; }
@@ -117,8 +120,10 @@ class QueryState {
const DescriptorTbl& desc_tbl() const { return *desc_tbl_; }
/// Sets up state required for fragment execution: memory reservations, etc. Fails
- /// if resources could not be acquired. Uses few cycles and never blocks.
- /// Not idempotent, not thread-safe.
+ /// if resources could not be acquired. On success, acquires an initial reservation
+ /// refcount for the caller, which the caller must release by calling
+ /// ReleaseInitialReservationRefcount().
+ /// Uses few cycles and never blocks. Not idempotent, not thread-safe.
/// The remaining public functions must be called only after Init().
Status Init(const TExecQueryFInstancesParams& rpc_params) WARN_UNUSED_RESULT;
@@ -155,6 +160,12 @@ class QueryState {
/// If there is an error during the rpc, initiates cancellation.
void ReportExecStatus(bool done, const Status& status, FragmentInstanceState* fis);
+ /// Checks whether spilling is enabled for this query. Must be called before the first
+ /// call to BufferPool::Unpin() for the query. Returns OK if spilling is enabled. If
+ /// spilling is not enabled, logs a MEM_LIMIT_EXCEEDED error from
+ /// tracker->MemLimitExceeded() to 'runtime_state'.
+ Status StartSpilling(RuntimeState* runtime_state, MemTracker* mem_tracker);
+
~QueryState();
private:
@@ -162,6 +173,7 @@ class QueryState {
/// test execution
friend class RuntimeState;
+ friend class TestEnv;
static const int DEFAULT_BATCH_SIZE = 1024;
@@ -176,16 +188,21 @@ class QueryState {
/// TODO: find a way not to have to copy this
TExecQueryFInstancesParams rpc_params_;
- /// Buffer reservation for this query (owned by obj_pool_)
- /// Only non-null in backend tests that explicitly enabled the new buffer pool
- /// Set in Prepare().
- /// TODO: this will always be non-null once IMPALA-3200 is done
+ /// Buffer reservation for this query (owned by obj_pool_). Set in Prepare().
ReservationTracker* buffer_reservation_ = nullptr;
- /// Temporary files for this query (owned by obj_pool_)
- /// Only non-null in backend tests the explicitly enabled the new buffer pool
- /// Set in Prepare().
- /// TODO: this will always be non-null once IMPALA-3200 is done
+ /// Pool of buffer reservations used to distribute initial reservations to operators
+ /// in the query. Contains a ReservationTracker that is a child of
+ /// 'buffer_reservation_'. Owned by 'obj_pool_'. Set in Prepare().
+ InitialReservations* initial_reservations_ = nullptr;
+
+ /// Number of fragment instances executing, which may need to claim
+ /// from 'initial_reservations_'.
+ /// TODO: not needed if we call ReleaseResources() in a timely manner (IMPALA-1575).
+ AtomicInt32 initial_reservation_refcnt_;
+
+ /// Temporary files for this query (owned by obj_pool_). Non-null if spilling is
+ /// enabled. Set in Prepare().
TmpFileMgr::FileGroup* file_group_ = nullptr;
/// created in StartFInstances(), owned by obj_pool_
@@ -214,6 +231,11 @@ class QueryState {
/// True if and only if ReleaseResources() has been called.
bool released_resources_ = false;
+ /// Whether the query has spilled. 0 if the query has not spilled. Atomically set to 1
+ /// when the query first starts to spill. Required to correctly maintain the
+ /// "num-queries-spilled" metric.
+ AtomicInt32 query_spilled_;
+
/// Create QueryState w/ refcnt of 0.
/// The query is associated with the resource pool query_ctx.request_pool or
/// 'request_pool', if the former is not set (needed for tests).
@@ -222,13 +244,16 @@ class QueryState {
/// Execute the fragment instance and decrement the refcnt when done.
void ExecFInstance(FragmentInstanceState* fis);
- /// Called from Prepare() to initialize MemTrackers.
+ /// Called from constructor to initialize MemTrackers.
void InitMemTrackers();
- /// Called from Prepare() to setup buffer reservations and the
- /// file group. Fails if required resources are not available.
+ /// Called from Init() to set up buffer reservations and the file group.
Status InitBufferPoolState() WARN_UNUSED_RESULT;
+ /// Decrement 'initial_reservation_refcnt_' and release the initial reservation if it
+ /// goes to zero.
+ void ReleaseInitialReservationRefcount();
+
/// Same behavior as ReportExecStatus().
/// Cancel on error only if instances_started is true.
void ReportExecStatusAux(bool done, const Status& status, FragmentInstanceState* fis,
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/row-batch.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.cc b/be/src/runtime/row-batch.cc
index 11cf363..942ac05 100644
--- a/be/src/runtime/row-batch.cc
+++ b/be/src/runtime/row-batch.cc
@@ -147,9 +147,6 @@ RowBatch::~RowBatch() {
for (int i = 0; i < io_buffers_.size(); ++i) {
ExecEnv::GetInstance()->disk_io_mgr()->ReturnBuffer(move(io_buffers_[i]));
}
- for (int i = 0; i < blocks_.size(); ++i) {
- blocks_[i]->Delete();
- }
for (BufferInfo& buffer_info : buffers_) {
ExecEnv::GetInstance()->buffer_pool()->FreeBuffer(
buffer_info.client, &buffer_info.buffer);
@@ -295,14 +292,6 @@ void RowBatch::AddIoBuffer(unique_ptr<DiskIoMgr::BufferDescriptor> buffer) {
io_buffers_.emplace_back(move(buffer));
}
-void RowBatch::AddBlock(BufferedBlockMgr::Block* block, FlushMode flush) {
- DCHECK(block != NULL);
- DCHECK(block->is_pinned());
- blocks_.push_back(block);
- auxiliary_mem_usage_ += block->buffer_len();
- if (flush == FlushMode::FLUSH_RESOURCES) MarkFlushResources();
-}
-
void RowBatch::AddBuffer(BufferPool::ClientHandle* client,
BufferPool::BufferHandle&& buffer, FlushMode flush) {
auxiliary_mem_usage_ += buffer.len();
@@ -322,10 +311,6 @@ void RowBatch::Reset() {
ExecEnv::GetInstance()->disk_io_mgr()->ReturnBuffer(move(io_buffers_[i]));
}
io_buffers_.clear();
- for (int i = 0; i < blocks_.size(); ++i) {
- blocks_[i]->Delete();
- }
- blocks_.clear();
for (BufferInfo& buffer_info : buffers_) {
ExecEnv::GetInstance()->buffer_pool()->FreeBuffer(
buffer_info.client, &buffer_info.buffer);
@@ -342,10 +327,6 @@ void RowBatch::TransferResourceOwnership(RowBatch* dest) {
dest->AddIoBuffer(move(io_buffers_[i]));
}
io_buffers_.clear();
- for (int i = 0; i < blocks_.size(); ++i) {
- dest->AddBlock(blocks_[i], FlushMode::NO_FLUSH_RESOURCES);
- }
- blocks_.clear();
for (BufferInfo& buffer_info : buffers_) {
dest->AddBuffer(
buffer_info.client, std::move(buffer_info.buffer), FlushMode::NO_FLUSH_RESOURCES);
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/row-batch.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.h b/be/src/runtime/row-batch.h
index 1b75ebb..35a8f14 100644
--- a/be/src/runtime/row-batch.h
+++ b/be/src/runtime/row-batch.h
@@ -25,7 +25,6 @@
#include "codegen/impala-ir.h"
#include "common/compiler-util.h"
#include "common/logging.h"
-#include "runtime/buffered-block-mgr.h"
#include "runtime/bufferpool/buffer-pool.h"
#include "runtime/descriptors.h"
#include "runtime/disk-io-mgr.h"
@@ -207,7 +206,6 @@ class RowBatch {
int row_byte_size() { return num_tuples_per_row_ * sizeof(Tuple*); }
MemPool* tuple_data_pool() { return &tuple_data_pool_; }
int num_io_buffers() const { return io_buffers_.size(); }
- int num_blocks() const { return blocks_.size(); }
int num_buffers() const { return buffers_.size(); }
/// Resets the row batch, returning all resources it has accumulated.
@@ -216,13 +214,6 @@ class RowBatch {
/// Add io buffer to this row batch.
void AddIoBuffer(std::unique_ptr<DiskIoMgr::BufferDescriptor> buffer);
- /// Adds a block to this row batch. The block must be pinned. The blocks must be
- /// deleted when freeing resources. The block's memory remains accounted against
- /// the original owner, even when the ownership of batches is transferred. If the
- /// original owner wants the memory to be released, it should call this with 'mode'
- /// FLUSH_RESOURCES (see MarkFlushResources() for further explanation).
- void AddBlock(BufferedBlockMgr::Block* block, FlushMode flush);
-
/// Adds a buffer to this row batch. The buffer is deleted when freeing resources.
/// The buffer's memory remains accounted against the original owner, even when the
/// ownership of batches is transferred. If the original owner wants the memory to be
@@ -426,10 +417,6 @@ class RowBatch {
/// (i.e. they are not ref counted) so most row batches don't own any.
std::vector<std::unique_ptr<DiskIoMgr::BufferDescriptor>> io_buffers_;
- /// Blocks attached to this row batch. The underlying memory and block manager client
- /// are owned by the BufferedBlockMgr.
- std::vector<BufferedBlockMgr::Block*> blocks_;
-
struct BufferInfo {
BufferPool::ClientHandle* client;
BufferPool::BufferHandle buffer;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/runtime-filter.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter.h b/be/src/runtime/runtime-filter.h
index ab70d4a..7b6066a 100644
--- a/be/src/runtime/runtime-filter.h
+++ b/be/src/runtime/runtime-filter.h
@@ -23,6 +23,7 @@
#include "runtime/runtime-filter-bank.h"
#include "util/bloom-filter.h"
#include "util/spinlock.h"
+#include "util/time.h"
namespace impala {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/runtime-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-state.cc b/be/src/runtime/runtime-state.cc
index 89eec29..ba8e75d 100644
--- a/be/src/runtime/runtime-state.cc
+++ b/be/src/runtime/runtime-state.cc
@@ -17,21 +17,21 @@
#include "runtime/runtime-state.h"
-#include <iostream>
#include <jni.h>
+#include <iostream>
#include <sstream>
#include <string>
-#include "common/logging.h"
#include <boost/algorithm/string/join.hpp>
#include <gutil/strings/substitute.h>
+#include "common/logging.h"
#include "codegen/llvm-codegen.h"
#include "common/object-pool.h"
#include "common/status.h"
#include "exprs/scalar-expr.h"
#include "exprs/scalar-fn-call.h"
-#include "runtime/buffered-block-mgr.h"
+#include "runtime/bufferpool/buffer-pool.h"
#include "runtime/bufferpool/reservation-tracker.h"
#include "runtime/data-stream-mgr.h"
#include "runtime/data-stream-recvr.h"
@@ -54,22 +54,10 @@
#include "common/names.h"
using namespace llvm;
+using strings::Substitute;
DECLARE_int32(max_errors);
-// The fraction of the query mem limit that is used for the block mgr. Operators
-// that accumulate memory all use the block mgr so the majority of the memory should
-// be allocated to the block mgr. The remaining memory is used by the non-spilling
-// operators and should be independent of data size.
-static const float BLOCK_MGR_MEM_FRACTION = 0.8f;
-
-// The minimum amount of memory that must be left after the block mgr reserves the
-// BLOCK_MGR_MEM_FRACTION. The block limit is:
-// min(query_limit * BLOCK_MGR_MEM_FRACTION, query_limit - BLOCK_MGR_MEM_MIN_REMAINING)
-// TODO: this value was picked arbitrarily and the tests are written to rely on this
-// for the minimum memory required to run the query. Revisit.
-static const int64_t BLOCK_MGR_MEM_MIN_REMAINING = 100 * 1024 * 1024;
-
namespace impala {
RuntimeState::RuntimeState(QueryState* query_state, const TPlanFragmentCtx& fragment_ctx,
@@ -82,7 +70,7 @@ RuntimeState::RuntimeState(QueryState* query_state, const TPlanFragmentCtx& frag
query_state->query_ctx().utc_timestamp_string))),
exec_env_(exec_env),
profile_(obj_pool(), "Fragment " + PrintId(instance_ctx.fragment_instance_id)),
- instance_buffer_reservation_(nullptr),
+ instance_buffer_reservation_(new ReservationTracker),
is_cancelled_(false),
root_node_id_(-1) {
Init();
@@ -127,8 +115,7 @@ void RuntimeState::Init() {
instance_mem_tracker_.reset(new MemTracker(
runtime_profile(), -1, runtime_profile()->name(), query_mem_tracker()));
- if (query_state_ != nullptr && exec_env_->buffer_pool() != nullptr) {
- instance_buffer_reservation_ = obj_pool()->Add(new ReservationTracker);
+ if (instance_buffer_reservation_ != nullptr) {
instance_buffer_reservation_->InitChildTracker(&profile_,
query_state_->buffer_reservation(), instance_mem_tracker_.get(),
numeric_limits<int64_t>::max());
@@ -139,28 +126,6 @@ void RuntimeState::InitFilterBank() {
filter_bank_.reset(new RuntimeFilterBank(query_ctx(), this));
}
-Status RuntimeState::CreateBlockMgr() {
- DCHECK(block_mgr_.get() == NULL);
-
- // Compute the max memory the block mgr will use.
- int64_t block_mgr_limit = query_mem_tracker()->lowest_limit();
- if (block_mgr_limit < 0) block_mgr_limit = numeric_limits<int64_t>::max();
- block_mgr_limit = min(static_cast<int64_t>(block_mgr_limit * BLOCK_MGR_MEM_FRACTION),
- block_mgr_limit - BLOCK_MGR_MEM_MIN_REMAINING);
- if (block_mgr_limit < 0) block_mgr_limit = 0;
- if (query_options().__isset.max_block_mgr_memory &&
- query_options().max_block_mgr_memory > 0) {
- block_mgr_limit = query_options().max_block_mgr_memory;
- LOG(WARNING) << "Block mgr mem limit: "
- << PrettyPrinter::Print(block_mgr_limit, TUnit::BYTES);
- }
-
- RETURN_IF_ERROR(BufferedBlockMgr::Create(this, query_mem_tracker(),
- runtime_profile(), exec_env()->tmp_file_mgr(), block_mgr_limit,
- io_mgr()->max_read_buffer_size(), &block_mgr_));
- return Status::OK();
-}
-
Status RuntimeState::CreateCodegen() {
if (codegen_.get() != NULL) return Status::OK();
// TODO: add the fragment ID to the codegen ID as well
@@ -179,6 +144,10 @@ Status RuntimeState::CodegenScalarFns() {
return Status::OK();
}
+Status RuntimeState::StartSpilling(MemTracker* mem_tracker) {
+ return query_state_->StartSpilling(this, mem_tracker);
+}
+
string RuntimeState::ErrorLog() {
lock_guard<SpinLock> l(error_log_lock_);
return PrintErrorMapToString(error_log_);
@@ -270,7 +239,6 @@ void RuntimeState::ReleaseResources() {
if (resource_pool_ != nullptr) {
exec_env_->thread_mgr()->UnregisterPool(resource_pool_);
}
- block_mgr_.reset(); // Release any block mgr memory, if this is the last reference.
codegen_.reset(); // Release any memory associated with codegen.
// Release the reservation, which should be unused at the point.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/runtime-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-state.h b/be/src/runtime/runtime-state.h
index 9a1d0b2..12e7d8c 100644
--- a/be/src/runtime/runtime-state.h
+++ b/be/src/runtime/runtime-state.h
@@ -32,7 +32,7 @@
namespace impala {
-class BufferedBlockMgr;
+class BufferPool;
class DataStreamRecvr;
class DescriptorTbl;
class DiskIoMgr;
@@ -92,9 +92,6 @@ class RuntimeState {
/// Initializes the runtime filter bank.
void InitFilterBank();
- /// Gets/Creates the query wide block mgr.
- Status CreateBlockMgr();
-
QueryState* query_state() const { return query_state_; }
/// Return the query's ObjectPool
ObjectPool* obj_pool() const;
@@ -132,7 +129,7 @@ class RuntimeState {
MemTracker* instance_mem_tracker() { return instance_mem_tracker_.get(); }
MemTracker* query_mem_tracker(); // reference to the query_state_'s memtracker
ReservationTracker* instance_buffer_reservation() {
- return instance_buffer_reservation_;
+ return instance_buffer_reservation_.get();
}
ThreadResourceMgr::ResourcePool* resource_pool() { return resource_pool_; }
@@ -206,11 +203,6 @@ class RuntimeState {
/// Unregisters all reader contexts acquired through AcquireReaderContext().
void UnregisterReaderContexts();
- BufferedBlockMgr* block_mgr() {
- DCHECK(block_mgr_.get() != NULL);
- return block_mgr_.get();
- }
-
inline Status GetQueryStatus() {
// Do a racy check for query_status_ to avoid unnecessary spinlock acquisition.
if (UNLIKELY(!query_status_.ok())) {
@@ -307,21 +299,19 @@ class RuntimeState {
/// TODO: Fix IMPALA-4233
Status CodegenScalarFns();
+ /// Helper to call QueryState::StartSpilling().
+ Status StartSpilling(MemTracker* mem_tracker);
+
/// Release resources and prepare this object for destruction.
void ReleaseResources();
private:
- /// Allow TestEnv to set block_mgr manually for testing.
+ /// Allow TestEnv to use private methods for testing.
friend class TestEnv;
/// Set per-fragment state.
void Init();
- /// Use a custom block manager for the query for testing purposes.
- void set_block_mgr(const std::shared_ptr<BufferedBlockMgr>& block_mgr) {
- block_mgr_ = block_mgr;
- }
-
/// Lock protecting error_log_
SpinLock error_log_lock_;
@@ -382,9 +372,8 @@ class RuntimeState {
boost::scoped_ptr<MemTracker> instance_mem_tracker_;
/// Buffer reservation for this fragment instance - a child of the query buffer
- /// reservation. Non-NULL if 'query_state_' is not NULL and ExecEnv::buffer_pool_
- /// was created by a backend test. Owned by obj_pool().
- ReservationTracker* instance_buffer_reservation_;
+ /// reservation. Non-NULL if 'query_state_' is not NULL.
+ boost::scoped_ptr<ReservationTracker> instance_buffer_reservation_;
/// if true, execution should stop with a CANCELLED status
bool is_cancelled_;
@@ -401,11 +390,6 @@ class RuntimeState {
SpinLock reader_contexts_lock_;
std::vector<DiskIoRequestContext*> reader_contexts_;
- /// BufferedBlockMgr object used to allocate and manage blocks of input data in memory
- /// with a fixed memory budget.
- /// The block mgr is shared by all fragments for this query.
- std::shared_ptr<BufferedBlockMgr> block_mgr_;
-
/// This is the node id of the root node for this plan fragment. This is used as the
/// hash seed and has two useful properties:
/// 1) It is the same for all exec nodes in a fragment, so the resulting hash values
[04/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/sorter.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/sorter.cc b/be/src/runtime/sorter.cc
index b4ef279..ee0e4be 100644
--- a/be/src/runtime/sorter.cc
+++ b/be/src/runtime/sorter.cc
@@ -17,15 +17,20 @@
#include "runtime/sorter.h"
+#include <limits>
+
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int.hpp>
#include <gutil/strings/substitute.h>
-#include "runtime/buffered-block-mgr.h"
+#include "runtime/bufferpool/reservation-tracker.h"
+#include "runtime/exec-env.h"
#include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
#include "runtime/row-batch.h"
#include "runtime/runtime-state.h"
#include "runtime/sorted-run-merger.h"
+#include "util/pretty-printer.h"
#include "util/runtime-profile-counters.h"
#include "common/names.h"
@@ -36,7 +41,7 @@ using namespace strings;
namespace impala {
-// Number of pinned blocks required for a merge with fixed-length data only.
+// Number of pinned pages required for a merge with fixed-length data only.
const int MIN_BUFFERS_PER_MERGE = 3;
// Maximum number of buffers to use in each merge to prevent sorter trying to grab
@@ -46,35 +51,140 @@ const int MIN_BUFFERS_PER_MERGE = 3;
// we should base this on the number of reservations.
const int MAX_BUFFERS_PER_MERGE = 128;
-const string MEM_ALLOC_FAILED_ERROR_MSG = "Failed to allocate block for $0-length "
- "data needed for sorting. Reducing query concurrency or increasing the "
- "memory limit may help this query to complete successfully.";
-
-const string MERGE_FAILED_ERROR_MSG = "Failed to allocate block to merge spilled runs "
+const string MERGE_FAILED_ERROR_MSG = "Failed to allocate page to merge spilled runs "
"during sorting. Only $0 runs could be merged, but must be able to merge at least 2 "
"to make progress. Reducing query concurrency or increasing the memory limit may "
"help this query to complete successfully.";
-/// Delete all non-null blocks in blocks and clear vector.
-static void DeleteAndClearBlocks(vector<BufferedBlockMgr::Block*>* blocks) {
- for (BufferedBlockMgr::Block* block: *blocks) {
- if (block != NULL) block->Delete();
+/// Wrapper around BufferPool::PageHandle that tracks additional info about the page.
+/// The Page can be in four states:
+/// * Closed: The page starts in this state before Init() is called. Calling
+/// ExtractBuffer() or Close() puts the page back in this state. No other operations
+/// are valid on a closed page.
+/// * In memory: the page is pinned and the buffer is in memory. data() is valid. The
+/// page is in this state after Init(). If the page is pinned but not in memory, it
+/// can be brought into this state by calling WaitForBuffer().
+/// * Unpinned: the page was unpinned by calling Unpin(). It is invalid to access the
+/// page's buffer.
+/// * Pinned but not in memory: Pin() was called on the unpinned page, but
+/// WaitForBuffer() has not been called. It is invalid to access the page's buffer.
+class Sorter::Page {
+ public:
+ Page() { Reset(); }
+
+ /// Create a new page of length 'sorter->page_len_' bytes using
+ /// 'sorter->buffer_pool_client_'. Caller must ensure the client has enough
+ /// reservation for the page.
+ Status Init(Sorter* sorter) WARN_UNUSED_RESULT {
+ const BufferPool::BufferHandle* page_buffer;
+ RETURN_IF_ERROR(pool()->CreatePage(sorter->buffer_pool_client_, sorter->page_len_,
+ &handle_, &page_buffer));
+ data_ = page_buffer->data();
+ return Status::OK();
}
- blocks->clear();
-}
-static int NumNonNullBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
- int count = 0;
- for (BufferedBlockMgr::Block* block: blocks) {
- if (block != NULL) ++count;
+ /// Extract the buffer from the page. The page must be in memory. When this function
+ /// returns the page is closed.
+ BufferPool::BufferHandle ExtractBuffer(BufferPool::ClientHandle* client) {
+ DCHECK(data_ != nullptr) << "Page must be in memory";
+ BufferPool::BufferHandle buffer;
+ Status status = pool()->ExtractBuffer(client, &handle_, &buffer);
+ DCHECK(status.ok()) << "Page was in memory, ExtractBuffer() shouldn't fail";
+ Reset();
+ return buffer;
+ }
+
+ /// Allocate 'len' bytes in the current page. The page must be in memory, and the
+ /// amount to allocate cannot exceed BytesRemaining().
+ uint8_t* AllocateBytes(int64_t len) {
+ DCHECK_GE(len, 0);
+ DCHECK_LE(len, BytesRemaining());
+ DCHECK(data_ != nullptr);
+ uint8_t* result = data_ + valid_data_len_;
+ valid_data_len_ += len;
+ return result;
+ }
+
+ /// Free the last 'len' bytes allocated from AllocateBytes(). The page must be in
+ /// memory.
+ void FreeBytes(int64_t len) {
+ DCHECK_GE(len, 0);
+ DCHECK_LE(len, valid_data_len_);
+ DCHECK(data_ != nullptr);
+ valid_data_len_ -= len;
+ }
+
+ /// Return number of bytes remaining in page.
+ int64_t BytesRemaining() { return len() - valid_data_len_; }
+
+ /// Brings a pinned page into memory, if not already in memory, and sets 'data_' to
+ /// point to the page's buffer.
+ Status WaitForBuffer() WARN_UNUSED_RESULT {
+ DCHECK(handle_.is_pinned());
+ if (data_ != nullptr) return Status::OK();
+ const BufferPool::BufferHandle* page_buffer;
+ RETURN_IF_ERROR(handle_.GetBuffer(&page_buffer));
+ data_ = page_buffer->data();
+ return Status::OK();
+ }
+
+ /// Helper to pin the page. Caller must ensure the client has enough reservation
+ /// remaining to pin the page. Only valid to call on an unpinned page.
+ Status Pin(BufferPool::ClientHandle* client) WARN_UNUSED_RESULT {
+ DCHECK(!handle_.is_pinned());
+ return pool()->Pin(client, &handle_);
+ }
+
+ /// Helper to unpin the page.
+ void Unpin(BufferPool::ClientHandle* client) {
+ pool()->Unpin(client, &handle_);
+ data_ = nullptr;
+ }
+
+ /// Destroy the page with 'client'.
+ void Close(BufferPool::ClientHandle* client) {
+ pool()->DestroyPage(client, &handle_);
+ Reset();
}
- return count;
-}
+
+ int64_t valid_data_len() const { return valid_data_len_; }
+ /// Returns a pointer to the start of the page's buffer. Only valid to call if the
+ /// page is in memory.
+ uint8_t* data() const {
+ DCHECK(data_ != nullptr);
+ return data_;
+ }
+ int64_t len() const { return handle_.len(); }
+ bool is_open() const { return handle_.is_open(); }
+ bool is_pinned() const { return handle_.is_pinned(); }
+ std::string DebugString() const { return handle_.DebugString(); }
+
+ private:
+ /// Reset the page to an uninitialized state. 'handle_' must already be closed.
+ void Reset() {
+ DCHECK(!handle_.is_open());
+ valid_data_len_ = 0;
+ data_ = nullptr;
+ }
+
+ /// Helper to get the singleton buffer pool.
+ static BufferPool* pool() { return ExecEnv::GetInstance()->buffer_pool(); }
+
+ BufferPool::PageHandle handle_;
+
+ /// Length of valid data written to the page.
+ int64_t valid_data_len_;
+
+ /// Cached pointer to the buffer in 'handle_'. NULL if the page is unpinned. May be NULL
+ /// or not NULL if the page is pinned. Can be populated by calling WaitForBuffer() on a
+ /// pinned page.
+ uint8_t* data_;
+};
/// A run is a sequence of tuples. The run can be sorted or unsorted (in which case the
-/// Sorter will sort it). A run comprises a sequence of fixed-length blocks containing the
+/// Sorter will sort it). A run comprises a sequence of fixed-length pages containing the
/// tuples themselves (i.e. fixed-len slots that may contain ptrs to var-length data), and
-/// an optional sequence of var-length blocks containing the var-length data.
+/// an optional sequence of var-length pages containing the var-length data.
///
/// Runs are either "initial runs" constructed from the sorter's input by evaluating
/// the expressions in 'sort_tuple_exprs_' or "intermediate runs" constructed
@@ -84,7 +194,7 @@ static int NumNonNullBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
/// sorted run.
///
/// The expected calling sequence of functions is as follows:
-/// * Init() to initialize the run and allocate initial blocks.
+/// * Init() to initialize the run and allocate initial pages.
/// * Add*Batch() to add batches of tuples to the run.
/// * FinalizeInput() to signal that no more batches will be added.
/// * If the run is unsorted, it must be sorted. After that set_sorted() must be called.
@@ -92,29 +202,30 @@ static int NumNonNullBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
/// * PrepareRead() to allocate resources for reading the run.
/// * GetNext() (if there was a single run) or GetNextBatch() (when merging multiple runs)
/// to read from the run.
-/// * Once reading is done, DeleteAllBlocks() should be called to free resources.
+/// * Once reading is done, CloseAllPages() should be called to free resources.
class Sorter::Run {
public:
Run(Sorter* parent, TupleDescriptor* sort_tuple_desc, bool initial_run);
~Run() {
- DCHECK(fixed_len_blocks_.empty());
- DCHECK(var_len_blocks_.empty());
- DCHECK(var_len_copy_block_ == NULL);
+ DCHECK(fixed_len_pages_.empty());
+ DCHECK(var_len_pages_.empty());
+ DCHECK(!var_len_copy_page_.is_open());
}
/// Initialize the run for input rows by allocating the minimum number of required
- /// blocks - one block for fixed-len data added to fixed_len_blocks_, one for the
- /// initially unsorted var-len data added to var_len_blocks_, and one to copy sorted
- /// var-len data into var_len_copy_block_.
- Status Init();
+ /// pages - one page for fixed-len data added to fixed_len_pages_, one for the
+ /// initially unsorted var-len data added to var_len_pages_, and one to copy sorted
+ /// var-len data into var_len_copy_page_.
+ Status Init() WARN_UNUSED_RESULT;
/// Add the rows from 'batch' starting at 'start_index' to the current run. Returns the
- /// number of rows actually added in 'num_processed'. If the run is full (no more blocks
+ /// number of rows actually added in 'num_processed'. If the run is full (no more pages
/// can be allocated), 'num_processed' may be less than the number of remaining rows in
/// the batch. AddInputBatch() materializes the input rows using the expressions in
/// sorter_->sort_tuple_expr_evals_, while AddIntermediateBatch() just copies rows.
- Status AddInputBatch(RowBatch* batch, int start_index, int* num_processed) {
+ Status AddInputBatch(
+ RowBatch* batch, int start_index, int* num_processed) WARN_UNUSED_RESULT {
DCHECK(initial_run_);
if (has_var_len_slots_) {
return AddBatchInternal<true, true>(batch, start_index, num_processed);
@@ -122,7 +233,9 @@ class Sorter::Run {
return AddBatchInternal<false, true>(batch, start_index, num_processed);
}
}
- Status AddIntermediateBatch(RowBatch* batch, int start_index, int* num_processed) {
+
+ Status AddIntermediateBatch(
+ RowBatch* batch, int start_index, int* num_processed) WARN_UNUSED_RESULT {
DCHECK(!initial_run_);
if (has_var_len_slots_) {
return AddBatchInternal<true, false>(batch, start_index, num_processed);
@@ -133,53 +246,53 @@ class Sorter::Run {
/// Called after the final call to Add*Batch() to do any bookkeeping necessary to
/// finalize the run. Must be called before sorting or merging the run.
- Status FinalizeInput();
+ Status FinalizeInput() WARN_UNUSED_RESULT;
- /// Unpins all the blocks in a sorted run. Var-length column data is copied into new
- /// blocks in sorted order. Pointers in the original tuples are converted to offsets
- /// from the beginning of the sequence of var-len data blocks. Returns an error and
- /// may leave some blocks pinned if an error is encountered in the block mgr.
- Status UnpinAllBlocks();
+ /// Unpins all the pages in a sorted run. Var-length column data is copied into new
+ /// pages in sorted order. Pointers in the original tuples are converted to offsets
+ /// from the beginning of the sequence of var-len data pages. Returns an error and
+ /// may leave some pages pinned if an error is encountered.
+ Status UnpinAllPages() WARN_UNUSED_RESULT;
- /// Deletes all blocks.
- void DeleteAllBlocks();
+ /// Closes all pages and clears vectors of pages.
+ void CloseAllPages();
- /// Prepare to read a sorted run. Pins the first block(s) in the run if the run was
+ /// Prepare to read a sorted run. Pins the first page(s) in the run if the run was
/// previously unpinned. If the run was unpinned, try to pin the initial fixed and
- /// var len blocks in the run. If it couldn't pin them, set pinned_all_blocks to false.
- /// In that case, none or one of the initial blocks may be pinned and it is valid to
- /// call PrepareRead() again to retry pinning the remainder. pinned_all_blocks is
- /// always set to true if the run is pinned.
- Status PrepareRead(bool* pinned_all_blocks);
+ /// var len pages in the run. If it couldn't pin them, set pinned to false.
+ /// In that case, none of the initial pages will be pinned and it is valid to
+ /// call PrepareRead() again to retry pinning. pinned is always set to
+ /// true if the run was pinned.
+ Status PrepareRead(bool* pinned) WARN_UNUSED_RESULT;
/// Interface for merger - get the next batch of rows from this run. This run still
/// owns the returned batch. Calls GetNext(RowBatch*, bool*).
- Status GetNextBatch(RowBatch** sorted_batch);
+ Status GetNextBatch(RowBatch** sorted_batch) WARN_UNUSED_RESULT;
/// Fill output_batch with rows from this run. If CONVERT_OFFSET_TO_PTR is true, offsets
/// in var-length slots are converted back to pointers. Only row pointers are copied
/// into output_batch. eos is set to true after all rows from the run are returned.
- /// If eos is true, the returned output_batch has zero rows and has no attached blocks.
- /// If this run was unpinned, one block (two if there are var-len slots) is pinned while
- /// rows are filled into output_batch. The block is unpinned before the next block is
- /// pinned, so at most one (two if there are var-len slots) block(s) will be pinned at
- /// once. If the run was pinned, the blocks are not unpinned and each block is attached
- /// to 'output_batch' once all rows referencing data in the block have been returned,
+ /// If eos is true, the returned output_batch has zero rows and has no attached pages.
+ /// If this run was unpinned, one page (two if there are var-len slots) is pinned while
+ /// rows are filled into output_batch. The page is unpinned before the next page is
+ /// pinned, so at most one (two if there are var-len slots) page(s) will be pinned at
+ /// once. If the run was pinned, the pages are not unpinned and each page is attached
+ /// to 'output_batch' once all rows referencing data in the page have been returned,
/// either in the current batch or previous batches. In both pinned and unpinned cases,
- /// all rows in output_batch will reference at most one fixed-len and one var-len block.
+ /// all rows in output_batch will reference at most one fixed-len and one var-len page.
template <bool CONVERT_OFFSET_TO_PTR>
- Status GetNext(RowBatch* output_batch, bool* eos);
+ Status GetNext(RowBatch* output_batch, bool* eos) WARN_UNUSED_RESULT;
- /// Delete all blocks in 'runs' and clear 'runs'.
+ /// Delete all pages in 'runs' and clear 'runs'.
static void CleanupRuns(deque<Run*>* runs) {
- for (Run* run: *runs) {
- run->DeleteAllBlocks();
+ for (Run* run : *runs) {
+ run->CloseAllPages();
}
runs->clear();
}
- /// Return total amount of fixed and var len data in run, not including blocks that
- /// were already transferred.
+ /// Return total amount of fixed and var len data in run, not including pages that
+ /// were already transferred or closed.
int64_t TotalBytes() const;
inline bool is_pinned() const { return is_pinned_; }
@@ -196,34 +309,42 @@ class Sorter::Run {
/// INITIAL_RUN and HAS_VAR_LEN_SLOTS are template arguments for performance and must
/// match 'initial_run_' and 'has_var_len_slots_'.
template <bool HAS_VAR_LEN_SLOTS, bool INITIAL_RUN>
- Status AddBatchInternal(RowBatch* batch, int start_index, int* num_processed);
+ Status AddBatchInternal(
+ RowBatch* batch, int start_index, int* num_processed) WARN_UNUSED_RESULT;
- /// Finalize the list of blocks: delete empty final blocks and unpin the previous block
+ /// Finalize the list of pages: delete empty final pages and unpin the previous page
/// if the run is unpinned.
- Status FinalizeBlocks(vector<BufferedBlockMgr::Block*>* blocks);
+ Status FinalizePages(vector<Page>* pages) WARN_UNUSED_RESULT;
/// Collect the non-null var-len (e.g. STRING) slots from 'src' in 'var_len_values' and
/// return the total length of all var-len values in 'total_var_len'.
- void CollectNonNullVarSlots(Tuple* src, vector<StringValue*>* var_len_values,
- int* total_var_len);
+ void CollectNonNullVarSlots(
+ Tuple* src, vector<StringValue*>* var_len_values, int* total_var_len);
- enum AddBlockMode { KEEP_PREV_PINNED, UNPIN_PREV };
+ enum AddPageMode { KEEP_PREV_PINNED, UNPIN_PREV };
- /// Try to extend the current run by a block. If 'mode' is KEEP_PREV_PINNED, try to
- /// allocate a new block, which may fail to extend the run due to lack of memory. If
- /// mode is 'UNPIN_PREV', unpin the previous block in block_sequence before allocating
- /// and adding a new block - this never fails due to lack of memory.
+ /// Try to extend the current run by a page. If 'mode' is KEEP_PREV_PINNED, try to
+ /// allocate a new page, which may fail to extend the run due to lack of memory. If
+ /// mode is 'UNPIN_PREV', unpin the previous page in page_sequence before allocating
+ /// and adding a new page - this never fails due to lack of memory.
///
- /// Returns an error status only if the block manager returns an error. If no error is
+ /// Returns an error status only if the buffer pool returns an error. If no error is
/// encountered, sets 'added' to indicate whether the run was extended and returns
- /// Status::OK(). The new block is appended to 'block_sequence'.
- Status TryAddBlock(AddBlockMode mode, vector<BufferedBlockMgr::Block*>* block_sequence,
- bool* added);
+ /// Status::OK(). The new page is appended to 'page_sequence'.
+ Status TryAddPage(
+ AddPageMode mode, vector<Page>* page_sequence, bool* added) WARN_UNUSED_RESULT;
+
+ /// Adds a new page to 'page_sequence'. Caller must ensure enough
+ /// reservation is available to create the page.
+ ///
+ /// Returns an error status only if the buffer pool returns an error. If an error
+ /// is returned 'page_sequence' is left unmodified.
+ Status AddPage(vector<Page>* page_sequence) WARN_UNUSED_RESULT;
- /// Advance to the next read block. If the run is pinned, has no effect. If the run
- /// is unpinned, atomically pin the block at 'block_index' + 1 in 'blocks' and delete
- /// the block at 'block_index'.
- Status PinNextReadBlock(vector<BufferedBlockMgr::Block*>* blocks, int block_index);
+ /// Advance to the next read page. If the run is pinned, has no effect. If the run
+ /// is unpinned, atomically pin the page at 'page_index' + 1 in 'pages' and delete
+ /// the page at 'page_index'.
+ Status PinNextReadPage(vector<Page>* pages, int page_index) WARN_UNUSED_RESULT;
/// Copy the StringValues in 'var_values' to 'dest' in order and update the StringValue
/// ptrs in 'dest' to point to the copied data.
@@ -231,25 +352,41 @@ class Sorter::Run {
/// Copy the StringValues in 'var_values' to 'dest' in order. Update the StringValue
/// ptrs in 'dest' to contain a packed offset for the copied data comprising
- /// block_index and the offset relative to block_start.
- void CopyVarLenDataConvertOffset(const vector<StringValue*>& var_values,
- int block_index, const uint8_t* block_start, uint8_t* dest);
+ /// page_index and the offset relative to page_start.
+ void CopyVarLenDataConvertOffset(const vector<StringValue*>& var_values, int page_index,
+ const uint8_t* page_start, uint8_t* dest);
/// Convert encoded offsets to valid pointers in tuple with layout 'sort_tuple_desc_'.
- /// 'tuple' is modified in-place. Returns true if the pointers refer to the block at
- /// 'var_len_blocks_index_' and were successfully converted or false if the var len
- /// data is in the next block, in which case 'tuple' is unmodified.
+ /// 'tuple' is modified in-place. Returns true if the pointers refer to the page at
+ /// 'var_len_pages_index_' and were successfully converted or false if the var len
+ /// data is in the next page, in which case 'tuple' is unmodified.
bool ConvertOffsetsToPtrs(Tuple* tuple);
- /// Returns true if we have var-len blocks in the run.
- inline bool HasVarLenBlocks() const {
- // Shouldn't have any blocks unless there are slots.
- DCHECK(var_len_blocks_.empty() || has_var_len_slots_);
- return !var_len_blocks_.empty();
+ /// Returns true if we have var-len pages in the run.
+ inline bool HasVarLenPages() const {
+ // Shouldn't have any pages unless there are slots.
+ DCHECK(var_len_pages_.empty() || has_var_len_slots_);
+ return !var_len_pages_.empty();
+ }
+
+ static int NumOpenPages(const vector<Page>& pages) {
+ int count = 0;
+ for (const Page& page : pages) {
+ if (page.is_open()) ++count;
+ }
+ return count;
+ }
+
+ /// Close all open pages and clear vector.
+ void DeleteAndClearPages(vector<Page>* pages) {
+ for (Page& page : *pages) {
+ if (page.is_open()) page.Close(sorter_->buffer_pool_client_);
+ }
+ pages->clear();
}
/// Parent sorter object.
- const Sorter* sorter_;
+ Sorter* const sorter_;
/// Materialized sort tuple. Input rows are materialized into 1 tuple (with descriptor
/// sort_tuple_desc_) before sorting.
@@ -258,10 +395,10 @@ class Sorter::Run {
/// The size in bytes of the sort tuple.
const int sort_tuple_size_;
- /// Number of tuples per block in a run. This gets multiplied with
- /// TupleIterator::block_index_ in various places and to make sure we don't overflow the
+ /// Number of tuples per page in a run. This gets multiplied with
+ /// TupleIterator::page_index_ in various places and to make sure we don't overflow the
/// result of that operation we make this int64_t here.
- const int64_t block_capacity_;
+ const int64_t page_capacity_;
const bool has_var_len_slots_;
@@ -269,7 +406,7 @@ class Sorter::Run {
/// resulting from merging other runs.
const bool initial_run_;
- /// True if all blocks in the run are pinned. Initial runs start off pinned and
+ /// True if all pages in the run are pinned. Initial runs start off pinned and
/// can be unpinned. Intermediate runs are always unpinned.
bool is_pinned_;
@@ -281,27 +418,27 @@ class Sorter::Run {
/// Always true for intermediate runs.
bool is_sorted_;
- /// Sequence of blocks in this run containing the fixed-length portion of the sort
+ /// Sequence of pages in this run containing the fixed-length portion of the sort
/// tuples comprising this run. The data pointed to by the var-len slots are in
- /// var_len_blocks_. A run can have zero blocks if no rows are appended.
- /// If the run is sorted, the tuples in fixed_len_blocks_ will be in sorted order.
- /// fixed_len_blocks_[i] is NULL iff it has been transferred or deleted.
- vector<BufferedBlockMgr::Block*> fixed_len_blocks_;
+ /// var_len_pages_. A run can have zero pages if no rows are appended.
+ /// If the run is sorted, the tuples in fixed_len_pages_ will be in sorted order.
+ /// fixed_len_pages_[i] is closed iff it has been transferred or deleted.
+ vector<Page> fixed_len_pages_;
- /// Sequence of blocks in this run containing the var-length data corresponding to the
- /// var-length columns from fixed_len_blocks_. In intermediate runs, the var-len data is
+ /// Sequence of pages in this run containing the var-length data corresponding to the
+ /// var-length columns from fixed_len_pages_. In intermediate runs, the var-len data is
/// always stored in the same order as the fixed-length tuples. In initial runs, the
/// var-len data is initially in unsorted order, but is reshuffled into sorted order in
- /// UnpinAllBlocks(). A run can have no var len blocks if there are no var len slots or
+ /// UnpinAllPages(). A run can have no var len pages if there are no var len slots or
/// if all the var len data is empty or NULL.
- /// var_len_blocks_[i] is NULL iff it has been transferred or deleted.
- vector<BufferedBlockMgr::Block*> var_len_blocks_;
+ /// var_len_pages_[i] is closed iff it has been transferred or deleted.
+ vector<Page> var_len_pages_;
- /// For initial unsorted runs, an extra pinned block is needed to reorder var-len data
- /// into fixed order in UnpinAllBlocks(). 'var_len_copy_block_' stores this extra
- /// block. Deleted in UnpinAllBlocks().
+ /// For initial unsorted runs, an extra pinned page is needed to reorder var-len data
+ /// into fixed order in UnpinAllPages(). 'var_len_copy_page_' stores this extra
+ /// page. Deleted in UnpinAllPages().
/// TODO: in case of in-memory runs, this could be deleted earlier to free up memory.
- BufferedBlockMgr::Block* var_len_copy_block_;
+ Page var_len_copy_page_;
/// Number of tuples added so far to this run.
int64_t num_tuples_;
@@ -313,18 +450,18 @@ class Sorter::Run {
scoped_ptr<RowBatch> buffered_batch_;
/// Members used when a run is read in GetNext().
- /// The index into 'fixed_' and 'var_len_blocks_' of the blocks being read in GetNext().
- int fixed_len_blocks_index_;
- int var_len_blocks_index_;
+ /// The index into 'fixed_' and 'var_len_pages_' of the pages being read in GetNext().
+ int fixed_len_pages_index_;
+ int var_len_pages_index_;
/// If true, the last call to GetNext() reached the end of the previous fixed or
- /// var-len block. The next call to GetNext() must increment 'fixed_len_blocks_index_'
- /// or 'var_len_blocks_index_'. It must also pin the next block if the run is unpinned.
- bool end_of_fixed_len_block_;
- bool end_of_var_len_block_;
+ /// var-len page. The next call to GetNext() must increment 'fixed_len_pages_index_'
+ /// or 'var_len_pages_index_'. It must also pin the next page if the run is unpinned.
+ bool end_of_fixed_len_page_;
+ bool end_of_var_len_page_;
- /// Offset into the current fixed length data block being processed.
- int fixed_len_block_offset_;
+ /// Offset into the current fixed length data page being processed.
+ int fixed_len_page_offset_;
};
/// Helper class used to iterate over tuples in a run during sorting.
@@ -340,7 +477,7 @@ class Sorter::TupleIterator {
/// Default constructor used for local variable. Produces invalid iterator that must
/// be assigned before use.
TupleIterator() : index_(-1), tuple_(NULL), buffer_start_index_(-1),
- buffer_end_index_(-1), block_index_(-1) { }
+ buffer_end_index_(-1), page_index_(-1) { }
/// Create an iterator pointing to the first tuple in the run.
static inline TupleIterator Begin(Sorter::Run* run) { return TupleIterator(run, 0); }
@@ -351,8 +488,8 @@ class Sorter::TupleIterator {
}
/// Increments 'index_' and sets 'tuple_' to point to the next tuple in the run.
- /// Increments 'block_index_' and advances to the next block if the next tuple is in
- /// the next block. Can be advanced one past the last tuple in the run, but is not
+ /// Increments 'page_index_' and advances to the next page if the next tuple is in
+ /// the next page. Can be advanced one past the last tuple in the run, but is not
/// valid to dereference 'tuple_' in that case. 'run' and 'tuple_size' are passed as
/// arguments to avoid redundantly storing the same values in multiple iterators in
/// perf-critical algorithms.
@@ -370,13 +507,13 @@ class Sorter::TupleIterator {
}
private:
- // Move to the next block in the run (or do nothing if at end of run).
+ // Move to the next page in the run (or do nothing if at end of run).
// This is the slow path for Next();
- void NextBlock(Sorter::Run* run, int tuple_size);
+ void NextPage(Sorter::Run* run, int tuple_size);
- // Move to the previous block in the run (or do nothing if at beginning of run).
+ // Move to the previous page in the run (or do nothing if at beginning of run).
// This is the slow path for Prev();
- void PrevBlock(Sorter::Run* run, int tuple_size);
+ void PrevPage(Sorter::Run* run, int tuple_size);
/// Index of the current tuple in the run.
/// Can be -1 or run->num_rows() if Next() or Prev() moves iterator outside of run.
@@ -387,15 +524,15 @@ class Sorter::TupleIterator {
/// iterator outside of run.
uint8_t* tuple_;
- /// Indices of start and end tuples of block at block_index_. I.e. the current block
+ /// Indices of start and end tuples of page at page_index_. I.e. the current page
/// has tuples with indices in range [buffer_start_index_, buffer_end_index_).
int64_t buffer_start_index_;
int64_t buffer_end_index_;
- /// Index into fixed_len_blocks_ of the block containing the current tuple.
- /// If index_ is negative or past end of run, will point to the first or last block
+ /// Index into fixed_len_pages_ of the page containing the current tuple.
+ /// If index_ is negative or past end of run, will point to the first or last page
/// in run respectively.
- int block_index_;
+ int page_index_;
};
/// Sorts a sequence of tuples from a run in place using a provided tuple comparator.
@@ -404,16 +541,16 @@ class Sorter::TupleIterator {
/// instance to check for cancellation during an in-memory sort.
class Sorter::TupleSorter {
public:
- TupleSorter(const TupleRowComparator& comparator, int64_t block_size,
- int tuple_size, RuntimeState* state);
+ TupleSorter(const TupleRowComparator& comparator, int64_t page_size, int tuple_size,
+ RuntimeState* state);
~TupleSorter();
/// Performs a quicksort for tuples in 'run' followed by an insertion sort to
- /// finish smaller blocks. Only valid to call if this is an initial run that has not
+ /// finish smaller ranges. Only valid to call if this is an initial run that has not
/// yet been sorted. Returns an error status if any error is encountered or if the
/// query is cancelled.
- Status Sort(Run* run);
+ Status Sort(Run* run) WARN_UNUSED_RESULT;
private:
static const int INSERTION_THRESHOLD = 16;
@@ -451,7 +588,8 @@ class Sorter::TupleSorter {
/// Perform an insertion sort for rows in the range [begin, end) in a run.
/// Only valid to call for ranges of size at least 1.
- Status InsertionSort(const TupleIterator& begin, const TupleIterator& end);
+ Status InsertionSort(
+ const TupleIterator& begin, const TupleIterator& end) WARN_UNUSED_RESULT;
/// Partitions the sequence of tuples in the range [begin, end) in a run into two
/// groups around the pivot tuple - i.e. tuples in first group are <= the pivot, and
@@ -459,12 +597,12 @@ class Sorter::TupleSorter {
/// groups and the index to the first element in the second group is returned in 'cut'.
/// Return an error status if any error is encountered or if the query is cancelled.
Status Partition(TupleIterator begin, TupleIterator end, const Tuple* pivot,
- TupleIterator* cut);
+ TupleIterator* cut) WARN_UNUSED_RESULT;
/// Performs a quicksort of rows in the range [begin, end) followed by insertion sort
/// for smaller groups of elements. Return an error status for any errors or if the
/// query is cancelled.
- Status SortHelper(TupleIterator begin, TupleIterator end);
+ Status SortHelper(TupleIterator begin, TupleIterator end) WARN_UNUSED_RESULT;
/// Select a pivot to partition [begin, end).
Tuple* SelectPivot(TupleIterator begin, TupleIterator end);
@@ -477,45 +615,33 @@ class Sorter::TupleSorter {
};
// Sorter::Run methods
-Sorter::Run::Run(Sorter* parent, TupleDescriptor* sort_tuple_desc,
- bool initial_run)
+Sorter::Run::Run(Sorter* parent, TupleDescriptor* sort_tuple_desc, bool initial_run)
: sorter_(parent),
sort_tuple_desc_(sort_tuple_desc),
sort_tuple_size_(sort_tuple_desc->byte_size()),
- block_capacity_(parent->block_mgr_->max_block_size() / sort_tuple_size_),
+ page_capacity_(parent->page_len_ / sort_tuple_size_),
has_var_len_slots_(sort_tuple_desc->HasVarlenSlots()),
initial_run_(initial_run),
is_pinned_(initial_run),
is_finalized_(false),
is_sorted_(!initial_run),
- var_len_copy_block_(NULL),
- num_tuples_(0) { }
+ num_tuples_(0) {}
Status Sorter::Run::Init() {
- BufferedBlockMgr::Block* block = NULL;
- RETURN_IF_ERROR(
- sorter_->block_mgr_->GetNewBlock(sorter_->block_mgr_client_, NULL, &block));
- if (block == NULL) {
- return sorter_->mem_tracker_->MemLimitExceeded(
- sorter_->state_, Substitute(MEM_ALLOC_FAILED_ERROR_MSG, "fixed"));
- }
- fixed_len_blocks_.push_back(block);
+ int num_to_create = 1 + has_var_len_slots_ + (has_var_len_slots_ && initial_run_);
+ int64_t required_mem = num_to_create * sorter_->page_len_;
+ if (!sorter_->buffer_pool_client_->IncreaseReservationToFit(required_mem)) {
+ return Status(Substitute(
+ "Unexpected error trying to reserve $0 bytes for a sorted run: $1",
+ required_mem, sorter_->buffer_pool_client_->DebugString()));
+ }
+
+ RETURN_IF_ERROR(AddPage(&fixed_len_pages_));
if (has_var_len_slots_) {
- RETURN_IF_ERROR(
- sorter_->block_mgr_->GetNewBlock(sorter_->block_mgr_client_, NULL, &block));
- if (block == NULL) {
- return sorter_->mem_tracker_->MemLimitExceeded(
- sorter_->state_, Substitute(MEM_ALLOC_FAILED_ERROR_MSG, "variable"));
- }
- var_len_blocks_.push_back(block);
+ RETURN_IF_ERROR(AddPage(&var_len_pages_));
if (initial_run_) {
- // Need additional var len block to reorder var len data in UnpinAllBlocks().
- RETURN_IF_ERROR(sorter_->block_mgr_->GetNewBlock(
- sorter_->block_mgr_client_, NULL, &var_len_copy_block_));
- if (var_len_copy_block_ == NULL) {
- return sorter_->mem_tracker_->MemLimitExceeded(
- sorter_->state_, Substitute(MEM_ALLOC_FAILED_ERROR_MSG, "variable"));
- }
+ // Need additional var len page to reorder var len data in UnpinAllPages().
+ RETURN_IF_ERROR(var_len_copy_page_.Init(sorter_));
}
}
if (initial_run_) {
@@ -527,14 +653,15 @@ Status Sorter::Run::Init() {
}
template <bool HAS_VAR_LEN_SLOTS, bool INITIAL_RUN>
-Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_processed) {
+Status Sorter::Run::AddBatchInternal(
+ RowBatch* batch, int start_index, int* num_processed) {
DCHECK(!is_finalized_);
- DCHECK(!fixed_len_blocks_.empty());
+ DCHECK(!fixed_len_pages_.empty());
DCHECK_EQ(HAS_VAR_LEN_SLOTS, has_var_len_slots_);
DCHECK_EQ(INITIAL_RUN, initial_run_);
*num_processed = 0;
- BufferedBlockMgr::Block* cur_fixed_len_block = fixed_len_blocks_.back();
+ Page* cur_fixed_len_page = &fixed_len_pages_.back();
if (!INITIAL_RUN) {
// For intermediate merges, the input row is the sort tuple.
@@ -543,13 +670,13 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
}
/// Keep initial unsorted runs pinned in memory so we can sort them.
- const AddBlockMode add_mode = INITIAL_RUN ? KEEP_PREV_PINNED : UNPIN_PREV;
+ const AddPageMode add_mode = INITIAL_RUN ? KEEP_PREV_PINNED : UNPIN_PREV;
- // Input rows are copied/materialized into tuples allocated in fixed_len_blocks_.
- // The variable length column data are copied into blocks stored in var_len_blocks_.
+ // Input rows are copied/materialized into tuples allocated in fixed_len_pages_.
+ // The variable length column data are copied into pages stored in var_len_pages_.
// Input row processing is split into two loops.
- // The inner loop processes as many input rows as will fit in cur_fixed_len_block.
- // The outer loop allocates a new block for fixed-len data if the input batch is
+ // The inner loop processes as many input rows as will fit in cur_fixed_len_page.
+ // The outer loop allocates a new page for fixed-len data if the input batch is
// not exhausted.
// cur_input_index is the index into the input 'batch' of the current input row being
@@ -559,22 +686,23 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
string_values.reserve(sort_tuple_desc_->string_slots().size());
while (cur_input_index < batch->num_rows()) {
// tuples_remaining is the number of tuples to copy/materialize into
- // cur_fixed_len_block.
- int tuples_remaining = cur_fixed_len_block->BytesRemaining() / sort_tuple_size_;
+ // cur_fixed_len_page.
+ int tuples_remaining = cur_fixed_len_page->BytesRemaining() / sort_tuple_size_;
tuples_remaining = min(batch->num_rows() - cur_input_index, tuples_remaining);
for (int i = 0; i < tuples_remaining; ++i) {
int total_var_len = 0;
TupleRow* input_row = batch->GetRow(cur_input_index);
- Tuple* new_tuple = cur_fixed_len_block->Allocate<Tuple>(sort_tuple_size_);
+ Tuple* new_tuple =
+ reinterpret_cast<Tuple*>(cur_fixed_len_page->AllocateBytes(sort_tuple_size_));
if (INITIAL_RUN) {
new_tuple->MaterializeExprs<HAS_VAR_LEN_SLOTS, true>(input_row,
*sort_tuple_desc_, sorter_->sort_tuple_expr_evals_, NULL,
&string_values, &total_var_len);
- if (total_var_len > sorter_->block_mgr_->max_block_size()) {
- return Status(ErrorMsg(TErrorCode::INTERNAL_ERROR, Substitute(
- "Variable length data in a single tuple larger than block size $0 > $1",
- total_var_len, sorter_->block_mgr_->max_block_size())));
+ if (total_var_len > sorter_->page_len_) {
+ return Status(TErrorCode::MAX_ROW_SIZE,
+ PrettyPrinter::Print(total_var_len, TUnit::BYTES), sorter_->node_id_,
+ PrettyPrinter::Print(0, TUnit::BYTES));
}
} else {
memcpy(new_tuple, input_row->GetTuple(0), sort_tuple_size_);
@@ -584,17 +712,17 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
}
if (HAS_VAR_LEN_SLOTS) {
- DCHECK_GT(var_len_blocks_.size(), 0);
- BufferedBlockMgr::Block* cur_var_len_block = var_len_blocks_.back();
- if (cur_var_len_block->BytesRemaining() < total_var_len) {
+ DCHECK_GT(var_len_pages_.size(), 0);
+ Page* cur_var_len_page = &var_len_pages_.back();
+ if (cur_var_len_page->BytesRemaining() < total_var_len) {
bool added;
- RETURN_IF_ERROR(TryAddBlock(add_mode, &var_len_blocks_, &added));
+ RETURN_IF_ERROR(TryAddPage(add_mode, &var_len_pages_, &added));
if (added) {
- cur_var_len_block = var_len_blocks_.back();
+ cur_var_len_page = &var_len_pages_.back();
} else {
- // There was not enough space in the last var-len block for this tuple, and
+ // There was not enough space in the last var-len page for this tuple, and
// the run could not be extended. Return the fixed-len allocation and exit.
- cur_fixed_len_block->ReturnAllocation(sort_tuple_size_);
+ cur_fixed_len_page->FreeBytes(sort_tuple_size_);
return Status::OK();
}
}
@@ -605,13 +733,13 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
DCHECK(new_tuple->IsNull(coll_slot->null_indicator_offset()));
}
- uint8_t* var_data_ptr = cur_var_len_block->Allocate<uint8_t>(total_var_len);
+ uint8_t* var_data_ptr = cur_var_len_page->AllocateBytes(total_var_len);
if (INITIAL_RUN) {
CopyVarLenData(string_values, var_data_ptr);
} else {
- DCHECK_EQ(var_len_blocks_.back(), cur_var_len_block);
- CopyVarLenDataConvertOffset(string_values, var_len_blocks_.size() - 1,
- reinterpret_cast<uint8_t*>(cur_var_len_block->buffer()), var_data_ptr);
+ DCHECK_EQ(&var_len_pages_.back(), cur_var_len_page);
+ CopyVarLenDataConvertOffset(string_values, var_len_pages_.size() - 1,
+ cur_var_len_page->data(), var_data_ptr);
}
}
++num_tuples_;
@@ -619,13 +747,13 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
++cur_input_index;
}
- // If there are still rows left to process, get a new block for the fixed-length
+ // If there are still rows left to process, get a new page for the fixed-length
// tuples. If the run is already too long, return.
if (cur_input_index < batch->num_rows()) {
bool added;
- RETURN_IF_ERROR(TryAddBlock(add_mode, &fixed_len_blocks_, &added));
+ RETURN_IF_ERROR(TryAddPage(add_mode, &fixed_len_pages_, &added));
if (!added) return Status::OK();
- cur_fixed_len_block = fixed_len_blocks_.back();
+ cur_fixed_len_page = &fixed_len_pages_.back();
}
}
return Status::OK();
@@ -634,158 +762,146 @@ Status Sorter::Run::AddBatchInternal(RowBatch* batch, int start_index, int* num_
Status Sorter::Run::FinalizeInput() {
DCHECK(!is_finalized_);
- RETURN_IF_ERROR(FinalizeBlocks(&fixed_len_blocks_));
+ RETURN_IF_ERROR(FinalizePages(&fixed_len_pages_));
if (has_var_len_slots_) {
- RETURN_IF_ERROR(FinalizeBlocks(&var_len_blocks_));
+ RETURN_IF_ERROR(FinalizePages(&var_len_pages_));
}
is_finalized_ = true;
return Status::OK();
}
-Status Sorter::Run::FinalizeBlocks(vector<BufferedBlockMgr::Block*>* blocks) {
- DCHECK_GT(blocks->size(), 0);
- BufferedBlockMgr::Block* last_block = blocks->back();
- if (last_block->valid_data_len() > 0) {
+Status Sorter::Run::FinalizePages(vector<Page>* pages) {
+ DCHECK_GT(pages->size(), 0);
+ Page* last_page = &pages->back();
+ if (last_page->valid_data_len() > 0) {
DCHECK_EQ(initial_run_, is_pinned_);
if (!is_pinned_) {
- // Unpin the last block of this unpinned run. We've finished writing the run so
- // all blocks in the run can now be unpinned.
- RETURN_IF_ERROR(last_block->Unpin());
+ // Unpin the last page of this unpinned run. We've finished writing the run so
+ // all pages in the run can now be unpinned.
+ last_page->Unpin(sorter_->buffer_pool_client_);
}
} else {
- last_block->Delete();
- blocks->pop_back();
+ last_page->Close(sorter_->buffer_pool_client_);
+ pages->pop_back();
}
return Status::OK();
}
-void Sorter::Run::DeleteAllBlocks() {
- DeleteAndClearBlocks(&fixed_len_blocks_);
- DeleteAndClearBlocks(&var_len_blocks_);
- if (var_len_copy_block_ != NULL) var_len_copy_block_->Delete();
- var_len_copy_block_ = NULL;
+void Sorter::Run::CloseAllPages() {
+ DeleteAndClearPages(&fixed_len_pages_);
+ DeleteAndClearPages(&var_len_pages_);
+ if (var_len_copy_page_.is_open()) {
+ var_len_copy_page_.Close(sorter_->buffer_pool_client_);
+ }
}
-Status Sorter::Run::UnpinAllBlocks() {
+Status Sorter::Run::UnpinAllPages() {
DCHECK(is_sorted_);
DCHECK(initial_run_);
DCHECK(is_pinned_);
DCHECK(is_finalized_);
- // A list of var len blocks to replace 'var_len_blocks_'. Note that after we are done
- // we may have a different number of blocks, because internal fragmentation may leave
- // slightly different amounts of wasted space at the end of each block.
- // We need to be careful to clean up these blocks if we run into an error in this method.
- vector<BufferedBlockMgr::Block*> sorted_var_len_blocks;
- sorted_var_len_blocks.reserve(var_len_blocks_.size());
+ // A list of var len pages to replace 'var_len_pages_'. Note that after we are done
+ // we may have a different number of pages, because internal fragmentation may leave
+ // slightly different amounts of wasted space at the end of each page.
+ // We need to be careful to clean up these pages if we run into an error in this method.
+ vector<Page> sorted_var_len_pages;
+ sorted_var_len_pages.reserve(var_len_pages_.size());
vector<StringValue*> string_values;
int total_var_len;
string_values.reserve(sort_tuple_desc_->string_slots().size());
- BufferedBlockMgr::Block* cur_sorted_var_len_block = NULL;
- if (HasVarLenBlocks()) {
- DCHECK(var_len_copy_block_ != NULL);
- sorted_var_len_blocks.push_back(var_len_copy_block_);
- cur_sorted_var_len_block = var_len_copy_block_;
- // Set var_len_copy_block_ to NULL since it was moved to var_len_blocks_.
- var_len_copy_block_ = NULL;
+ Page* cur_sorted_var_len_page = NULL;
+ if (HasVarLenPages()) {
+ DCHECK(var_len_copy_page_.is_open());
+ sorted_var_len_pages.push_back(move(var_len_copy_page_));
+ cur_sorted_var_len_page = &sorted_var_len_pages.back();
} else if (has_var_len_slots_) {
- // If we don't have any var-len blocks, clean up the copy block.
- DCHECK(var_len_copy_block_ != NULL);
- var_len_copy_block_->Delete();
- var_len_copy_block_ = NULL;
+ // If we don't have any var-len pages, clean up the copy page.
+ DCHECK(var_len_copy_page_.is_open());
+ var_len_copy_page_.Close(sorter_->buffer_pool_client_);
} else {
- DCHECK(var_len_copy_block_ == NULL);
+ DCHECK(!var_len_copy_page_.is_open());
}
Status status;
- for (int i = 0; i < fixed_len_blocks_.size(); ++i) {
- BufferedBlockMgr::Block* cur_fixed_block = fixed_len_blocks_[i];
+ for (int i = 0; i < fixed_len_pages_.size(); ++i) {
+ Page* cur_fixed_page = &fixed_len_pages_[i];
// Skip converting the pointers if no var-len slots, or if all the values are null
// or zero-length. This will possibly leave zero-length pointers pointing to
// arbitrary memory, but zero-length data cannot be dereferenced anyway.
- if (HasVarLenBlocks()) {
- for (int block_offset = 0; block_offset < cur_fixed_block->valid_data_len();
- block_offset += sort_tuple_size_) {
- Tuple* cur_tuple =
- reinterpret_cast<Tuple*>(cur_fixed_block->buffer() + block_offset);
+ if (HasVarLenPages()) {
+ for (int page_offset = 0; page_offset < cur_fixed_page->valid_data_len();
+ page_offset += sort_tuple_size_) {
+ Tuple* cur_tuple = reinterpret_cast<Tuple*>(cur_fixed_page->data() + page_offset);
CollectNonNullVarSlots(cur_tuple, &string_values, &total_var_len);
- DCHECK(cur_sorted_var_len_block != NULL);
- if (cur_sorted_var_len_block->BytesRemaining() < total_var_len) {
+ DCHECK(cur_sorted_var_len_page->is_open());
+ if (cur_sorted_var_len_page->BytesRemaining() < total_var_len) {
bool added;
- status = TryAddBlock(UNPIN_PREV, &sorted_var_len_blocks, &added);
- if (!status.ok()) goto cleanup_blocks;
- DCHECK(added) << "TryAddBlock() with UNPIN_PREV should not fail to add";
- cur_sorted_var_len_block = sorted_var_len_blocks.back();
+ status = TryAddPage(UNPIN_PREV, &sorted_var_len_pages, &added);
+ if (!status.ok()) goto cleanup_pages;
+ DCHECK(added) << "TryAddPage() with UNPIN_PREV should not fail to add";
+ cur_sorted_var_len_page = &sorted_var_len_pages.back();
}
- uint8_t* var_data_ptr =
- cur_sorted_var_len_block->Allocate<uint8_t>(total_var_len);
- DCHECK_EQ(sorted_var_len_blocks.back(), cur_sorted_var_len_block);
- CopyVarLenDataConvertOffset(string_values, sorted_var_len_blocks.size() - 1,
- reinterpret_cast<uint8_t*>(cur_sorted_var_len_block->buffer()), var_data_ptr);
+ uint8_t* var_data_ptr = cur_sorted_var_len_page->AllocateBytes(total_var_len);
+ DCHECK_EQ(&sorted_var_len_pages.back(), cur_sorted_var_len_page);
+ CopyVarLenDataConvertOffset(string_values, sorted_var_len_pages.size() - 1,
+ cur_sorted_var_len_page->data(), var_data_ptr);
}
}
- status = cur_fixed_block->Unpin();
- if (!status.ok()) goto cleanup_blocks;
+ cur_fixed_page->Unpin(sorter_->buffer_pool_client_);
}
- if (HasVarLenBlocks()) {
- DCHECK_GT(sorted_var_len_blocks.back()->valid_data_len(), 0);
- status = sorted_var_len_blocks.back()->Unpin();
- if (!status.ok()) goto cleanup_blocks;
+ if (HasVarLenPages()) {
+ DCHECK_GT(sorted_var_len_pages.back().valid_data_len(), 0);
+ sorted_var_len_pages.back().Unpin(sorter_->buffer_pool_client_);
}
- // Clear var_len_blocks_ and replace with it with the contents of sorted_var_len_blocks
- DeleteAndClearBlocks(&var_len_blocks_);
- sorted_var_len_blocks.swap(var_len_blocks_);
+ // Clear var_len_pages_ and replace with it with the contents of sorted_var_len_pages
+ DeleteAndClearPages(&var_len_pages_);
+ sorted_var_len_pages.swap(var_len_pages_);
is_pinned_ = false;
sorter_->spilled_runs_counter_->Add(1);
return Status::OK();
-cleanup_blocks:
- DeleteAndClearBlocks(&sorted_var_len_blocks);
+cleanup_pages:
+ DeleteAndClearPages(&sorted_var_len_pages);
return status;
}
-Status Sorter::Run::PrepareRead(bool* pinned_all_blocks) {
+Status Sorter::Run::PrepareRead(bool* pinned) {
DCHECK(is_finalized_);
DCHECK(is_sorted_);
- fixed_len_blocks_index_ = 0;
- fixed_len_block_offset_ = 0;
- var_len_blocks_index_ = 0;
- end_of_fixed_len_block_ = end_of_var_len_block_ = fixed_len_blocks_.empty();
+ fixed_len_pages_index_ = 0;
+ fixed_len_page_offset_ = 0;
+ var_len_pages_index_ = 0;
+ end_of_fixed_len_page_ = end_of_var_len_page_ = fixed_len_pages_.empty();
num_tuples_returned_ = 0;
buffered_batch_.reset(new RowBatch(
sorter_->output_row_desc_, sorter_->state_->batch_size(), sorter_->mem_tracker_));
- // If the run is pinned, all blocks are already pinned, so we're ready to read.
+ // If the run is pinned, all pages are already pinned, so we're ready to read.
if (is_pinned_) {
- *pinned_all_blocks = true;
+ *pinned = true;
return Status::OK();
}
- // Attempt to pin the first fixed and var-length blocks. In either case, pinning may
- // fail if the number of reserved blocks is oversubscribed, see IMPALA-1590.
- if (fixed_len_blocks_.size() > 0) {
- bool pinned;
- RETURN_IF_ERROR(fixed_len_blocks_[0]->Pin(&pinned));
- if (!pinned) {
- *pinned_all_blocks = false;
- return Status::OK();
- }
+ int num_to_pin = (fixed_len_pages_.size() > 0 ? 1 : 0) + (HasVarLenPages() ? 1 : 0);
+ int64_t required_mem = num_to_pin * sorter_->page_len_;
+ if (!sorter_->buffer_pool_client_->IncreaseReservationToFit(required_mem)) {
+ *pinned = false;
+ return Status::OK();
}
- if (HasVarLenBlocks()) {
- bool pinned;
- RETURN_IF_ERROR(var_len_blocks_[0]->Pin(&pinned));
- if (!pinned) {
- *pinned_all_blocks = false;
- return Status::OK();
- }
+ // Attempt to pin the first fixed and var-length pages.
+ if (fixed_len_pages_.size() > 0) {
+ RETURN_IF_ERROR(fixed_len_pages_[0].Pin(sorter_->buffer_pool_client_));
}
-
- *pinned_all_blocks = true;
+ if (HasVarLenPages()) {
+ RETURN_IF_ERROR(var_len_pages_[0].Pin(sorter_->buffer_pool_client_));
+ }
+ *pinned = true;
return Status::OK();
}
@@ -794,7 +910,7 @@ Status Sorter::Run::GetNextBatch(RowBatch** output_batch) {
buffered_batch_->Reset();
// Fill more rows into buffered_batch_.
bool eos;
- if (HasVarLenBlocks() && !is_pinned_) {
+ if (HasVarLenPages() && !is_pinned_) {
RETURN_IF_ERROR(GetNext<true>(buffered_batch_.get(), &eos));
} else {
RETURN_IF_ERROR(GetNext<false>(buffered_batch_.get(), &eos));
@@ -804,7 +920,7 @@ Status Sorter::Run::GetNextBatch(RowBatch** output_batch) {
// Setting output_batch to NULL signals eos to the caller, so GetNext() is not
// allowed to attach resources to the batch on eos.
DCHECK_EQ(buffered_batch_->num_rows(), 0);
- DCHECK_EQ(buffered_batch_->num_blocks(), 0);
+ DCHECK_EQ(buffered_batch_->num_buffers(), 0);
*output_batch = NULL;
return Status::OK();
}
@@ -815,122 +931,130 @@ Status Sorter::Run::GetNextBatch(RowBatch** output_batch) {
template <bool CONVERT_OFFSET_TO_PTR>
Status Sorter::Run::GetNext(RowBatch* output_batch, bool* eos) {
// Var-len offsets are converted only when reading var-len data from unpinned runs.
- // We shouldn't convert var len offsets if there are no blocks, since in that case
- // they must all be null or zero-length strings, which don't point into a valid block.
- DCHECK_EQ(CONVERT_OFFSET_TO_PTR, HasVarLenBlocks() && !is_pinned_);
+ // We shouldn't convert var len offsets if there are no pages, since in that case
+ // they must all be null or zero-length strings, which don't point into a valid page.
+ DCHECK_EQ(CONVERT_OFFSET_TO_PTR, HasVarLenPages() && !is_pinned_);
- if (end_of_fixed_len_block_ &&
- fixed_len_blocks_index_ >= static_cast<int>(fixed_len_blocks_.size()) - 1) {
+ if (end_of_fixed_len_page_
+ && fixed_len_pages_index_ >= static_cast<int>(fixed_len_pages_.size()) - 1) {
if (is_pinned_) {
- // All blocks were previously attached to output batches. GetNextBatch() assumes
+ // All pages were previously attached to output batches. GetNextBatch() assumes
// that we don't attach resources to the batch on eos.
- DCHECK_EQ(NumNonNullBlocks(fixed_len_blocks_), 0);
- DCHECK_EQ(NumNonNullBlocks(var_len_blocks_), 0);
+ DCHECK_EQ(NumOpenPages(fixed_len_pages_), 0);
+ DCHECK_EQ(NumOpenPages(var_len_pages_), 0);
- // Flush resources in case we are in a subplan and need to allocate more blocks
+ // Flush resources in case we are in a subplan and need to allocate more pages
// when the node is reopened.
output_batch->MarkFlushResources();
} else {
// We held onto the last fixed or var len blocks without transferring them to the
// caller. We signalled MarkNeedsDeepCopy() to the caller, so we can safely delete
// them now to free memory.
- if (!fixed_len_blocks_.empty()) DCHECK_EQ(NumNonNullBlocks(fixed_len_blocks_), 1);
- if (!var_len_blocks_.empty()) DCHECK_EQ(NumNonNullBlocks(var_len_blocks_), 1);
+ if (!fixed_len_pages_.empty()) DCHECK_EQ(NumOpenPages(fixed_len_pages_), 1);
+ if (!var_len_pages_.empty()) DCHECK_EQ(NumOpenPages(var_len_pages_), 1);
}
- DeleteAllBlocks();
+ CloseAllPages();
*eos = true;
DCHECK_EQ(num_tuples_returned_, num_tuples_);
return Status::OK();
}
- // Advance the fixed or var len block if we reached the end in the previous call to
+ // Advance the fixed or var len page if we reached the end in the previous call to
// GetNext().
- if (end_of_fixed_len_block_) {
- RETURN_IF_ERROR(PinNextReadBlock(&fixed_len_blocks_, fixed_len_blocks_index_));
- ++fixed_len_blocks_index_;
- fixed_len_block_offset_ = 0;
- end_of_fixed_len_block_ = false;
- }
- if (end_of_var_len_block_) {
- RETURN_IF_ERROR(PinNextReadBlock(&var_len_blocks_, var_len_blocks_index_));
- ++var_len_blocks_index_;
- end_of_var_len_block_ = false;
- }
-
- // Fills rows into the output batch until a block boundary is reached.
- BufferedBlockMgr::Block* fixed_len_block = fixed_len_blocks_[fixed_len_blocks_index_];
- DCHECK(fixed_len_block != NULL);
- while (!output_batch->AtCapacity() &&
- fixed_len_block_offset_ < fixed_len_block->valid_data_len()) {
- DCHECK(fixed_len_block != NULL);
- Tuple* input_tuple = reinterpret_cast<Tuple*>(
- fixed_len_block->buffer() + fixed_len_block_offset_);
+ if (end_of_fixed_len_page_) {
+ RETURN_IF_ERROR(PinNextReadPage(&fixed_len_pages_, fixed_len_pages_index_));
+ ++fixed_len_pages_index_;
+ fixed_len_page_offset_ = 0;
+ end_of_fixed_len_page_ = false;
+ }
+ if (end_of_var_len_page_) {
+ RETURN_IF_ERROR(PinNextReadPage(&var_len_pages_, var_len_pages_index_));
+ ++var_len_pages_index_;
+ end_of_var_len_page_ = false;
+ }
+
+ // Fills rows into the output batch until a page boundary is reached.
+ Page* fixed_len_page = &fixed_len_pages_[fixed_len_pages_index_];
+ DCHECK(fixed_len_page != NULL);
+
+ // Ensure we have a reference to the fixed-length page's buffer.
+ RETURN_IF_ERROR(fixed_len_page->WaitForBuffer());
+
+ // If we're converting offsets into unpinned var-len pages, make sure the
+ // current var-len page is in memory.
+ if (CONVERT_OFFSET_TO_PTR && HasVarLenPages()) {
+ RETURN_IF_ERROR(var_len_pages_[var_len_pages_index_].WaitForBuffer());
+ }
+
+ while (!output_batch->AtCapacity()
+ && fixed_len_page_offset_ < fixed_len_page->valid_data_len()) {
+ DCHECK(fixed_len_page != NULL);
+ Tuple* input_tuple =
+ reinterpret_cast<Tuple*>(fixed_len_page->data() + fixed_len_page_offset_);
if (CONVERT_OFFSET_TO_PTR && !ConvertOffsetsToPtrs(input_tuple)) {
DCHECK(!is_pinned_);
- // The var-len data is in the next block. We are done with the current block, so
- // return rows we've accumulated so far and advance to the next block in the next
- // GetNext() call. This is needed for the unpinned case where we need to exchange
- // this block for the next in the next GetNext() call. So therefore we must hold
- // onto the current var-len block and signal to the caller that the block is going
+ // The var-len data is in the next page. We are done with the current page, so
+ // return rows we've accumulated so far and advance to the next page in the next
+ // GetNext() call. This is needed for the unpinned case where we will exchange
+ // this page for the next in the next GetNext() call. So therefore we must hold
+ // onto the current var-len page and signal to the caller that the page is going
// to be deleted.
output_batch->MarkNeedsDeepCopy();
- end_of_var_len_block_ = true;
+ end_of_var_len_page_ = true;
break;
}
output_batch->GetRow(output_batch->AddRow())->SetTuple(0, input_tuple);
output_batch->CommitLastRow();
- fixed_len_block_offset_ += sort_tuple_size_;
+ fixed_len_page_offset_ += sort_tuple_size_;
++num_tuples_returned_;
}
- if (fixed_len_block_offset_ >= fixed_len_block->valid_data_len()) {
- // Reached the block boundary, need to move to the next block.
+ if (fixed_len_page_offset_ >= fixed_len_page->valid_data_len()) {
+ // Reached the page boundary, need to move to the next page.
if (is_pinned_) {
- // Attach block to batch. Caller can delete the block when it wants to.
- output_batch->AddBlock(fixed_len_blocks_[fixed_len_blocks_index_],
+ BufferPool::ClientHandle* client = sorter_->buffer_pool_client_;
+ // Attach page to batch. Caller can delete the page when it wants to.
+ output_batch->AddBuffer(client,
+ fixed_len_pages_[fixed_len_pages_index_].ExtractBuffer(client),
RowBatch::FlushMode::NO_FLUSH_RESOURCES);
- fixed_len_blocks_[fixed_len_blocks_index_] = NULL;
- // Attach the var-len blocks at eos once no more rows will reference the blocks.
- if (fixed_len_blocks_index_ == fixed_len_blocks_.size() - 1) {
- for (BufferedBlockMgr::Block* var_len_block: var_len_blocks_) {
- DCHECK(var_len_block != NULL);
- output_batch->AddBlock(var_len_block, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+ // Attach the var-len pages at eos once no more rows will reference the pages.
+ if (fixed_len_pages_index_ == fixed_len_pages_.size() - 1) {
+ for (Page& var_len_page : var_len_pages_) {
+ DCHECK(var_len_page.is_open());
+ output_batch->AddBuffer(client, var_len_page.ExtractBuffer(client),
+ RowBatch::FlushMode::NO_FLUSH_RESOURCES);
}
- var_len_blocks_.clear();
+ var_len_pages_.clear();
}
} else {
- // To iterate over unpinned runs, we need to exchange this block for the next
- // in the next GetNext() call, so we need to hold onto the block and signal to
- // the caller that the block is going to be deleted.
+ // To iterate over unpinned runs, we need to exchange this page for the next
+ // in the next GetNext() call, so we need to hold onto the page and signal to
+ // the caller that the page is going to be deleted.
output_batch->MarkNeedsDeepCopy();
}
- end_of_fixed_len_block_ = true;
+ end_of_fixed_len_page_ = true;
}
*eos = false;
return Status::OK();
}
-Status Sorter::Run::PinNextReadBlock(vector<BufferedBlockMgr::Block*>* blocks,
- int block_index) {
- DCHECK_GE(block_index, 0);
- DCHECK_LT(block_index, blocks->size() - 1);
- BufferedBlockMgr::Block* curr_block = (*blocks)[block_index];
- BufferedBlockMgr::Block* next_block = (*blocks)[block_index + 1];
- DCHECK_EQ(is_pinned_, next_block->is_pinned());
+Status Sorter::Run::PinNextReadPage(vector<Page>* pages, int page_index) {
+ DCHECK_GE(page_index, 0);
+ DCHECK_LT(page_index, pages->size() - 1);
+ Page* curr_page = &(*pages)[page_index];
+ Page* next_page = &(*pages)[page_index + 1];
+ DCHECK_EQ(is_pinned_, next_page->is_pinned());
if (is_pinned_) {
- // The current block was attached to a batch and 'next_block' is already pinned.
- DCHECK(curr_block == NULL);
+ // The current page was attached to a batch and 'next_page' is already pinned.
+ DCHECK(!curr_page->is_open());
return Status::OK();
}
- bool pinned;
- // Atomically delete the previous block and pin this one. Should not fail due to lack
- // of memory. Pin() deletes the block even in error cases, so we need to remove it from
- // the vector first to avoid an inconsistent state.
- (*blocks)[block_index] = NULL;
- RETURN_IF_ERROR(next_block->Pin(&pinned, curr_block, false));
- DCHECK(pinned) << "Atomic delete and pin should not fail without error.";
+ // Close the previous page to free memory and pin the next page. Should always succeed
+ // since the pages are the same size.
+ curr_page->Close(sorter_->buffer_pool_client_);
+ RETURN_IF_ERROR(next_page->Pin(sorter_->buffer_pool_client_));
return Status::OK();
}
@@ -948,28 +1072,29 @@ void Sorter::Run::CollectNonNullVarSlots(Tuple* src,
}
}
-Status Sorter::Run::TryAddBlock(AddBlockMode mode,
- vector<BufferedBlockMgr::Block*>* block_sequence, bool* added) {
- DCHECK(!block_sequence->empty());
- BufferedBlockMgr::Block* prev_block;
+Status Sorter::Run::TryAddPage(
+ AddPageMode mode, vector<Page>* page_sequence, bool* added) {
+ DCHECK(!page_sequence->empty());
if (mode == KEEP_PREV_PINNED) {
- prev_block = NULL;
+ if (!sorter_->buffer_pool_client_->IncreaseReservationToFit(sorter_->page_len_)) {
+ *added = false;
+ return Status::OK();
+ }
} else {
DCHECK(mode == UNPIN_PREV);
- // Swap the prev block with the next, to guarantee success.
- prev_block = block_sequence->back();
+ // Unpin the prev page to free up the memory required to pin the next page.
+ page_sequence->back().Unpin(sorter_->buffer_pool_client_);
}
- BufferedBlockMgr::Block* new_block;
- RETURN_IF_ERROR(sorter_->block_mgr_->GetNewBlock(
- sorter_->block_mgr_client_, prev_block, &new_block));
- if (new_block != NULL) {
- *added = true;
- block_sequence->push_back(new_block);
- } else {
- DCHECK_EQ(mode, KEEP_PREV_PINNED);
- *added = false;
- }
+ RETURN_IF_ERROR(AddPage(page_sequence));
+ *added = true;
+ return Status::OK();
+}
+
+Status Sorter::Run::AddPage(vector<Page>* page_sequence) {
+ Page new_page;
+ RETURN_IF_ERROR(new_page.Init(sorter_));
+ page_sequence->push_back(move(new_page));
return Status::OK();
}
@@ -983,27 +1108,26 @@ void Sorter::Run::CopyVarLenData(const vector<StringValue*>& string_values,
}
void Sorter::Run::CopyVarLenDataConvertOffset(const vector<StringValue*>& string_values,
- int block_index, const uint8_t* block_start, uint8_t* dest) {
- DCHECK_GE(block_index, 0);
- DCHECK_GE(dest - block_start, 0);
+ int page_index, const uint8_t* page_start, uint8_t* dest) {
+ DCHECK_GE(page_index, 0);
+ DCHECK_GE(dest - page_start, 0);
- for (StringValue* string_val: string_values) {
+ for (StringValue* string_val : string_values) {
memcpy(dest, string_val->ptr, string_val->len);
- DCHECK_LE(dest - block_start, sorter_->block_mgr_->max_block_size());
- DCHECK_LE(dest - block_start, INT_MAX);
- int block_offset = dest - block_start;
- uint64_t packed_offset =
- (static_cast<uint64_t>(block_index) << 32) | block_offset;
+ DCHECK_LE(dest - page_start, sorter_->page_len_);
+ DCHECK_LE(dest - page_start, numeric_limits<uint32_t>::max());
+ uint32_t page_offset = dest - page_start;
+ uint64_t packed_offset = (static_cast<uint64_t>(page_index) << 32) | page_offset;
string_val->ptr = reinterpret_cast<char*>(packed_offset);
dest += string_val->len;
}
}
bool Sorter::Run::ConvertOffsetsToPtrs(Tuple* tuple) {
- // We need to be careful to handle the case where var_len_blocks_ is empty,
+ // We need to be careful to handle the case where var_len_pages_ is empty,
// e.g. if all strings are NULL.
- uint8_t* block_start = var_len_blocks_.empty() ? NULL :
- var_len_blocks_[var_len_blocks_index_]->buffer();
+ uint8_t* page_start =
+ var_len_pages_.empty() ? NULL : var_len_pages_[var_len_pages_index_].data();
const vector<SlotDescriptor*>& string_slots = sort_tuple_desc_->string_slots();
int num_non_null_string_slots = 0;
@@ -1015,47 +1139,47 @@ bool Sorter::Run::ConvertOffsetsToPtrs(Tuple* tuple) {
DCHECK(slot_desc->type().IsVarLenStringType());
StringValue* value = reinterpret_cast<StringValue*>(
tuple->GetSlot(slot_desc->tuple_offset()));
- // packed_offset includes the block index in the upper 32 bits and the block
+ // packed_offset includes the page index in the upper 32 bits and the page
// offset in the lower 32 bits. See CopyVarLenDataConvertOffset().
uint64_t packed_offset = reinterpret_cast<uint64_t>(value->ptr);
- int block_index = packed_offset >> 32;
- int block_offset = packed_offset & 0xFFFFFFFF;
+ uint32_t page_index = packed_offset >> 32;
+ uint32_t page_offset = packed_offset & 0xFFFFFFFF;
- if (block_index > var_len_blocks_index_) {
- // We've reached the block boundary for the current var-len block.
+ if (page_index > var_len_pages_index_) {
+ // We've reached the page boundary for the current var-len page.
// This tuple will be returned in the next call to GetNext().
- DCHECK_GE(block_index, 0);
- DCHECK_LE(block_index, var_len_blocks_.size());
- DCHECK_EQ(block_index, var_len_blocks_index_ + 1);
- DCHECK_EQ(block_offset, 0); // The data is the first thing in the next block.
+ DCHECK_GE(page_index, 0);
+ DCHECK_LE(page_index, var_len_pages_.size());
+ DCHECK_EQ(page_index, var_len_pages_index_ + 1);
+ DCHECK_EQ(page_offset, 0); // The data is the first thing in the next page.
// This must be the first slot with var len data for the tuple. Var len data
// for tuple shouldn't be split across blocks.
DCHECK_EQ(num_non_null_string_slots, 1);
return false;
}
- DCHECK_EQ(block_index, var_len_blocks_index_);
- if (var_len_blocks_.empty()) {
+ DCHECK_EQ(page_index, var_len_pages_index_);
+ if (var_len_pages_.empty()) {
DCHECK_EQ(value->len, 0);
} else {
- DCHECK_LE(block_offset + value->len, var_len_blocks_[block_index]->valid_data_len());
+ DCHECK_LE(page_offset + value->len, var_len_pages_[page_index].valid_data_len());
}
// Calculate the address implied by the offset and assign it. May be NULL for
- // zero-length strings if there are no blocks in the run since block_start is NULL.
- DCHECK(block_start != NULL || block_offset == 0);
- value->ptr = reinterpret_cast<char*>(block_start + block_offset);
+ // zero-length strings if there are no pages in the run since page_start is NULL.
+ DCHECK(page_start != NULL || page_offset == 0);
+ value->ptr = reinterpret_cast<char*>(page_start + page_offset);
}
return true;
}
int64_t Sorter::Run::TotalBytes() const {
int64_t total_bytes = 0;
- for (BufferedBlockMgr::Block* block: fixed_len_blocks_) {
- if (block != NULL) total_bytes += block->valid_data_len();
+ for (const Page& page : fixed_len_pages_) {
+ if (page.is_open()) total_bytes += page.valid_data_len();
}
- for (BufferedBlockMgr::Block* block: var_len_blocks_) {
- if (block != NULL) total_bytes += block->valid_data_len();
+ for (const Page& page : var_len_pages_) {
+ if (page.is_open()) total_bytes += page.valid_data_len();
}
return total_bytes;
}
@@ -1072,61 +1196,61 @@ Sorter::TupleIterator::TupleIterator(Sorter::Run* run, int64_t index)
}
const int tuple_size = run->sort_tuple_size_;
- int block_offset;
+ uint32_t page_offset;
if (UNLIKELY(index == run->num_tuples())) {
// If the iterator is initialized past the end, set up buffer_start_index_,
- // 'buffer_end_index_' and 'block_index_' for the last block, then set 'tuple' to
+ // 'buffer_end_index_' and 'page_index_' for the last page, then set 'tuple' to
// 'tuple_size' bytes past the last tuple, so everything is correct when Prev() is
// invoked.
- block_index_ = run->fixed_len_blocks_.size() - 1;
- block_offset = ((index - 1) % run->block_capacity_) * tuple_size + tuple_size;
+ page_index_ = run->fixed_len_pages_.size() - 1;
+ page_offset = ((index - 1) % run->page_capacity_) * tuple_size + tuple_size;
} else {
- block_index_ = index / run->block_capacity_;
- block_offset = (index % run->block_capacity_) * tuple_size;
+ page_index_ = index / run->page_capacity_;
+ page_offset = (index % run->page_capacity_) * tuple_size;
}
- buffer_start_index_ = block_index_ * run->block_capacity_;
- buffer_end_index_ = buffer_start_index_ + run->block_capacity_;
- tuple_ = run->fixed_len_blocks_[block_index_]->buffer() + block_offset;
+ buffer_start_index_ = page_index_ * run->page_capacity_;
+ buffer_end_index_ = buffer_start_index_ + run->page_capacity_;
+ tuple_ = run->fixed_len_pages_[page_index_].data() + page_offset;
}
void Sorter::TupleIterator::Next(Sorter::Run* run, int tuple_size) {
DCHECK_LT(index_, run->num_tuples()) << "Can only advance one past end of run";
tuple_ += tuple_size;
++index_;
- if (UNLIKELY(index_ >= buffer_end_index_)) NextBlock(run, tuple_size);
+ if (UNLIKELY(index_ >= buffer_end_index_)) NextPage(run, tuple_size);
}
-void Sorter::TupleIterator::NextBlock(Sorter::Run* run, int tuple_size) {
- // When moving after the last tuple, stay at the last block.
+void Sorter::TupleIterator::NextPage(Sorter::Run* run, int tuple_size) {
+ // When moving after the last tuple, stay at the last page.
if (index_ >= run->num_tuples()) return;
- ++block_index_;
- DCHECK_LT(block_index_, run->fixed_len_blocks_.size());
- buffer_start_index_ = block_index_ * run->block_capacity_;
+ ++page_index_;
+ DCHECK_LT(page_index_, run->fixed_len_pages_.size());
+ buffer_start_index_ = page_index_ * run->page_capacity_;
DCHECK_EQ(index_, buffer_start_index_);
- buffer_end_index_ = buffer_start_index_ + run->block_capacity_;
- tuple_ = run->fixed_len_blocks_[block_index_]->buffer();
+ buffer_end_index_ = buffer_start_index_ + run->page_capacity_;
+ tuple_ = run->fixed_len_pages_[page_index_].data();
}
void Sorter::TupleIterator::Prev(Sorter::Run* run, int tuple_size) {
DCHECK_GE(index_, 0) << "Can only advance one before start of run";
tuple_ -= tuple_size;
--index_;
- if (UNLIKELY(index_ < buffer_start_index_)) PrevBlock(run, tuple_size);
+ if (UNLIKELY(index_ < buffer_start_index_)) PrevPage(run, tuple_size);
}
-void Sorter::TupleIterator::PrevBlock(Sorter::Run* run, int tuple_size) {
- // When moving before the first tuple, stay at the first block.
+void Sorter::TupleIterator::PrevPage(Sorter::Run* run, int tuple_size) {
+ // When moving before the first tuple, stay at the first page.
if (index_ < 0) return;
- --block_index_;
- DCHECK_GE(block_index_, 0);
- buffer_start_index_ = block_index_ * run->block_capacity_;
- buffer_end_index_ = buffer_start_index_ + run->block_capacity_;
+ --page_index_;
+ DCHECK_GE(page_index_, 0);
+ buffer_start_index_ = page_index_ * run->page_capacity_;
+ buffer_end_index_ = buffer_start_index_ + run->page_capacity_;
DCHECK_EQ(index_, buffer_end_index_ - 1);
- int last_tuple_block_offset = run->sort_tuple_size_ * (run->block_capacity_ - 1);
- tuple_ = run->fixed_len_blocks_[block_index_]->buffer() + last_tuple_block_offset;
+ int last_tuple_page_offset = run->sort_tuple_size_ * (run->page_capacity_ - 1);
+ tuple_ = run->fixed_len_pages_[page_index_].data() + last_tuple_page_offset;
}
-Sorter::TupleSorter::TupleSorter(const TupleRowComparator& comp, int64_t block_size,
+Sorter::TupleSorter::TupleSorter(const TupleRowComparator& comp, int64_t page_size,
int tuple_size, RuntimeState* state)
: tuple_size_(tuple_size),
comparator_(comp),
@@ -1340,13 +1464,15 @@ inline void Sorter::TupleSorter::Swap(Tuple* left, Tuple* right, Tuple* swap_tup
Sorter::Sorter(const TupleRowComparator& compare_less_than,
const vector<ScalarExpr*>& sort_tuple_exprs, RowDescriptor* output_row_desc,
- MemTracker* mem_tracker, RuntimeProfile* profile, RuntimeState* state,
+ MemTracker* mem_tracker, BufferPool::ClientHandle* buffer_pool_client,
+ int64_t page_len, RuntimeProfile* profile, RuntimeState* state, int node_id,
bool enable_spilling)
- : state_(state),
+ : node_id_(node_id),
+ state_(state),
compare_less_than_(compare_less_than),
in_mem_tuple_sorter_(NULL),
- block_mgr_(state->block_mgr()),
- block_mgr_client_(NULL),
+ buffer_pool_client_(buffer_pool_client),
+ page_len_(page_len),
has_var_len_slots_(false),
sort_tuple_exprs_(sort_tuple_exprs),
mem_tracker_(mem_tracker),
@@ -1370,10 +1496,24 @@ Sorter::~Sorter() {
Status Sorter::Prepare(ObjectPool* obj_pool, MemPool* expr_mem_pool) {
DCHECK(in_mem_tuple_sorter_ == NULL) << "Already prepared";
+ // Page byte offsets are packed into uint32_t values, which limits the supported
+ // page size.
+ if (page_len_ > numeric_limits<uint32_t>::max()) {
+ return Status(Substitute(
+ "Page size $0 exceeded maximum supported in sorter ($1)",
+ PrettyPrinter::PrintBytes(page_len_),
+ PrettyPrinter::PrintBytes(numeric_limits<uint32_t>::max())));
+ }
+
TupleDescriptor* sort_tuple_desc = output_row_desc_->tuple_descriptors()[0];
+ if (sort_tuple_desc->byte_size() > page_len_) {
+ return Status(TErrorCode::MAX_ROW_SIZE,
+ PrettyPrinter::Print(sort_tuple_desc->byte_size(), TUnit::BYTES), node_id_,
+ PrettyPrinter::Print(0, TUnit::BYTES));
+ }
has_var_len_slots_ = sort_tuple_desc->HasVarlenSlots();
- in_mem_tuple_sorter_.reset(new TupleSorter(compare_less_than_,
- block_mgr_->max_block_size(), sort_tuple_desc->byte_size(), state_));
+ in_mem_tuple_sorter_.reset(new TupleSorter(compare_less_than_, page_len_,
+ sort_tuple_desc->byte_size(), state_));
initial_runs_counter_ = ADD_COUNTER(profile_, "InitialRunsCreated", TUnit::UNIT);
spilled_runs_counter_ = ADD_COUNTER(profile_, "SpilledRuns", TUnit::UNIT);
@@ -1382,17 +1522,6 @@ Status Sorter::Prepare(ObjectPool* obj_pool, MemPool* expr_mem_pool) {
sorted_data_size_ = ADD_COUNTER(profile_, "SortDataSize", TUnit::BYTES);
run_sizes_ = ADD_SUMMARY_STATS_COUNTER(profile_, "NumRowsPerRun", TUnit::UNIT);
- // If spilling is enabled, we need enough buffers to perform merges. Otherwise, there
- // won't be any merges and we only need 1 buffer.
- // Must be kept in sync with SortNode.computeResourceProfile() in fe.
- int min_buffers_required = enable_spilling_ ? MIN_BUFFERS_PER_MERGE : 1;
- // Fixed and var-length blocks are separate, so we need twice as many blocks for both if
- // there is var-length data.
- if (sort_tuple_desc->HasVarlenSlots()) min_buffers_required *= 2;
-
- RETURN_IF_ERROR(block_mgr_->RegisterClient(Substitute("Sorter ptr=$0", this),
- min_buffers_required, false, mem_tracker_, state_, &block_mgr_client_));
-
RETURN_IF_ERROR(ScalarExprEvaluator::Create(sort_tuple_exprs_, state_, obj_pool,
expr_mem_pool, &sort_tuple_expr_evals_));
return Status::OK();
@@ -1413,6 +1542,15 @@ void Sorter::FreeLocalAllocations() {
ScalarExprEvaluator::FreeLocalAllocations(sort_tuple_expr_evals_);
}
+int64_t Sorter::ComputeMinReservation() {
+ // Must be kept in sync with SortNode.computeNodeResourceProfile() in fe.
+ int min_buffers_required = enable_spilling_ ? MIN_BUFFERS_PER_MERGE : 1;
+ // Fixed and var-length pages are separate, so we need double the pages
+ // if there is var-length data.
+ if (output_row_desc_->HasVarlenSlots()) min_buffers_required *= 2;
+ return min_buffers_required * page_len_;
+}
+
Status Sorter::AddBatch(RowBatch* batch) {
DCHECK(unsorted_run_ != NULL);
DCHECK(batch != NULL);
@@ -1424,11 +1562,12 @@ Status Sorter::AddBatch(RowBatch* batch) {
cur_batch_index += num_processed;
if (cur_batch_index < batch->num_rows()) {
- // The current run is full. Sort it and begin the next one.
+ // The current run is full. Sort it, spill it and begin the next one.
+ RETURN_IF_ERROR(state_->StartSpilling(mem_tracker_));
RETURN_IF_ERROR(SortCurrentInputRun());
- RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllBlocks());
- unsorted_run_ = obj_pool_.Add(
- new Run(this, output_row_desc_->tuple_descriptors()[0], true));
+ RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllPages());
+ unsorted_run_ =
+ obj_pool_.Add(new Run(this, output_row_desc_->tuple_descriptors()[0], true));
RETURN_IF_ERROR(unsorted_run_->Init());
}
}
@@ -1459,7 +1598,7 @@ Status Sorter::InputDone() {
// Unpin the final run to free up memory for the merge.
// TODO: we could keep it in memory in some circumstances as an optimisation, once
// we have a buffer pool with more reliable reservations (IMPALA-3200).
- RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllBlocks());
+ RETURN_IF_ERROR(sorted_runs_.back()->UnpinAllPages());
// Merge intermediate runs until we have a final merge set-up.
// TODO: Attempt to allocate more memory before doing intermediate merges. This may
@@ -1487,7 +1626,6 @@ void Sorter::Reset() {
void Sorter::Close(RuntimeState* state) {
CleanupAllRuns();
- block_mgr_->ClearReservations(block_mgr_client_);
obj_pool_.Clear();
ScalarExprEvaluator::Close(sort_tuple_expr_evals_, state);
}
@@ -1495,9 +1633,9 @@ void Sorter::Close(RuntimeState* state) {
void Sorter::CleanupAllRuns() {
Run::CleanupRuns(&sorted_runs_);
Run::CleanupRuns(&merging_runs_);
- if (unsorted_run_ != NULL) unsorted_run_->DeleteAllBlocks();
+ if (unsorted_run_ != NULL) unsorted_run_->CloseAllPages();
unsorted_run_ = NULL;
- if (merge_output_run_ != NULL) merge_output_run_->DeleteAllBlocks();
+ if (merge_output_run_ != NULL) merge_output_run_->CloseAllPages();
merge_output_run_ = NULL;
}
@@ -1519,10 +1657,10 @@ Status Sorter::SortCurrentInputRun() {
Status Sorter::MergeIntermediateRuns() {
DCHECK_GE(sorted_runs_.size(), 2);
- int pinned_blocks_per_run = has_var_len_slots_ ? 2 : 1;
- int max_runs_per_final_merge = MAX_BUFFERS_PER_MERGE / pinned_blocks_per_run;
+ int pinned_pages_per_run = has_var_len_slots_ ? 2 : 1;
+ int max_runs_per_final_merge = MAX_BUFFERS_PER_MERGE / pinned_pages_per_run;
- // During an intermediate merge, the one or two blocks from the output sorted run
+ // During an intermediate merge, the one or two pages from the output sorted run
// that are being written must be pinned.
int max_runs_per_intermediate_merge = max_runs_per_final_merge - 1;
DCHECK_GT(max_runs_per_intermediate_merge, 1);
@@ -1549,7 +1687,7 @@ Status Sorter::MergeIntermediateRuns() {
if (sorted_runs_.empty()) {
// Don't need intermediate run for final merge.
if (merge_output_run_ != NULL) {
- merge_output_run_->DeleteAllBlocks();
+ merge_output_run_->CloseAllPages();
merge_output_run_ = NULL;
}
return Status::OK();
@@ -1604,7 +1742,8 @@ Status Sorter::CreateMerger(int max_num_runs) {
}
Status Sorter::ExecuteIntermediateMerge(Sorter::Run* merged_run) {
- RowBatch intermediate_merge_batch(output_row_desc_, state_->batch_size(), mem_tracker_);
+ RowBatch intermediate_merge_batch(
+ output_row_desc_, state_->batch_size(), mem_tracker_);
bool eos = false;
while (!eos) {
// Copy rows into the new run until done.
@@ -1621,5 +1760,4 @@ Status Sorter::ExecuteIntermediateMerge(Sorter::Run* merged_run) {
RETURN_IF_ERROR(merged_run->FinalizeInput());
return Status::OK();
}
-
} // namespace impala
[11/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
IMPALA-4674: Part 2: port backend exec to BufferPool
Always create global BufferPool at startup using 80% of memory and
limit reservations to 80% of query memory (same as BufferedBlockMgr).
The query's initial reservation is computed in the planner, claimed
centrally (managed by the InitialReservations class) and distributed
to query operators from there.
min_spillable_buffer_size and default_spillable_buffer_size query
options control the buffer size that the planner selects for
spilling operators.
Port ExecNodes to use BufferPool:
* Each ExecNode has to claim its reservation during Open()
* Port Sorter to use BufferPool.
* Switch from BufferedTupleStream to BufferedTupleStreamV2
* Port HashTable to use BufferPool via a Suballocator.
This also makes PAGG memory consumption more efficient (avoid wasting buffers)
and improve the spilling algorithm:
* Allow preaggs to execute with 0 reservation - if streams and hash tables
cannot be allocated, it will pass through rows.
* Halve the buffer requirement for spilling aggs - avoid allocating
buffers for aggregated and unaggregated streams simultaneously.
* Rebuild spilled partitions instead of repartitioning (IMPALA-2708)
TODO in follow-up patches:
* Rename BufferedTupleStreamV2 to BufferedTupleStream
* Implement max_row_size query option.
Testing:
* Updated tests to reflect new memory requirements
Change-Id: I7fc7fe1c04e9dfb1a0c749fb56a5e0f2bf9c6c3e
Reviewed-on: http://gerrit.cloudera.org:8080/5801
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a98b90bd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a98b90bd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a98b90bd
Branch: refs/heads/master
Commit: a98b90bd3877886e97dc2385cfdf5e3f95245533
Parents: d5b0c6b
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed Mar 16 16:09:36 2016 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Aug 5 01:03:02 2017 +0000
----------------------------------------------------------------------
be/src/codegen/gen_ir_descriptions.py | 2 +-
be/src/exec/analytic-eval-node.cc | 53 +-
be/src/exec/analytic-eval-node.h | 25 +-
be/src/exec/exec-node.cc | 45 +-
be/src/exec/exec-node.h | 17 +
be/src/exec/hash-table-test.cc | 251 ++-
be/src/exec/hash-table.cc | 149 +-
be/src/exec/hash-table.h | 140 +-
be/src/exec/hash-table.inline.h | 20 +-
be/src/exec/nested-loop-join-builder.cc | 3 +-
be/src/exec/partial-sort-node.cc | 7 +-
be/src/exec/partial-sort-node.h | 1 -
be/src/exec/partitioned-aggregation-node-ir.cc | 20 +-
be/src/exec/partitioned-aggregation-node.cc | 639 ++++----
be/src/exec/partitioned-aggregation-node.h | 192 ++-
be/src/exec/partitioned-hash-join-builder-ir.cc | 12 +-
be/src/exec/partitioned-hash-join-builder.cc | 159 +-
be/src/exec/partitioned-hash-join-builder.h | 76 +-
be/src/exec/partitioned-hash-join-node-ir.cc | 7 +-
be/src/exec/partitioned-hash-join-node.cc | 136 +-
be/src/exec/partitioned-hash-join-node.h | 26 +-
be/src/exec/partitioned-hash-join-node.inline.h | 2 +-
be/src/exec/sort-node.cc | 15 +-
be/src/exec/sort-node.h | 3 +-
be/src/runtime/CMakeLists.txt | 5 +-
be/src/runtime/buffered-block-mgr-test.cc | 1547 ------------------
be/src/runtime/buffered-block-mgr.cc | 1254 --------------
be/src/runtime/buffered-block-mgr.h | 606 -------
be/src/runtime/buffered-tuple-stream-test.cc | 1264 --------------
be/src/runtime/buffered-tuple-stream.cc | 903 ----------
be/src/runtime/buffered-tuple-stream.h | 561 -------
be/src/runtime/buffered-tuple-stream.inline.h | 59 -
be/src/runtime/bufferpool/buffer-pool.cc | 12 +-
be/src/runtime/bufferpool/buffer-pool.h | 8 +
be/src/runtime/bufferpool/reservation-tracker.h | 4 +
be/src/runtime/disk-io-mgr.cc | 7 +-
be/src/runtime/exec-env.cc | 35 +-
be/src/runtime/exec-env.h | 4 +-
be/src/runtime/fragment-instance-state.cc | 2 -
be/src/runtime/initial-reservations.cc | 90 +
be/src/runtime/initial-reservations.h | 79 +
be/src/runtime/query-exec-mgr.cc | 2 +
be/src/runtime/query-state.cc | 91 +-
be/src/runtime/query-state.h | 51 +-
be/src/runtime/row-batch.cc | 19 -
be/src/runtime/row-batch.h | 13 -
be/src/runtime/runtime-filter.h | 1 +
be/src/runtime/runtime-state.cc | 52 +-
be/src/runtime/runtime-state.h | 32 +-
be/src/runtime/sorter.cc | 1058 ++++++------
be/src/runtime/sorter.h | 65 +-
be/src/runtime/test-env.cc | 23 +-
be/src/runtime/test-env.h | 9 +-
be/src/runtime/tmp-file-mgr-test.cc | 10 +-
be/src/runtime/tmp-file-mgr.h | 23 +-
be/src/service/client-request-state.cc | 4 +-
be/src/service/query-options.cc | 28 +-
be/src/service/query-options.h | 6 +-
be/src/util/bloom-filter.h | 2 +-
be/src/util/static-asserts.cc | 2 -
common/thrift/Frontend.thrift | 16 +-
common/thrift/ImpalaInternalService.thrift | 22 +-
common/thrift/ImpalaService.thrift | 8 +-
common/thrift/PlanNodes.thrift | 18 +
common/thrift/generate_error_codes.py | 10 +-
.../org/apache/impala/common/RuntimeEnv.java | 10 -
.../apache/impala/planner/AggregationNode.java | 11 +-
.../apache/impala/planner/AnalyticEvalNode.java | 7 +-
.../impala/planner/DataSourceScanNode.java | 2 +-
.../apache/impala/planner/DataStreamSink.java | 2 +-
.../org/apache/impala/planner/EmptySetNode.java | 2 +-
.../org/apache/impala/planner/ExchangeNode.java | 2 +-
.../apache/impala/planner/HBaseScanNode.java | 2 +-
.../apache/impala/planner/HBaseTableSink.java | 2 +-
.../org/apache/impala/planner/HashJoinNode.java | 8 +-
.../org/apache/impala/planner/HdfsScanNode.java | 4 +-
.../apache/impala/planner/HdfsTableSink.java | 2 +-
.../apache/impala/planner/JoinBuildSink.java | 2 +-
.../org/apache/impala/planner/KuduScanNode.java | 2 +-
.../apache/impala/planner/KuduTableSink.java | 2 +-
.../impala/planner/NestedLoopJoinNode.java | 5 +-
.../org/apache/impala/planner/PlanNode.java | 14 +-
.../org/apache/impala/planner/PlanRootSink.java | 2 +-
.../java/org/apache/impala/planner/Planner.java | 12 +-
.../apache/impala/planner/ResourceProfile.java | 72 +-
.../org/apache/impala/planner/SelectNode.java | 2 +-
.../impala/planner/SingularRowSrcNode.java | 2 +-
.../org/apache/impala/planner/SortNode.java | 47 +-
.../org/apache/impala/planner/SubplanNode.java | 2 +-
.../org/apache/impala/planner/UnionNode.java | 2 +-
.../org/apache/impala/planner/UnnestNode.java | 2 +-
.../org/apache/impala/service/Frontend.java | 2 +-
.../org/apache/impala/planner/PlannerTest.java | 2 -
.../queries/PlannerTest/constant-folding.test | 32 +-
.../queries/PlannerTest/disable-codegen.test | 2 +-
.../PlannerTest/fk-pk-join-detection.test | 52 +-
.../queries/PlannerTest/mt-dop-validation.test | 30 +-
.../queries/PlannerTest/parquet-filtering.test | 6 +-
.../PlannerTest/resource-requirements.test | 418 ++---
.../PlannerTest/sort-expr-materialization.test | 30 +-
.../PlannerTest/spillable-buffer-sizing.test | 112 +-
.../queries/PlannerTest/tablesample.test | 4 +-
.../queries/QueryTest/analytic-fns.test | 12 +-
.../queries/QueryTest/explain-level0.test | 2 +-
.../queries/QueryTest/explain-level1.test | 2 +-
.../queries/QueryTest/explain-level2.test | 6 +-
.../queries/QueryTest/explain-level3.test | 6 +-
.../queries/QueryTest/nested-types-tpch.test | 24 +-
.../QueryTest/runtime_row_filters_phj.test | 5 +-
...ingle-node-joins-with-limits-exhaustive.test | 2 +-
.../QueryTest/single-node-large-sorts.test | 2 +-
.../queries/QueryTest/spilling.test | 87 +-
.../targeted-stress/queries/agg_stress.test | 2 +-
.../workloads/tpch/queries/insert_parquet.test | 2 +
tests/comparison/discrepancy_searcher.py | 4 +-
tests/custom_cluster/test_scratch_disk.py | 12 +-
tests/custom_cluster/test_spilling.py | 47 -
tests/query_test/test_cancellation.py | 10 +-
tests/query_test/test_mem_usage_scaling.py | 31 +-
tests/query_test/test_nested_types.py | 1 -
tests/query_test/test_scratch_limit.py | 12 +-
tests/query_test/test_sort.py | 26 +-
tests/query_test/test_spilling.py | 39 +
123 files changed, 2885 insertions(+), 8366 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/codegen/gen_ir_descriptions.py
----------------------------------------------------------------------
diff --git a/be/src/codegen/gen_ir_descriptions.py b/be/src/codegen/gen_ir_descriptions.py
index 94dc86a..be4be80 100755
--- a/be/src/codegen/gen_ir_descriptions.py
+++ b/be/src/codegen/gen_ir_descriptions.py
@@ -119,7 +119,7 @@ ir_functions = [
["PHJ_PROCESS_PROBE_BATCH_FULL_OUTER_JOIN",
"_ZN6impala23PartitionedHashJoinNode17ProcessProbeBatchILi8EEEiNS_13TPrefetchMode4typeEPNS_8RowBatchEPNS_12HashTableCtxEPNS_6StatusE"],
["PHJ_INSERT_BATCH",
- "_ZN6impala10PhjBuilder9Partition11InsertBatchENS_13TPrefetchMode4typeEPNS_12HashTableCtxEPNS_8RowBatchERKSt6vectorINS_19BufferedTupleStream6RowIdxESaISA_EE"],
+ "_ZN6impala10PhjBuilder9Partition11InsertBatchENS_13TPrefetchMode4typeEPNS_12HashTableCtxEPNS_8RowBatchERKSt6vectorIPhSaIS9_EEPNS_6StatusE"],
["HASH_TABLE_GET_HASH_SEED",
"_ZNK6impala12HashTableCtx11GetHashSeedEv"],
["HASH_TABLE_GET_BUILD_EXPR_EVALUATORS",
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/analytic-eval-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/analytic-eval-node.cc b/be/src/exec/analytic-eval-node.cc
index b789188..f6d96ae 100644
--- a/be/src/exec/analytic-eval-node.cc
+++ b/be/src/exec/analytic-eval-node.cc
@@ -23,9 +23,10 @@
#include "exprs/agg-fn-evaluator.h"
#include "exprs/scalar-expr.h"
#include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
#include "runtime/descriptors.h"
#include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
#include "runtime/row-batch.h"
#include "runtime/runtime-state.h"
#include "udf/udf-internal.h"
@@ -34,13 +35,14 @@
#include "common/names.h"
static const int MAX_TUPLE_POOL_SIZE = 8 * 1024 * 1024; // 8MB
+static const int MIN_REQUIRED_BUFFERS = 2;
using namespace strings;
namespace impala {
-AnalyticEvalNode::AnalyticEvalNode(ObjectPool* pool, const TPlanNode& tnode,
- const DescriptorTbl& descs)
+AnalyticEvalNode::AnalyticEvalNode(
+ ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs)
: ExecNode(pool, tnode, descs),
window_(tnode.analytic_node.window),
intermediate_tuple_desc_(
@@ -51,7 +53,6 @@ AnalyticEvalNode::AnalyticEvalNode(ObjectPool* pool, const TPlanNode& tnode,
rows_end_offset_(0),
has_first_val_null_offset_(false),
first_val_null_offset_(0),
- client_(nullptr),
child_tuple_cmp_row_(nullptr),
last_result_idx_(-1),
prev_pool_last_result_idx_(-1),
@@ -110,6 +111,7 @@ AnalyticEvalNode::~AnalyticEvalNode() {
Status AnalyticEvalNode::Init(const TPlanNode& tnode, RuntimeState* state) {
RETURN_IF_ERROR(ExecNode::Init(tnode, state));
DCHECK_EQ(conjunct_evals_.size(), 0);
+ state_ = state;
const TAnalyticNode& analytic_node = tnode.analytic_node;
bool has_lead_fn = false;
@@ -154,6 +156,8 @@ Status AnalyticEvalNode::Prepare(RuntimeState* state) {
SCOPED_TIMER(runtime_profile_->total_time_counter());
RETURN_IF_ERROR(ExecNode::Prepare(state));
DCHECK(child(0)->row_desc()->IsPrefixOf(*row_desc()));
+ DCHECK_GE(resource_profile_.min_reservation,
+ resource_profile_.spillable_buffer_size * MIN_REQUIRED_BUFFERS);
curr_tuple_pool_.reset(new MemPool(mem_tracker()));
prev_tuple_pool_.reset(new MemPool(mem_tracker()));
mem_pool_.reset(new MemPool(mem_tracker()));
@@ -175,12 +179,6 @@ Status AnalyticEvalNode::Prepare(RuntimeState* state) {
fn_pool_.get(), &order_by_eq_expr_eval_));
AddEvaluatorToFree(order_by_eq_expr_eval_);
}
-
- // Must be kept in sync with AnalyticEvalNode.computeResourceProfile() in fe.
- const int MIN_REQUIRED_BUFFERS = 2;
- RETURN_IF_ERROR(state->block_mgr()->RegisterClient(
- Substitute("AnalyticEvalNode id=$0 ptr=$1", id_, this),
- MIN_REQUIRED_BUFFERS, false, mem_tracker(), state, &client_));
return Status::OK();
}
@@ -190,22 +188,20 @@ Status AnalyticEvalNode::Open(RuntimeState* state) {
RETURN_IF_CANCELLED(state);
RETURN_IF_ERROR(QueryMaintenance(state));
RETURN_IF_ERROR(child(0)->Open(state));
- DCHECK(client_ != nullptr);
- DCHECK(input_stream_ == nullptr);
- input_stream_.reset(
- new BufferedTupleStream(state, child(0)->row_desc(), state->block_mgr(), client_,
- false /* use_initial_small_buffers */, true /* read_write */));
- RETURN_IF_ERROR(input_stream_->Init(id(), runtime_profile(), true));
- bool got_write_buffer;
- RETURN_IF_ERROR(input_stream_->PrepareForWrite(&got_write_buffer));
- if (!got_write_buffer) {
- return state->block_mgr()->MemLimitTooLowError(client_, id());
- }
- bool got_read_buffer;
- RETURN_IF_ERROR(input_stream_->PrepareForRead(true, &got_read_buffer));
- if (!got_read_buffer) {
- return state->block_mgr()->MemLimitTooLowError(client_, id());
+
+ // Claim reservation after the child has been opened to reduce the peak reservation
+ // requirement.
+ if (!buffer_pool_client_.is_registered()) {
+ RETURN_IF_ERROR(ClaimBufferReservation(state));
}
+ DCHECK(input_stream_ == nullptr);
+ input_stream_.reset(new BufferedTupleStreamV2(state, child(0)->row_desc(),
+ &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+ resource_profile_.spillable_buffer_size));
+ RETURN_IF_ERROR(input_stream_->Init(id(), true));
+ bool success;
+ RETURN_IF_ERROR(input_stream_->PrepareForReadWrite(true, &success));
+ DCHECK(success) << "Had reservation: " << buffer_pool_client_.DebugString();
for (int i = 0; i < analytic_fn_evals_.size(); ++i) {
RETURN_IF_ERROR(analytic_fn_evals_[i]->Open(state));
@@ -366,8 +362,8 @@ inline Status AnalyticEvalNode::AddRow(int64_t stream_idx, TupleRow* row) {
// the stream and continue writing/reading in unpinned mode.
// TODO: Consider re-pinning later if the output stream is fully consumed.
RETURN_IF_ERROR(status);
- RETURN_IF_ERROR(
- input_stream_->UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
+ RETURN_IF_ERROR(state_->StartSpilling(mem_tracker()));
+ input_stream_->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
VLOG_FILE << id() << " Unpin input stream while adding row idx=" << stream_idx;
if (!input_stream_->AddRow(row, &status)) {
// Rows should be added in unpinned mode unless an error occurs.
@@ -627,7 +623,7 @@ Status AnalyticEvalNode::ProcessChildBatch(RuntimeState* state) {
<< " tuple pool size:" << curr_tuple_pool_->total_allocated_bytes();
SCOPED_TIMER(evaluation_timer_);
- // BufferedTupleStream::num_rows() returns the total number of rows that have been
+ // BufferedTupleStreamV2::num_rows() returns the total number of rows that have been
// inserted into the stream (it does not decrease when we read rows), so the index of
// the next input row that will be inserted will be the current size of the stream.
int64_t stream_idx = input_stream_->num_rows();
@@ -857,7 +853,6 @@ Status AnalyticEvalNode::Reset(RuntimeState* state) {
void AnalyticEvalNode::Close(RuntimeState* state) {
if (is_closed()) return;
- if (client_ != nullptr) state->block_mgr()->ClearReservations(client_);
// We may need to clean up input_stream_ if an error occurred at some point.
if (input_stream_ != nullptr) {
input_stream_->Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/analytic-eval-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/analytic-eval-node.h b/be/src/exec/analytic-eval-node.h
index 89c5cf3..eab9198 100644
--- a/be/src/exec/analytic-eval-node.h
+++ b/be/src/exec/analytic-eval-node.h
@@ -19,8 +19,7 @@
#define IMPALA_EXEC_ANALYTIC_EVAL_NODE_H
#include "exec/exec-node.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.h"
+#include "runtime/buffered-tuple-stream-v2.h"
#include "runtime/tuple.h"
namespace impala {
@@ -189,6 +188,10 @@ class AnalyticEvalNode : public ExecNode {
/// Debug string containing the window definition.
std::string DebugWindowString() const;
+ /// The RuntimeState for the fragment instance containing this AnalyticEvalNode. Set
+ /// in Init().
+ RuntimeState* state_;
+
/// Window over which the analytic functions are evaluated. Only used if fn_scope_
/// is ROWS or RANGE.
/// TODO: fn_scope_ and window_ are candidates to be removed during codegen
@@ -254,9 +257,6 @@ class AnalyticEvalNode : public ExecNode {
boost::scoped_ptr<MemPool> curr_tuple_pool_;
boost::scoped_ptr<MemPool> prev_tuple_pool_;
- /// Block manager client used by input_stream_. Not owned.
- BufferedBlockMgr::Client* client_ = nullptr;
-
/////////////////////////////////////////
/// BEGIN: Members that must be Reset()
@@ -330,15 +330,16 @@ class AnalyticEvalNode : public ExecNode {
/// Buffers input rows added in ProcessChildBatch() until enough rows are able to
/// be returned by GetNextOutputBatch(), in which case row batches are returned from
- /// the front of the stream and the underlying buffered blocks are deleted once read.
+ /// the front of the stream and the underlying buffers are deleted once read.
/// The number of rows that must be buffered may vary from an entire partition (e.g.
- /// no order by clause) to a single row (e.g. ROWS windows). When the amount of
- /// buffered data exceeds the available memory in the underlying BufferedBlockMgr,
- /// input_stream_ is unpinned (i.e., possibly spilled to disk if necessary).
- /// The input stream owns tuple data backing rows returned in GetNext(). The blocks
- /// with tuple data are attached to an output row batch on eos or ReachedLimit().
+ /// no order by clause) to a single row (e.g. ROWS windows). If the amount of buffered
+ /// data in 'input_stream_' exceeds the ExecNode's buffer reservation and the stream
+ /// cannot increase the reservation, then 'input_stream_' is unpinned (i.e., spilled to
+ /// disk). The input stream owns tuple data backing rows returned in GetNext(). The
+ /// buffers with tuple data are attached to an output row batch on eos or
+ /// ReachedLimit().
/// TODO: Consider re-pinning unpinned streams when possible.
- boost::scoped_ptr<BufferedTupleStream> input_stream_;
+ boost::scoped_ptr<BufferedTupleStreamV2> input_stream_;
/// Pool used for O(1) allocations that live until Close() or Reset().
/// Does not own data backing tuples returned in GetNext(), so it does not
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/exec-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/exec-node.cc b/be/src/exec/exec-node.cc
index c3d9946..61c8d40 100644
--- a/be/src/exec/exec-node.cc
+++ b/be/src/exec/exec-node.cc
@@ -34,10 +34,10 @@
#include "exec/empty-set-node.h"
#include "exec/exchange-node.h"
#include "exec/hbase-scan-node.h"
-#include "exec/hdfs-scan-node.h"
#include "exec/hdfs-scan-node-mt.h"
-#include "exec/kudu-scan-node.h"
+#include "exec/hdfs-scan-node.h"
#include "exec/kudu-scan-node-mt.h"
+#include "exec/kudu-scan-node.h"
#include "exec/kudu-util.h"
#include "exec/nested-loop-join-node.h"
#include "exec/partial-sort-node.h"
@@ -50,9 +50,14 @@
#include "exec/topn-node.h"
#include "exec/union-node.h"
#include "exec/unnest-node.h"
+#include "exprs/expr.h"
+#include "gutil/strings/substitute.h"
#include "runtime/descriptors.h"
-#include "runtime/mem-tracker.h"
+#include "runtime/exec-env.h"
+#include "runtime/initial-reservations.h"
#include "runtime/mem-pool.h"
+#include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
#include "runtime/row-batch.h"
#include "runtime/runtime-state.h"
#include "util/debug-util.h"
@@ -61,7 +66,10 @@
#include "common/names.h"
using namespace llvm;
+using strings::Substitute;
+DECLARE_int32(be_port);
+DECLARE_string(hostname);
DEFINE_bool(enable_partitioned_hash_join, true, "Deprecated - has no effect");
DEFINE_bool(enable_partitioned_aggregation, true, "Deprecated - has no effect");
@@ -116,6 +124,7 @@ ExecNode::ExecNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl
type_(tnode.node_type),
pool_(pool),
row_descriptor_(descs, tnode.row_tuples, tnode.nullable_tuples),
+ resource_profile_(tnode.resource_profile),
debug_phase_(TExecNodePhase::INVALID),
debug_action_(TDebugAction::WAIT),
limit_(tnode.limit),
@@ -195,7 +204,12 @@ void ExecNode::Close(RuntimeState* state) {
ScalarExprEvaluator::Close(conjunct_evals_, state);
ScalarExpr::Close(conjuncts_);
if (expr_mem_pool() != nullptr) expr_mem_pool_->FreeAll();
-
+ if (buffer_pool_client_.is_registered()) {
+ VLOG_FILE << id_ << " returning reservation " << resource_profile_.min_reservation;
+ state->query_state()->initial_reservations()->Return(
+ &buffer_pool_client_, resource_profile_.min_reservation);
+ state->exec_env()->buffer_pool()->DeregisterClient(&buffer_pool_client_);
+ }
if (mem_tracker() != NULL && mem_tracker()->consumption() != 0) {
LOG(WARNING) << "Query " << state->query_id() << " may have leaked memory." << endl
<< state->instance_mem_tracker()->LogUsage();
@@ -204,6 +218,29 @@ void ExecNode::Close(RuntimeState* state) {
}
}
+Status ExecNode::ClaimBufferReservation(RuntimeState* state) {
+ DCHECK(!buffer_pool_client_.is_registered());
+ BufferPool* buffer_pool = ExecEnv::GetInstance()->buffer_pool();
+ // Check the minimum buffer size in case the minimum buffer size used by the planner
+ // doesn't match this backend's.
+ if (resource_profile_.__isset.spillable_buffer_size &&
+ resource_profile_.spillable_buffer_size < buffer_pool->min_buffer_len()) {
+ return Status(Substitute("Spillable buffer size for node $0 of $1 bytes is less "
+ "than the minimum buffer pool buffer size of $2 bytes",
+ id_, resource_profile_.spillable_buffer_size, buffer_pool->min_buffer_len()));
+ }
+
+ RETURN_IF_ERROR(buffer_pool->RegisterClient(
+ Substitute("$0 id=$1 ptr=$2", PrintPlanNodeType(type_), id_, this),
+ state->query_state()->file_group(), state->instance_buffer_reservation(),
+ mem_tracker(), resource_profile_.max_reservation, runtime_profile(),
+ &buffer_pool_client_));
+ VLOG_FILE << id_ << " claiming reservation " << resource_profile_.min_reservation;
+ state->query_state()->initial_reservations()->Claim(
+ &buffer_pool_client_, resource_profile_.min_reservation);
+ return Status::OK();
+}
+
Status ExecNode::CreateTree(
RuntimeState* state, const TPlan& plan, const DescriptorTbl& descs, ExecNode** root) {
if (plan.nodes.size() == 0) {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/exec-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/exec-node.h b/be/src/exec/exec-node.h
index a107f62..60efff0 100644
--- a/be/src/exec/exec-node.h
+++ b/be/src/exec/exec-node.h
@@ -26,6 +26,8 @@
#include "common/status.h"
#include "exprs/scalar-expr-evaluator.h"
#include "gen-cpp/PlanNodes_types.h"
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/reservation-tracker.h"
#include "runtime/descriptors.h" // for RowDescriptor
#include "util/blocking-queue.h"
#include "util/runtime-profile.h"
@@ -227,6 +229,12 @@ class ExecNode {
protected:
friend class DataSink;
+ /// Initialize 'buffer_pool_client_' and claim the initial reservation for this
+ /// ExecNode. Only needs to be called by ExecNodes that will use the client.
+ /// The client is automatically cleaned up in Close(). Should not be called if
+ /// the client is already open.
+ Status ClaimBufferReservation(RuntimeState* state);
+
/// Extends blocking queue for row batches. Row batches have a property that
/// they must be processed in the order they were produced, even in cancellation
/// paths. Preceding row batches can contain ptrs to memory in subsequent row batches
@@ -276,6 +284,9 @@ class ExecNode {
std::vector<ExecNode*> children_;
RowDescriptor row_descriptor_;
+ /// Resource information sent from the frontend.
+ const TBackendResourceProfile resource_profile_;
+
/// debug-only: if debug_action_ is not INVALID, node will perform action in
/// debug_phase_
TExecNodePhase::type debug_phase_;
@@ -298,6 +309,12 @@ class ExecNode {
/// Created in Prepare().
boost::scoped_ptr<MemPool> expr_mem_pool_;
+ /// Buffer pool client for this node. Initialized with the node's minimum reservation
+ /// in ClaimBufferReservation(). After initialization, the client must hold onto at
+ /// least the minimum reservation so that it can be returned to the initial
+ /// reservations pool in Close().
+ BufferPool::ClientHandle buffer_pool_client_;
+
bool is_closed() const { return is_closed_; }
/// Pointer to the containing SubplanNode or NULL if not inside a subplan.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/hash-table-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table-test.cc b/be/src/exec/hash-table-test.cc
index 42bc7e1..7a6ec9d 100644
--- a/be/src/exec/hash-table-test.cc
+++ b/be/src/exec/hash-table-test.cc
@@ -17,24 +17,27 @@
#include <boost/scoped_ptr.hpp>
-#include <stdlib.h>
#include <stdio.h>
+#include <stdlib.h>
#include <iostream>
+#include <limits>
#include <vector>
-#include "testutil/gtest-util.h"
#include "common/compiler-util.h"
#include "common/init.h"
#include "exec/hash-table.inline.h"
#include "exprs/scalar-expr.h"
#include "exprs/scalar-expr-evaluator.h"
#include "exprs/slot-ref.h"
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/reservation-tracker.h"
#include "runtime/mem-pool.h"
#include "runtime/mem-tracker.h"
#include "runtime/string-value.h"
#include "runtime/test-env.h"
#include "runtime/tuple-row.h"
#include "service/fe-support.h"
+#include "testutil/gtest-util.h"
#include "util/cpu-info.h"
#include "util/runtime-profile-counters.h"
#include "util/test-info.h"
@@ -51,9 +54,16 @@ class HashTableTest : public testing::Test {
HashTableTest() : mem_pool_(&tracker_) {}
protected:
+ /// Temporary runtime environment for the hash table.
scoped_ptr<TestEnv> test_env_;
RuntimeState* runtime_state_;
+
+ /// Hash tables and associated clients - automatically closed in TearDown().
+ vector<BufferPool::ClientHandle*> clients_;
+ vector<HashTable*> hash_tables_;
+
ObjectPool pool_;
+ /// A dummy MemTracker used for exprs and other things we don't need to have limits on.
MemTracker tracker_;
MemPool mem_pool_;
vector<ScalarExpr*> build_exprs_;
@@ -83,6 +93,8 @@ class HashTableTest : public testing::Test {
ASSERT_OK(ScalarExprEvaluator::Create(probe_exprs_, nullptr, &pool_, &mem_pool_,
&probe_expr_evals_));
ASSERT_OK(ScalarExprEvaluator::Open(probe_expr_evals_, nullptr));
+
+ CreateTestEnv();
}
virtual void TearDown() {
@@ -90,9 +102,34 @@ class HashTableTest : public testing::Test {
ScalarExprEvaluator::Close(probe_expr_evals_, nullptr);
ScalarExpr::Close(build_exprs_);
ScalarExpr::Close(probe_exprs_);
+
+ for (HashTable* hash_table : hash_tables_) hash_table->Close();
+ hash_tables_.clear();
+
+ for (BufferPool::ClientHandle* client : clients_) {
+ test_env_->exec_env()->buffer_pool()->DeregisterClient(client);
+ }
+ clients_.clear();
+
runtime_state_ = nullptr;
test_env_.reset();
mem_pool_.FreeAll();
+ pool_.Clear();
+ }
+
+ /// Initialize test_env_ and runtime_state_ with the given page size and capacity
+ /// for the given number of pages. If test_env_ was already created, then re-creates it.
+ void CreateTestEnv(int64_t min_page_size = 64 * 1024,
+ int64_t buffer_bytes_limit = 4L * 1024 * 1024 * 1024) {
+ test_env_.reset(new TestEnv());
+ test_env_->SetBufferPoolArgs(min_page_size, buffer_bytes_limit);
+ ASSERT_OK(test_env_->Init());
+
+ TQueryOptions query_options;
+ query_options.__set_default_spillable_buffer_size(min_page_size);
+ query_options.__set_min_spillable_buffer_size(min_page_size);
+ query_options.__set_buffer_pool_limit(buffer_bytes_limit);
+ ASSERT_OK(test_env_->CreateQueryState(0, &query_options, &runtime_state_));
}
TupleRow* CreateTupleRow(int32_t val) {
@@ -116,8 +153,9 @@ class HashTableTest : public testing::Test {
// Wrapper to call private methods on HashTable
// TODO: understand google testing, there must be a more natural way to do this
- void ResizeTable(HashTable* table, int64_t new_size, HashTableCtx* ht_ctx) {
- table->ResizeBuckets(new_size, ht_ctx);
+ Status ResizeTable(
+ HashTable* table, int64_t new_size, HashTableCtx* ht_ctx, bool* success) {
+ return table->ResizeBuckets(new_size, ht_ctx, success);
}
// Do a full table scan on table. All values should be between [min,max). If
@@ -188,24 +226,41 @@ class HashTableTest : public testing::Test {
}
}
- // Construct hash table with custom block manager. Returns result of HashTable::Init()
- bool CreateHashTable(bool quadratic, int64_t initial_num_buckets,
- scoped_ptr<HashTable>* table, int block_size = 8 * 1024 * 1024,
- int max_num_blocks = 100, int reserved_blocks = 10) {
- EXPECT_OK(test_env_->CreateQueryStateWithBlockMgr(
- next_query_id_++, max_num_blocks, block_size, nullptr, &runtime_state_));
+ /// Construct hash table and buffer pool client.
+ /// Returns true if HashTable::Init() was successful. Created objects
+ /// and resources (e.g. reservations) are automatically freed in TearDown().
+ bool CreateHashTable(bool quadratic, int64_t initial_num_buckets, HashTable** table,
+ int64_t block_size = 8 * 1024 * 1024, int max_num_blocks = 100,
+ int initial_reserved_blocks = 10, int64_t suballocator_buffer_len = 64 * 1024) {
+ BufferPool* buffer_pool = test_env_->exec_env()->buffer_pool();
+ RuntimeProfile* profile = pool_.Add(new RuntimeProfile(&pool_, "ht"));
+
+ // Set up memory tracking for the hash table.
MemTracker* client_tracker =
pool_.Add(new MemTracker(-1, "client", runtime_state_->instance_mem_tracker()));
- BufferedBlockMgr::Client* client;
- EXPECT_OK(runtime_state_->block_mgr()->RegisterClient(
- "", reserved_blocks, false, client_tracker, runtime_state_, &client));
+ int64_t initial_reservation_bytes = block_size * initial_reserved_blocks;
+ int64_t max_reservation_bytes = block_size * max_num_blocks;
+
+ // Set up the memory allocator.
+ BufferPool::ClientHandle* client = pool_.Add(new BufferPool::ClientHandle);
+ clients_.push_back(client);
+ EXPECT_OK(buffer_pool->RegisterClient("", nullptr,
+ runtime_state_->instance_buffer_reservation(), client_tracker,
+ max_reservation_bytes, profile, client));
+ EXPECT_TRUE(client->IncreaseReservation(initial_reservation_bytes));
+ Suballocator* allocator =
+ pool_.Add(new Suballocator(buffer_pool, client, suballocator_buffer_len));
// Initial_num_buckets must be a power of two.
EXPECT_EQ(initial_num_buckets, BitUtil::RoundUpToPowerOfTwo(initial_num_buckets));
int64_t max_num_buckets = 1L << 31;
- table->reset(new HashTable(quadratic, runtime_state_, client, true, 1, nullptr,
- max_num_buckets, initial_num_buckets));
- return (*table)->Init();
+ *table = pool_.Add(new HashTable(
+ quadratic, allocator, true, 1, nullptr, max_num_buckets, initial_num_buckets));
+ hash_tables_.push_back(*table);
+ bool success;
+ Status status = (*table)->Init(&success);
+ EXPECT_OK(status);
+ return status.ok() && success;
}
// Constructs and closes a hash table.
@@ -229,14 +284,12 @@ class HashTableTest : public testing::Test {
EXPECT_EQ(*val_row4, 4);
// Create and close the hash table.
- scoped_ptr<HashTable> hash_table;
+ HashTable* hash_table;
bool initialized = CreateHashTable(quadratic, initial_num_buckets, &hash_table);
EXPECT_EQ(too_big, !initialized);
if (initialized && initial_num_buckets > 0) {
EXPECT_NE(hash_table->ByteSize(), 0);
}
-
- hash_table->Close();
}
// IMPALA-2897: Build rows that are equivalent (where nullptrs are counted as equivalent)
@@ -246,7 +299,7 @@ class HashTableTest : public testing::Test {
for (int i = 0; i < 2; ++i) build_rows[i] = CreateNullTupleRow();
// Create the hash table and insert the build rows
- scoped_ptr<HashTable> hash_table;
+ HashTable* hash_table;
ASSERT_TRUE(CreateHashTable(true, 1024, &hash_table));
scoped_ptr<HashTableCtx> ht_ctx;
EXPECT_OK(HashTableCtx::Create(&pool_, runtime_state_,
@@ -256,13 +309,15 @@ class HashTableTest : public testing::Test {
for (int i = 0; i < 2; ++i) {
if (!ht_ctx->EvalAndHashBuild(build_rows[i])) continue;
- BufferedTupleStream::RowIdx dummy_row_idx;
+ BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
EXPECT_TRUE(hash_table->stores_tuples_);
- bool inserted = hash_table->Insert(ht_ctx.get(), dummy_row_idx, build_rows[i]);
+ Status status;
+ bool inserted =
+ hash_table->Insert(ht_ctx.get(), dummy_flat_row, build_rows[i], &status);
EXPECT_TRUE(inserted);
+ ASSERT_OK(status);
}
EXPECT_EQ(hash_table->num_buckets() - hash_table->EmptyBuckets(), 1);
- hash_table->Close();
ht_ctx->Close(runtime_state_);
}
@@ -282,7 +337,7 @@ class HashTableTest : public testing::Test {
}
// Create the hash table and insert the build rows
- scoped_ptr<HashTable> hash_table;
+ HashTable* hash_table;
ASSERT_TRUE(CreateHashTable(quadratic, initial_num_buckets, &hash_table));
scoped_ptr<HashTableCtx> ht_ctx;
Status status = HashTableCtx::Create(&pool_, runtime_state_, build_exprs_,
@@ -290,52 +345,57 @@ class HashTableTest : public testing::Test {
vector<bool>(build_exprs_.size(), false), 1, 0, 1, &mem_pool_, &ht_ctx);
EXPECT_OK(status);
EXPECT_OK(ht_ctx->Open(runtime_state_));
- bool success = hash_table->CheckAndResize(5, ht_ctx.get());
+ bool success;
+ EXPECT_OK(hash_table->CheckAndResize(5, ht_ctx.get(), &success));
ASSERT_TRUE(success);
for (int i = 0; i < 5; ++i) {
if (!ht_ctx->EvalAndHashBuild(build_rows[i])) continue;
- BufferedTupleStream::RowIdx dummy_row_idx;
+ BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
EXPECT_TRUE(hash_table->stores_tuples_);
- bool inserted = hash_table->Insert(ht_ctx.get(), dummy_row_idx, build_rows[i]);
+ bool inserted =
+ hash_table->Insert(ht_ctx.get(), dummy_flat_row, build_rows[i], &status);
EXPECT_TRUE(inserted);
+ ASSERT_OK(status);
}
EXPECT_EQ(hash_table->size(), 5);
// Do a full table scan and validate returned pointers
- FullScan(hash_table.get(), ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
- ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, 10, false);
+ FullScan(hash_table, ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
+ ProbeTest(hash_table, ht_ctx.get(), probe_rows, 10, false);
// Double the size of the hash table and scan again.
- ResizeTable(hash_table.get(), 2048, ht_ctx.get());
+ EXPECT_OK(ResizeTable(hash_table, 2048, ht_ctx.get(), &success));
+ EXPECT_TRUE(success);
EXPECT_EQ(hash_table->num_buckets(), 2048);
EXPECT_EQ(hash_table->size(), 5);
memset(scan_rows, 0, sizeof(scan_rows));
- FullScan(hash_table.get(), ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
- ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, 10, false);
+ FullScan(hash_table, ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
+ ProbeTest(hash_table, ht_ctx.get(), probe_rows, 10, false);
// Try to shrink and scan again.
- ResizeTable(hash_table.get(), 64, ht_ctx.get());
+ EXPECT_OK(ResizeTable(hash_table, 64, ht_ctx.get(), &success));
+ EXPECT_TRUE(success);
EXPECT_EQ(hash_table->num_buckets(), 64);
EXPECT_EQ(hash_table->size(), 5);
memset(scan_rows, 0, sizeof(scan_rows));
- FullScan(hash_table.get(), ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
- ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, 10, false);
+ FullScan(hash_table, ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
+ ProbeTest(hash_table, ht_ctx.get(), probe_rows, 10, false);
// Resize to 8, which is the smallest value to fit the number of filled buckets.
- ResizeTable(hash_table.get(), 8, ht_ctx.get());
+ EXPECT_OK(ResizeTable(hash_table, 8, ht_ctx.get(), &success));
+ EXPECT_TRUE(success);
EXPECT_EQ(hash_table->num_buckets(), 8);
EXPECT_EQ(hash_table->size(), 5);
memset(scan_rows, 0, sizeof(scan_rows));
- FullScan(hash_table.get(), ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
- ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, 10, false);
+ FullScan(hash_table, ht_ctx.get(), 0, 5, true, scan_rows, build_rows);
+ ProbeTest(hash_table, ht_ctx.get(), probe_rows, 10, false);
- hash_table->Close();
ht_ctx->Close(runtime_state_);
}
- void ScanTest(bool quadratic, int initial_size, int rows_to_insert,
- int additional_rows) {
- scoped_ptr<HashTable> hash_table;
+ void ScanTest(
+ bool quadratic, int initial_size, int rows_to_insert, int additional_rows) {
+ HashTable* hash_table;
ASSERT_TRUE(CreateHashTable(quadratic, initial_size, &hash_table));
int total_rows = rows_to_insert + additional_rows;
@@ -347,19 +407,21 @@ class HashTableTest : public testing::Test {
EXPECT_OK(ht_ctx->Open(runtime_state_));
// Add 1 row with val 1, 2 with val 2, etc.
+ bool success;
vector<TupleRow*> build_rows;
ProbeTestData* probe_rows = new ProbeTestData[total_rows];
probe_rows[0].probe_row = CreateTupleRow(0);
for (int val = 1; val <= rows_to_insert; ++val) {
- bool success = hash_table->CheckAndResize(val, ht_ctx.get());
+ EXPECT_OK(hash_table->CheckAndResize(val, ht_ctx.get(), &success));
EXPECT_TRUE(success) << " failed to resize: " << val;
probe_rows[val].probe_row = CreateTupleRow(val);
for (int i = 0; i < val; ++i) {
TupleRow* row = CreateTupleRow(val);
if (!ht_ctx->EvalAndHashBuild(row)) continue;
- BufferedTupleStream::RowIdx dummy_row_idx;
+ BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
EXPECT_TRUE(hash_table->stores_tuples_);
- hash_table->Insert(ht_ctx.get(), dummy_row_idx, row);
+ ASSERT_TRUE(hash_table->Insert(ht_ctx.get(), dummy_flat_row, row, &status));
+ ASSERT_OK(status);
build_rows.push_back(row);
probe_rows[val].expected_build_rows.push_back(row);
}
@@ -371,21 +433,22 @@ class HashTableTest : public testing::Test {
}
// Test that all the builds were found.
- ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, total_rows, true);
+ ProbeTest(hash_table, ht_ctx.get(), probe_rows, total_rows, true);
// Resize and try again.
int target_size = BitUtil::RoundUpToPowerOfTwo(2 * total_rows);
- ResizeTable(hash_table.get(), target_size, ht_ctx.get());
+ EXPECT_OK(ResizeTable(hash_table, target_size, ht_ctx.get(), &success));
+ EXPECT_TRUE(success);
EXPECT_EQ(hash_table->num_buckets(), target_size);
- ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, total_rows, true);
+ ProbeTest(hash_table, ht_ctx.get(), probe_rows, total_rows, true);
target_size = BitUtil::RoundUpToPowerOfTwo(total_rows + 1);
- ResizeTable(hash_table.get(), target_size, ht_ctx.get());
+ EXPECT_OK(ResizeTable(hash_table, target_size, ht_ctx.get(), &success));
+ EXPECT_TRUE(success);
EXPECT_EQ(hash_table->num_buckets(), target_size);
- ProbeTest(hash_table.get(), ht_ctx.get(), probe_rows, total_rows, true);
+ ProbeTest(hash_table, ht_ctx.get(), probe_rows, total_rows, true);
delete [] probe_rows;
- hash_table->Close();
ht_ctx->Close(runtime_state_);
}
@@ -395,9 +458,11 @@ class HashTableTest : public testing::Test {
uint64_t num_to_add = 4;
int expected_size = 0;
- MemTracker tracker(100 * 1024 * 1024);
- scoped_ptr<HashTable> hash_table;
- ASSERT_TRUE(CreateHashTable(quadratic, num_to_add, &hash_table));
+ // Need enough memory for two hash table bucket directories during resize.
+ const int64_t mem_limit_mb = 128 + 64;
+ HashTable* hash_table;
+ ASSERT_TRUE(
+ CreateHashTable(quadratic, num_to_add, &hash_table, 1024 * 1024, mem_limit_mb));
scoped_ptr<HashTableCtx> ht_ctx;
Status status = HashTableCtx::Create(&pool_, runtime_state_, build_exprs_,
probe_exprs_, false /* !stores_nulls_ */,
@@ -408,27 +473,32 @@ class HashTableTest : public testing::Test {
// entries. When num_to_add == 4, then the total number of inserts is 4194300.
int build_row_val = 0;
for (int i = 0; i < 20; ++i) {
- // Currently the mem used for the bucket is not being tracked by the mem tracker.
- // Thus the resize is expected to be successful.
- // TODO: Keep track of the mem used for the buckets and test cases where we actually
- // hit OOM.
- // TODO: Insert duplicates to also hit OOM.
- bool success = hash_table->CheckAndResize(num_to_add, ht_ctx.get());
- EXPECT_TRUE(success) << " failed to resize: " << num_to_add;
+ bool success;
+ EXPECT_OK(hash_table->CheckAndResize(num_to_add, ht_ctx.get(), &success));
+ EXPECT_TRUE(success) << " failed to resize: " << num_to_add << "\n"
+ << tracker_.LogUsage() << "\n"
+ << clients_.back()->DebugString();
for (int j = 0; j < num_to_add; ++build_row_val, ++j) {
TupleRow* row = CreateTupleRow(build_row_val);
if (!ht_ctx->EvalAndHashBuild(row)) continue;
- BufferedTupleStream::RowIdx dummy_row_idx;
+ BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
EXPECT_TRUE(hash_table->stores_tuples_);
- bool inserted = hash_table->Insert(ht_ctx.get(), dummy_row_idx, row);
+ bool inserted = hash_table->Insert(ht_ctx.get(), dummy_flat_row, row, &status);
+ ASSERT_OK(status);
if (!inserted) goto done_inserting;
}
expected_size += num_to_add;
num_to_add *= 2;
}
- done_inserting:
- EXPECT_FALSE(tracker.LimitExceeded());
+ done_inserting:
EXPECT_EQ(hash_table->size(), 4194300);
+
+ // The next allocation should put us over the limit, since we'll need 128MB for
+ // the old buckets and 256MB for the new buckets.
+ bool success;
+ EXPECT_OK(hash_table->CheckAndResize(num_to_add * 2, ht_ctx.get(), &success));
+ EXPECT_FALSE(success);
+
// Validate that we can find the entries before we went over the limit
for (int i = 0; i < expected_size * 5; i += 100000) {
TupleRow* probe_row = CreateTupleRow(i);
@@ -441,7 +511,34 @@ class HashTableTest : public testing::Test {
EXPECT_TRUE(iter.AtEnd()) << " i: " << i;
}
}
- hash_table->Close();
+
+ // Insert duplicates to also hit OOM.
+ int64_t num_duplicates_inserted = 0;
+ const int DUPLICATE_VAL = 1234;
+ while (true) {
+ TupleRow* duplicate_row = CreateTupleRow(DUPLICATE_VAL);
+ if (!ht_ctx->EvalAndHashBuild(duplicate_row)) continue;
+ BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
+ bool inserted =
+ hash_table->Insert(ht_ctx.get(), dummy_flat_row, duplicate_row, &status);
+ ASSERT_OK(status);
+ if (!inserted) break;
+ ++num_duplicates_inserted;
+ }
+
+ // Check that the duplicates that we successfully inserted are all present.
+ TupleRow* duplicate_row = CreateTupleRow(DUPLICATE_VAL);
+ ASSERT_TRUE(ht_ctx->EvalAndHashProbe(duplicate_row));
+ HashTable::Iterator iter = hash_table->FindProbeRow(ht_ctx.get());
+ ValidateMatch(duplicate_row, iter.GetRow());
+ for (int64_t i = 0; i < num_duplicates_inserted; ++i) {
+ ASSERT_FALSE(iter.AtEnd());
+ iter.NextDuplicate();
+ ValidateMatch(duplicate_row, iter.GetRow());
+ }
+ iter.NextDuplicate();
+ EXPECT_TRUE(iter.AtEnd());
+
ht_ctx->Close(runtime_state_);
}
@@ -450,7 +547,7 @@ class HashTableTest : public testing::Test {
// enough space in the hash table (it is also expected to be slow). It also expects that
// a probe for a N+1 element will return BUCKET_NOT_FOUND.
void InsertFullTest(bool quadratic, int table_size) {
- scoped_ptr<HashTable> hash_table;
+ HashTable* hash_table;
ASSERT_TRUE(CreateHashTable(quadratic, table_size, &hash_table));
EXPECT_EQ(hash_table->EmptyBuckets(), table_size);
scoped_ptr<HashTableCtx> ht_ctx;
@@ -472,10 +569,11 @@ class HashTableTest : public testing::Test {
// Insert using both Insert() and FindBucket() methods.
if (build_row_val % 2 == 0) {
- BufferedTupleStream::RowIdx dummy_row_idx;
+ BufferedTupleStreamV2::FlatRowPtr dummy_flat_row = nullptr;
EXPECT_TRUE(hash_table->stores_tuples_);
- bool inserted = hash_table->Insert(ht_ctx.get(), dummy_row_idx, row);
+ bool inserted = hash_table->Insert(ht_ctx.get(), dummy_flat_row, row, &status);
EXPECT_TRUE(inserted);
+ ASSERT_OK(status);
} else {
iter = hash_table->FindBuildRowBucket(ht_ctx.get(), &found);
EXPECT_FALSE(iter.AtEnd());
@@ -511,20 +609,20 @@ class HashTableTest : public testing::Test {
EXPECT_TRUE(iter.AtEnd());
EXPECT_FALSE(found);
- hash_table->Close();
ht_ctx->Close(runtime_state_);
}
// This test makes sure we can tolerate the low memory case where we do not have enough
// memory to allocate the array of buckets for the hash table.
void VeryLowMemTest(bool quadratic) {
- const int block_size = 2 * 1024;
+ const int64_t block_size = 2 * 1024;
const int max_num_blocks = 1;
- const int reserved_blocks = 0;
const int table_size = 1024;
- scoped_ptr<HashTable> hash_table;
- ASSERT_FALSE(CreateHashTable(quadratic, table_size, &hash_table, block_size,
- max_num_blocks, reserved_blocks));
+ CreateTestEnv(block_size, block_size * max_num_blocks);
+
+ HashTable* hash_table;
+ ASSERT_FALSE(CreateHashTable(
+ quadratic, table_size, &hash_table, block_size, max_num_blocks, 0, 1024));
scoped_ptr<HashTableCtx> ht_ctx;
Status status = HashTableCtx::Create(&pool_, runtime_state_, build_exprs_,
probe_exprs_, false /* !stores_nulls_ */, vector<bool>(build_exprs_.size(), false), 1, 0, 1,
@@ -532,7 +630,6 @@ class HashTableTest : public testing::Test {
EXPECT_OK(status);
HashTable::Iterator iter = hash_table->Begin(ht_ctx.get());
EXPECT_TRUE(iter.AtEnd());
- hash_table->Close();
ht_ctx->Close(runtime_state_);
}
};
@@ -612,8 +709,6 @@ TEST_F(HashTableTest, QuadraticInsertFullTest) {
// Test that hashing empty string updates hash value.
TEST_F(HashTableTest, HashEmpty) {
- EXPECT_OK(test_env_->CreateQueryStateWithBlockMgr(
- 0, 100, 8 * 1024 * 1024, nullptr, &runtime_state_));
scoped_ptr<HashTableCtx> ht_ctx;
Status status = HashTableCtx::Create(&pool_, runtime_state_, build_exprs_,
probe_exprs_, false /* !stores_nulls_ */,
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/hash-table.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc
index a4856e9..aacedc2 100644
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc
@@ -26,7 +26,7 @@
#include "exprs/slot-ref.h"
#include "exprs/scalar-expr.h"
#include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-block-mgr.h"
+#include "runtime/bufferpool/reservation-tracker.h"
#include "runtime/mem-tracker.h"
#include "runtime/raw-value.inline.h"
#include "runtime/runtime-state.h"
@@ -37,8 +37,17 @@
#include "common/names.h"
using namespace impala;
-using namespace llvm;
-using namespace strings;
+using llvm::APFloat;
+using llvm::ArrayRef;
+using llvm::BasicBlock;
+using llvm::ConstantFP;
+using llvm::Function;
+using llvm::LLVMContext;
+using llvm::PHINode;
+using llvm::PointerType;
+using llvm::Type;
+using llvm::Value;
+using strings::Substitute;
DEFINE_bool(enable_quadratic_probing, true, "Enable quadratic probing hash table");
@@ -85,12 +94,6 @@ static int64_t NULL_VALUE[] = {
static_assert(sizeof(NULL_VALUE) >= ColumnType::MAX_CHAR_LENGTH,
"NULL_VALUE must be at least as large as the largest possible slot");
-// The first NUM_SMALL_BLOCKS of nodes_ are made of blocks less than the IO size (of 8MB)
-// to reduce the memory footprint of small queries. In particular, we always first use a
-// 64KB and a 512KB block before starting using IO-sized blocks.
-static const int64_t INITIAL_DATA_PAGE_SIZES[] = { 64 * 1024, 512 * 1024 };
-static const int NUM_SMALL_DATA_PAGES = sizeof(INITIAL_DATA_PAGE_SIZES) / sizeof(int64_t);
-
HashTableCtx::HashTableCtx(const std::vector<ScalarExpr*>& build_exprs,
const std::vector<ScalarExpr*>& probe_exprs, bool stores_nulls,
const std::vector<bool>& finds_nulls, int32_t initial_seed,
@@ -378,21 +381,20 @@ void HashTableCtx::ExprValuesCache::ResetForRead() {
ResetIterators();
}
-const double HashTable::MAX_FILL_FACTOR = 0.75f;
+constexpr double HashTable::MAX_FILL_FACTOR;
+constexpr int64_t HashTable::DATA_PAGE_SIZE;
-HashTable* HashTable::Create(RuntimeState* state,
- BufferedBlockMgr::Client* client, bool stores_duplicates, int num_build_tuples,
- BufferedTupleStream* tuple_stream, int64_t max_num_buckets,
+HashTable* HashTable::Create(Suballocator* allocator, bool stores_duplicates,
+ int num_build_tuples, BufferedTupleStreamV2* tuple_stream, int64_t max_num_buckets,
int64_t initial_num_buckets) {
- return new HashTable(FLAGS_enable_quadratic_probing, state, client, stores_duplicates,
+ return new HashTable(FLAGS_enable_quadratic_probing, allocator, stores_duplicates,
num_build_tuples, tuple_stream, max_num_buckets, initial_num_buckets);
}
-HashTable::HashTable(bool quadratic_probing, RuntimeState* state,
- BufferedBlockMgr::Client* client, bool stores_duplicates, int num_build_tuples,
- BufferedTupleStream* stream, int64_t max_num_buckets, int64_t num_buckets)
- : state_(state),
- block_mgr_client_(client),
+HashTable::HashTable(bool quadratic_probing, Suballocator* allocator,
+ bool stores_duplicates, int num_build_tuples, BufferedTupleStreamV2* stream,
+ int64_t max_num_buckets, int64_t num_buckets)
+ : allocator_(allocator),
tuple_stream_(stream),
stores_tuples_(num_build_tuples == 1),
stores_duplicates_(stores_duplicates),
@@ -410,26 +412,23 @@ HashTable::HashTable(bool quadratic_probing, RuntimeState* state,
has_matches_(false),
num_probes_(0), num_failed_probes_(0), travel_length_(0), num_hash_collisions_(0),
num_resizes_(0) {
- DCHECK_EQ((num_buckets & (num_buckets-1)), 0) << "num_buckets must be a power of 2";
+ DCHECK_EQ((num_buckets & (num_buckets - 1)), 0) << "num_buckets must be a power of 2";
DCHECK_GT(num_buckets, 0) << "num_buckets must be larger than 0";
DCHECK(stores_tuples_ || stream != NULL);
- DCHECK(client != NULL);
}
-bool HashTable::Init() {
+Status HashTable::Init(bool* got_memory) {
int64_t buckets_byte_size = num_buckets_ * sizeof(Bucket);
- if (!state_->block_mgr()->ConsumeMemory(block_mgr_client_, buckets_byte_size)) {
- num_buckets_ = 0;
- return false;
- }
- buckets_ = reinterpret_cast<Bucket*>(malloc(buckets_byte_size));
- if (buckets_ == NULL) {
- state_->block_mgr()->ReleaseMemory(block_mgr_client_, buckets_byte_size);
+ RETURN_IF_ERROR(allocator_->Allocate(buckets_byte_size, &bucket_allocation_));
+ if (bucket_allocation_ == nullptr) {
num_buckets_ = 0;
- return false;
+ *got_memory = false;
+ return Status::OK();
}
+ buckets_ = reinterpret_cast<Bucket*>(bucket_allocation_->data());
memset(buckets_, 0, buckets_byte_size);
- return true;
+ *got_memory = true;
+ return Status::OK();
}
void HashTable::Close() {
@@ -439,36 +438,39 @@ void HashTable::Close() {
const int64_t HEAVILY_USED = 1024 * 1024;
// TODO: These statistics should go to the runtime profile as well.
if ((num_buckets_ > LARGE_HT) || (num_probes_ > HEAVILY_USED)) VLOG(2) << PrintStats();
- for (int i = 0; i < data_pages_.size(); ++i) {
- data_pages_[i]->Delete();
- }
+ for (auto& data_page : data_pages_) allocator_->Free(move(data_page));
+ data_pages_.clear();
if (ImpaladMetrics::HASH_TABLE_TOTAL_BYTES != NULL) {
ImpaladMetrics::HASH_TABLE_TOTAL_BYTES->Increment(-total_data_page_size_);
}
- data_pages_.clear();
- if (buckets_ != NULL) free(buckets_);
- state_->block_mgr()->ReleaseMemory(block_mgr_client_, num_buckets_ * sizeof(Bucket));
+ if (bucket_allocation_ != nullptr) allocator_->Free(move(bucket_allocation_));
}
-bool HashTable::CheckAndResize(uint64_t buckets_to_fill, const HashTableCtx* ht_ctx) {
+Status HashTable::CheckAndResize(
+ uint64_t buckets_to_fill, const HashTableCtx* ht_ctx, bool* got_memory) {
uint64_t shift = 0;
while (num_filled_buckets_ + buckets_to_fill >
(num_buckets_ << shift) * MAX_FILL_FACTOR) {
- // TODO: next prime instead of double?
++shift;
}
- if (shift > 0) return ResizeBuckets(num_buckets_ << shift, ht_ctx);
- return true;
+ if (shift > 0) return ResizeBuckets(num_buckets_ << shift, ht_ctx, got_memory);
+ *got_memory = true;
+ return Status::OK();
}
-bool HashTable::ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx) {
- DCHECK_EQ((num_buckets & (num_buckets-1)), 0)
+Status HashTable::ResizeBuckets(
+ int64_t num_buckets, const HashTableCtx* ht_ctx, bool* got_memory) {
+ DCHECK_EQ((num_buckets & (num_buckets - 1)), 0)
<< "num_buckets=" << num_buckets << " must be a power of 2";
- DCHECK_GT(num_buckets, num_filled_buckets_) << "Cannot shrink the hash table to "
- "smaller number of buckets than the number of filled buckets.";
- VLOG(2) << "Resizing hash table from "
- << num_buckets_ << " to " << num_buckets << " buckets.";
- if (max_num_buckets_ != -1 && num_buckets > max_num_buckets_) return false;
+ DCHECK_GT(num_buckets, num_filled_buckets_)
+ << "Cannot shrink the hash table to smaller number of buckets than the number of "
+ << "filled buckets.";
+ VLOG(2) << "Resizing hash table from " << num_buckets_ << " to " << num_buckets
+ << " buckets.";
+ if (max_num_buckets_ != -1 && num_buckets > max_num_buckets_) {
+ *got_memory = false;
+ return Status::OK();
+ }
++num_resizes_;
// All memory that can grow proportional to the input should come from the block mgrs
@@ -476,14 +478,16 @@ bool HashTable::ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx) {
// Note that while we copying over the contents of the old hash table, we need to have
// allocated both the old and the new hash table. Once we finish, we return the memory
// of the old hash table.
- int64_t old_size = num_buckets_ * sizeof(Bucket);
+  // The old bucket directory ('bucket_allocation_') is freed after the copy below.
int64_t new_size = num_buckets * sizeof(Bucket);
- if (!state_->block_mgr()->ConsumeMemory(block_mgr_client_, new_size)) return false;
- Bucket* new_buckets = reinterpret_cast<Bucket*>(malloc(new_size));
- if (new_buckets == NULL) {
- state_->block_mgr()->ReleaseMemory(block_mgr_client_, new_size);
- return false;
+
+ unique_ptr<Suballocation> new_allocation;
+ RETURN_IF_ERROR(allocator_->Allocate(new_size, &new_allocation));
+ if (new_allocation == NULL) {
+ *got_memory = false;
+ return Status::OK();
}
+ Bucket* new_buckets = reinterpret_cast<Bucket*>(new_allocation->data());
memset(new_buckets, 0, new_size);
// Walk the old table and copy all the filled buckets to the new (resized) table.
@@ -503,28 +507,22 @@ bool HashTable::ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx) {
}
num_buckets_ = num_buckets;
- free(buckets_);
- buckets_ = new_buckets;
- state_->block_mgr()->ReleaseMemory(block_mgr_client_, old_size);
- return true;
+ allocator_->Free(move(bucket_allocation_));
+ bucket_allocation_ = move(new_allocation);
+ buckets_ = reinterpret_cast<Bucket*>(bucket_allocation_->data());
+ *got_memory = true;
+ return Status::OK();
}
-bool HashTable::GrowNodeArray() {
- int64_t page_size = 0;
- page_size = state_->block_mgr()->max_block_size();
- if (data_pages_.size() < NUM_SMALL_DATA_PAGES) {
- page_size = min(page_size, INITIAL_DATA_PAGE_SIZES[data_pages_.size()]);
- }
- BufferedBlockMgr::Block* block = NULL;
- Status status = state_->block_mgr()->GetNewBlock(
- block_mgr_client_, NULL, &block, page_size);
- DCHECK(status.ok() || block == NULL);
- if (block == NULL) return false;
- data_pages_.push_back(block);
- next_node_ = block->Allocate<DuplicateNode>(page_size);
- ImpaladMetrics::HASH_TABLE_TOTAL_BYTES->Increment(page_size);
- node_remaining_current_page_ = page_size / sizeof(DuplicateNode);
- total_data_page_size_ += page_size;
+bool HashTable::GrowNodeArray(Status* status) {
+ unique_ptr<Suballocation> allocation;
+ *status = allocator_->Allocate(DATA_PAGE_SIZE, &allocation);
+ if (!status->ok() || allocation == nullptr) return false;
+ next_node_ = reinterpret_cast<DuplicateNode*>(allocation->data());
+ data_pages_.push_back(move(allocation));
+ ImpaladMetrics::HASH_TABLE_TOTAL_BYTES->Increment(DATA_PAGE_SIZE);
+ node_remaining_current_page_ = DATA_PAGE_SIZE / sizeof(DuplicateNode);
+ total_data_page_size_ += DATA_PAGE_SIZE;
return true;
}
@@ -533,8 +531,7 @@ void HashTable::DebugStringTuple(stringstream& ss, HtData& htdata,
if (stores_tuples_) {
ss << "(" << htdata.tuple << ")";
} else {
- ss << "(" << htdata.idx.block() << ", " << htdata.idx.idx()
- << ", " << htdata.idx.offset() << ")";
+ ss << "(" << htdata.flat_row << ")";
}
if (desc != NULL) {
Tuple* row[num_build_tuples_];
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/hash-table.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.h b/be/src/exec/hash-table.h
index 9ba5b04..297e619 100644
--- a/be/src/exec/hash-table.h
+++ b/be/src/exec/hash-table.h
@@ -15,19 +15,21 @@
// specific language governing permissions and limitations
// under the License.
-
#ifndef IMPALA_EXEC_HASH_TABLE_H
#define IMPALA_EXEC_HASH_TABLE_H
+#include <memory>
#include <vector>
#include <boost/cstdint.hpp>
#include <boost/scoped_ptr.hpp>
+
#include "codegen/impala-ir.h"
-#include "common/logging.h"
#include "common/compiler-util.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "common/logging.h"
+#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/suballocator.h"
#include "runtime/tuple-row.h"
#include "util/bitmap.h"
#include "util/hash-util.h"
@@ -101,7 +103,6 @@ class HashTable;
/// Inserts(). We may want to optimize joins more heavily for Inserts() (in particular
/// growing).
/// TODO: Batched interface for inserts and finds.
-/// TODO: Do we need to check mem limit exceeded so often. Check once per batch?
/// TODO: as an optimization, compute variable-length data size for the agg node.
/// Control block for a hash table. This class contains the logic as well as the variables
@@ -525,13 +526,15 @@ class HashTableCtx {
/// nodes do not contain the hash value, because all the linked nodes have the same hash
/// value, the one in the bucket. The data is either a tuple stream index or a Tuple*.
/// This array of buckets is sparse, we are shooting for up to 3/4 fill factor (75%). The
-/// data allocated by the hash table comes from the BufferedBlockMgr.
+/// data allocated by the hash table comes from the BufferPool.
class HashTable {
private:
-
- /// Either the row in the tuple stream or a pointer to the single tuple of this row.
+ /// Rows are represented as pointers into the BufferedTupleStream data with one
+ /// of two formats, depending on the number of tuples in the row.
union HtData {
- BufferedTupleStream::RowIdx idx;
+ // For rows with multiple tuples per row, a pointer to the flattened TupleRow.
+ BufferedTupleStreamV2::FlatRowPtr flat_row;
+ // For rows with one tuple per row, a pointer to the Tuple itself.
Tuple* tuple;
};
@@ -584,7 +587,7 @@ class HashTable {
/// Returns a newly allocated HashTable. The probing algorithm is set by the
/// FLAG_enable_quadratic_probing.
- /// - client: block mgr client to allocate data pages from.
+ /// - allocator: allocator to allocate bucket directory and data pages from.
/// - stores_duplicates: true if rows with duplicate keys may be inserted into the
/// hash table.
/// - num_build_tuples: number of Tuples in the build tuple row.
@@ -596,31 +599,35 @@ class HashTable {
/// -1, if it unlimited.
/// - initial_num_buckets: number of buckets that the hash table should be initialized
/// with.
- static HashTable* Create(RuntimeState* state, BufferedBlockMgr::Client* client,
- bool stores_duplicates, int num_build_tuples, BufferedTupleStream* tuple_stream,
- int64_t max_num_buckets, int64_t initial_num_buckets);
+ static HashTable* Create(Suballocator* allocator, bool stores_duplicates,
+ int num_build_tuples, BufferedTupleStreamV2* tuple_stream, int64_t max_num_buckets,
+ int64_t initial_num_buckets);
- /// Allocates the initial bucket structure. Returns false if OOM.
- bool Init();
+ /// Allocates the initial bucket structure. Returns a non-OK status if an error is
+  /// encountered. If an OK status is returned, 'got_memory' is set to indicate whether
+ /// enough memory for the initial buckets was allocated from the Suballocator.
+ Status Init(bool* got_memory) WARN_UNUSED_RESULT;
/// Call to cleanup any resources. Must be called once.
void Close();
- /// Inserts the row to the hash table. Returns true if the insertion was successful.
- /// Always returns true if the table has free buckets and the key is not a duplicate.
- /// The caller is responsible for ensuring that the table has free buckets
- /// 'idx' is the index into tuple_stream_ for this row. If the row contains more than
- /// one tuple, the 'idx' is stored instead of the 'row'. The 'row' is not copied by the
- /// hash table and the caller must guarantee it stays in memory. This will not grow the
- /// hash table. In the case that there is a need to insert a duplicate node, instead of
- /// filling a new bucket, and there is not enough memory to insert a duplicate node,
- /// the insert fails and this function returns false.
- /// Used during the build phase of hash joins.
+ /// Inserts the row to the hash table. The caller is responsible for ensuring that the
+ /// table has free buckets. Returns true if the insertion was successful. Always
+ /// returns true if the table has free buckets and the key is not a duplicate. If the
+ /// key was a duplicate and memory could not be allocated for the new duplicate node,
+ /// returns false. If an error is encountered while creating a duplicate node, returns
+ /// false and sets 'status' to the error.
+ ///
+  /// 'flat_row' is a pointer to the flattened row in 'tuple_stream_'. If the row contains
+ /// only one tuple, a pointer to that tuple is stored. Otherwise the 'flat_row' pointer
+ /// is stored. The 'row' is not copied by the hash table and the caller must guarantee
+ /// it stays in memory. This will not grow the hash table.
bool IR_ALWAYS_INLINE Insert(HashTableCtx* ht_ctx,
- const BufferedTupleStream::RowIdx& idx, TupleRow* row);
+ BufferedTupleStreamV2::FlatRowPtr flat_row, TupleRow* row,
+ Status* status) WARN_UNUSED_RESULT;
/// Prefetch the hash table bucket which the given hash value 'hash' maps to.
- template<const bool READ>
+ template <const bool READ>
void IR_ALWAYS_INLINE PrefetchBucket(uint32_t hash);
/// Returns an iterator to the bucket that matches the probe expression results that
@@ -680,12 +687,17 @@ class HashTable {
/// Calculates the fill factor if 'buckets_to_fill' additional buckets were to be
/// filled and resizes the hash table so that the projected fill factor is below the
/// max fill factor.
- /// If it returns true, then it is guaranteed at least 'rows_to_add' rows can be
- /// inserted without need to resize.
- bool CheckAndResize(uint64_t buckets_to_fill, const HashTableCtx* ht_ctx);
+ /// If 'got_memory' is true, then it is guaranteed at least 'rows_to_add' rows can be
+ /// inserted without need to resize. If there is not enough memory available to
+  /// resize the hash table, Status::OK() is returned and 'got_memory' is false. If
+  /// another error occurs, an error status may be returned.
+ Status CheckAndResize(uint64_t buckets_to_fill, const HashTableCtx* ht_ctx,
+ bool* got_memory) WARN_UNUSED_RESULT;
/// Returns the number of bytes allocated to the hash table from the block manager.
- int64_t ByteSize() const { return num_buckets_ * sizeof(Bucket) + total_data_page_size_; }
+ int64_t ByteSize() const {
+ return num_buckets_ * sizeof(Bucket) + total_data_page_size_;
+ }
/// Returns an iterator at the beginning of the hash table. Advancing this iterator
/// will traverse all elements.
@@ -792,7 +804,6 @@ class HashTable {
TupleRow* scratch_row_;
/// Current bucket idx.
- /// TODO: Use uint32_t?
int64_t bucket_idx_;
/// Pointer to the current duplicate node.
@@ -807,9 +818,9 @@ class HashTable {
/// of calling this constructor directly.
/// - quadratic_probing: set to true when the probing algorithm is quadratic, as
/// opposed to linear.
- HashTable(bool quadratic_probing, RuntimeState* state, BufferedBlockMgr::Client* client,
- bool stores_duplicates, int num_build_tuples, BufferedTupleStream* tuple_stream,
- int64_t max_num_buckets, int64_t initial_num_buckets);
+ HashTable(bool quadratic_probing, Suballocator* allocator, bool stores_duplicates,
+ int num_build_tuples, BufferedTupleStreamV2* tuple_stream, int64_t max_num_buckets,
+ int64_t initial_num_buckets);
/// Performs the probing operation according to the probing algorithm (linear or
  /// quadratic). Returns one of the following:
@@ -839,8 +850,10 @@ class HashTable {
HashTableCtx* ht_ctx, uint32_t hash, bool* found);
/// Performs the insert logic. Returns the HtData* of the bucket or duplicate node
- /// where the data should be inserted. Returns NULL if the insert was not successful.
- HtData* IR_ALWAYS_INLINE InsertInternal(HashTableCtx* ht_ctx);
+ /// where the data should be inserted. Returns NULL if the insert was not successful
+  /// and sets 'status' to OK if it failed because not enough reservation was
+  /// available, or to an error status if an error was encountered.
+ HtData* IR_ALWAYS_INLINE InsertInternal(HashTableCtx* ht_ctx, Status* status);
/// Updates 'bucket_idx' to the index of the next non-empty bucket. If the bucket has
/// duplicates, 'node' will be pointing to the head of the linked list of duplicates.
@@ -848,8 +861,8 @@ class HashTable {
/// 'bucket_idx' to BUCKET_NOT_FOUND.
void NextFilledBucket(int64_t* bucket_idx, DuplicateNode** node);
- /// Resize the hash table to 'num_buckets'. Returns false on OOM.
- bool ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx);
+ /// Resize the hash table to 'num_buckets'. 'got_memory' is false on OOM.
+ Status ResizeBuckets(int64_t num_buckets, const HashTableCtx* ht_ctx, bool* got_memory);
/// Appends the DuplicateNode pointed by next_node_ to 'bucket' and moves the next_node_
/// pointer to the next DuplicateNode in the page, updating the remaining node counter.
@@ -862,9 +875,10 @@ class HashTable {
/// the bucket is converted to a DuplicateNode. That is, the contents of 'data' of the
/// bucket are copied to a DuplicateNode and 'data' is updated to pointing to a
/// DuplicateNode.
- /// Returns NULL if the node array could not grow, i.e. there was not enough memory to
- /// allocate a new DuplicateNode.
- DuplicateNode* IR_ALWAYS_INLINE InsertDuplicateNode(int64_t bucket_idx);
+ /// Returns NULL and sets 'status' to OK if the node array could not grow, i.e. there
+ /// was not enough memory to allocate a new DuplicateNode. Returns NULL and sets
+ /// 'status' to an error if another error was encountered.
+ DuplicateNode* IR_ALWAYS_INLINE InsertDuplicateNode(int64_t bucket_idx, Status* status);
/// Resets the contents of the empty bucket with index 'bucket_idx', in preparation for
/// an insert. Sets all the fields of the bucket other than 'data'.
@@ -877,8 +891,10 @@ class HashTable {
/// returns the content of the first chained duplicate node of the bucket.
TupleRow* GetRow(Bucket* bucket, TupleRow* row) const;
- /// Grow the node array. Returns false on OOM.
- bool GrowNodeArray();
+ /// Grow the node array. Returns true and sets 'status' to OK on success. Returns false
+  /// and sets 'status' to OK if we can't get sufficient reservation to allocate the next
+ /// data page. Returns false and sets 'status' if another error is encountered.
+ bool GrowNodeArray(Status* status);
/// Functions to be replaced by codegen to specialize the hash table.
bool IR_NO_INLINE stores_tuples() const { return stores_tuples_; }
@@ -887,20 +903,26 @@ class HashTable {
/// Load factor that will trigger growing the hash table on insert. This is
/// defined as the number of non-empty buckets / total_buckets
- static const double MAX_FILL_FACTOR;
+ static constexpr double MAX_FILL_FACTOR = 0.75;
+
+ /// The size in bytes of each page of duplicate nodes. Should be large enough to fit
+ /// enough DuplicateNodes to amortise the overhead of allocating each page and low
+ /// enough to not waste excessive memory to internal fragmentation.
+ static constexpr int64_t DATA_PAGE_SIZE = 64L * 1024;
RuntimeState* state_;
- /// Client to allocate data pages with.
- BufferedBlockMgr::Client* block_mgr_client_;
+ /// Suballocator to allocate data pages and hash table buckets with.
+ Suballocator* allocator_;
/// Stream contains the rows referenced by the hash table. Can be NULL if the
/// row only contains a single tuple, in which case the TupleRow indirection
/// is removed by the hash table.
- BufferedTupleStream* tuple_stream_;
+ BufferedTupleStreamV2* tuple_stream_;
- /// Constants on how the hash table should behave. Joins and aggs have slightly
- /// different behavior.
+ /// Constants on how the hash table should behave.
+
+ /// True if the HtData uses the Tuple* representation, or false if it uses FlatRowPtr.
const bool stores_tuples_;
/// True if duplicates may be inserted into hash table.
@@ -909,8 +931,9 @@ class HashTable {
/// Quadratic probing enabled (as opposed to linear).
const bool quadratic_probing_;
- /// Data pages for all nodes. These are always pinned.
- std::vector<BufferedBlockMgr::Block*> data_pages_;
+ /// Data pages for all nodes. Allocated from suballocator to reduce memory
+ /// consumption of small tables.
+ std::vector<std::unique_ptr<Suballocation>> data_pages_;
/// Byte size of all buffers in data_pages_.
int64_t total_data_page_size_;
@@ -926,8 +949,10 @@ class HashTable {
const int64_t max_num_buckets_;
- /// Array of all buckets. Owned by this node. Using c-style array to control
- /// control memory footprint.
+ /// Allocation containing all buckets.
+ std::unique_ptr<Suballocation> bucket_allocation_;
+
+ /// Pointer to the 'buckets_' array from 'bucket_allocation_'.
Bucket* buckets_;
/// Total number of buckets (filled and empty).
@@ -943,9 +968,8 @@ class HashTable {
/// Number of build tuples, used for constructing temp row* for probes.
const int num_build_tuples_;
- /// Flag used to disable spilling hash tables that already had matches in case of
- /// right joins (IMPALA-1488).
- /// TODO: Not fail when spilling hash tables with matches in right joins
+ /// Flag used to check that we don't lose stored matches when spilling hash tables
+ /// (IMPALA-1488).
bool has_matches_;
/// The stats below can be used for debugging perf.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/hash-table.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hash-table.inline.h b/be/src/exec/hash-table.inline.h
index aff7c14..ce2f784 100644
--- a/be/src/exec/hash-table.inline.h
+++ b/be/src/exec/hash-table.inline.h
@@ -90,7 +90,8 @@ inline int64_t HashTable::Probe(Bucket* buckets, int64_t num_buckets,
return Iterator::BUCKET_NOT_FOUND;
}
-inline HashTable::HtData* HashTable::InsertInternal(HashTableCtx* ht_ctx) {
+inline HashTable::HtData* HashTable::InsertInternal(
+ HashTableCtx* ht_ctx, Status* status) {
++num_probes_;
bool found = false;
uint32_t hash = ht_ctx->expr_values_cache()->CurExprValuesHash();
@@ -98,7 +99,7 @@ inline HashTable::HtData* HashTable::InsertInternal(HashTableCtx* ht_ctx) {
DCHECK_NE(bucket_idx, Iterator::BUCKET_NOT_FOUND);
if (found) {
// We need to insert a duplicate node, note that this may fail to allocate memory.
- DuplicateNode* new_node = InsertDuplicateNode(bucket_idx);
+ DuplicateNode* new_node = InsertDuplicateNode(bucket_idx, status);
if (UNLIKELY(new_node == NULL)) return NULL;
return &new_node->htdata;
} else {
@@ -108,14 +109,14 @@ inline HashTable::HtData* HashTable::InsertInternal(HashTableCtx* ht_ctx) {
}
inline bool HashTable::Insert(HashTableCtx* ht_ctx,
- const BufferedTupleStream::RowIdx& idx, TupleRow* row) {
- HtData* htdata = InsertInternal(ht_ctx);
+ BufferedTupleStreamV2::FlatRowPtr flat_row, TupleRow* row, Status* status) {
+ HtData* htdata = InsertInternal(ht_ctx, status);
   // If successful insert, update the contents of the newly inserted entry with 'flat_row'.
if (LIKELY(htdata != NULL)) {
if (stores_tuples()) {
htdata->tuple = row->GetTuple(0);
} else {
- htdata->idx = idx;
+ htdata->flat_row = flat_row;
}
return true;
}
@@ -213,7 +214,8 @@ inline HashTable::DuplicateNode* HashTable::AppendNextNode(Bucket* bucket) {
return next_node_++;
}
-inline HashTable::DuplicateNode* HashTable::InsertDuplicateNode(int64_t bucket_idx) {
+inline HashTable::DuplicateNode* HashTable::InsertDuplicateNode(
+ int64_t bucket_idx, Status* status) {
DCHECK_GE(bucket_idx, 0);
DCHECK_LT(bucket_idx, num_buckets_);
Bucket* bucket = &buckets_[bucket_idx];
@@ -222,12 +224,12 @@ inline HashTable::DuplicateNode* HashTable::InsertDuplicateNode(int64_t bucket_i
// Allocate one duplicate node for the new data and one for the preexisting data,
// if needed.
while (node_remaining_current_page_ < 1 + !bucket->hasDuplicates) {
- if (UNLIKELY(!GrowNodeArray())) return NULL;
+ if (UNLIKELY(!GrowNodeArray(status))) return NULL;
}
if (!bucket->hasDuplicates) {
// This is the first duplicate in this bucket. It means that we need to convert
// the current entry in the bucket to a node and link it from the bucket.
- next_node_->htdata.idx = bucket->bucketData.htdata.idx;
+ next_node_->htdata.flat_row = bucket->bucketData.htdata.flat_row;
DCHECK(!bucket->matched);
next_node_->matched = false;
next_node_->next = NULL;
@@ -246,7 +248,7 @@ inline TupleRow* IR_ALWAYS_INLINE HashTable::GetRow(HtData& htdata, TupleRow* ro
return reinterpret_cast<TupleRow*>(&htdata.tuple);
} else {
// TODO: GetTupleRow() has interpreted code that iterates over the row's descriptor.
- tuple_stream_->GetTupleRow(htdata.idx, row);
+ tuple_stream_->GetTupleRow(htdata.flat_row, row);
return row;
}
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/nested-loop-join-builder.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/nested-loop-join-builder.cc b/be/src/exec/nested-loop-join-builder.cc
index 67e6ed6..fdd94ee 100644
--- a/be/src/exec/nested-loop-join-builder.cc
+++ b/be/src/exec/nested-loop-join-builder.cc
@@ -45,8 +45,7 @@ Status NljBuilder::Send(RuntimeState* state, RowBatch* batch) {
build_batch->AcquireState(batch);
AddBuildBatch(build_batch);
- if (build_batch->needs_deep_copy() || build_batch->num_blocks() > 0
- || build_batch->num_buffers() > 0) {
+ if (build_batch->needs_deep_copy() || build_batch->num_buffers() > 0) {
// This batch and earlier batches may refer to resources passed from the child
// that aren't owned by the row batch itself. Deep copying ensures that the row
// batches are backed by memory owned by this node that is safe to hold on to.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partial-sort-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partial-sort-node.cc b/be/src/exec/partial-sort-node.cc
index 4f485d5..88b2f26 100644
--- a/be/src/exec/partial-sort-node.cc
+++ b/be/src/exec/partial-sort-node.cc
@@ -58,8 +58,10 @@ Status PartialSortNode::Prepare(RuntimeState* state) {
RETURN_IF_ERROR(ExecNode::Prepare(state));
less_than_.reset(new TupleRowComparator(ordering_exprs_, is_asc_order_, nulls_first_));
sorter_.reset(new Sorter(*less_than_, sort_tuple_exprs_, &row_descriptor_,
- mem_tracker(), runtime_profile(), state, false));
+ mem_tracker(), &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+ runtime_profile(), state, id(), false));
RETURN_IF_ERROR(sorter_->Prepare(pool_, expr_mem_pool()));
+ DCHECK_GE(resource_profile_.min_reservation, sorter_->ComputeMinReservation());
AddCodegenDisabledMessage(state);
input_batch_.reset(
new RowBatch(child(0)->row_desc(), state->batch_size(), mem_tracker()));
@@ -81,6 +83,9 @@ Status PartialSortNode::Open(RuntimeState* state) {
RETURN_IF_CANCELLED(state);
RETURN_IF_ERROR(QueryMaintenance(state));
RETURN_IF_ERROR(child(0)->Open(state));
+ if (!buffer_pool_client_.is_registered()) {
+ RETURN_IF_ERROR(ClaimBufferReservation(state));
+ }
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partial-sort-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partial-sort-node.h b/be/src/exec/partial-sort-node.h
index ab4c547..d40d653 100644
--- a/be/src/exec/partial-sort-node.h
+++ b/be/src/exec/partial-sort-node.h
@@ -19,7 +19,6 @@
#define IMPALA_EXEC_PARTIAL_SORT_NODE_H
#include "exec/exec-node.h"
-#include "runtime/buffered-block-mgr.h"
#include "runtime/sorter.h"
namespace impala {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-aggregation-node-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node-ir.cc b/be/src/exec/partitioned-aggregation-node-ir.cc
index cd5d336..126a2a5 100644
--- a/be/src/exec/partitioned-aggregation-node-ir.cc
+++ b/be/src/exec/partitioned-aggregation-node-ir.cc
@@ -21,7 +21,7 @@
#include "exprs/agg-fn-evaluator.h"
#include "exprs/scalar-expr.h"
#include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
#include "runtime/row-batch.h"
#include "runtime/tuple-row.h"
@@ -46,7 +46,8 @@ Status PartitionedAggregationNode::ProcessBatch(RowBatch* batch,
// will end up to the same partition.
// TODO: Once we have a histogram with the number of rows per partition, we will have
// accurate resize calls.
- RETURN_IF_ERROR(CheckAndResizeHashPartitions(batch->num_rows(), ht_ctx));
+ RETURN_IF_ERROR(
+ CheckAndResizeHashPartitions(AGGREGATED_ROWS, batch->num_rows(), ht_ctx));
HashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache();
const int cache_size = expr_vals_cache->capacity();
@@ -108,6 +109,7 @@ Status PartitionedAggregationNode::ProcessRow(TupleRow* __restrict__ row,
// so we can try again to insert the row.
HashTable* hash_tbl = GetHashTable(partition_idx);
Partition* dst_partition = hash_partitions_[partition_idx];
+ DCHECK(dst_partition != nullptr);
DCHECK_EQ(dst_partition->is_spilled(), hash_tbl == NULL);
if (hash_tbl == NULL) {
// This partition is already spilled, just append the row.
@@ -155,24 +157,13 @@ Status PartitionedAggregationNode::AddIntermediateTuple(Partition* __restrict__
}
// We did not have enough memory to add intermediate_tuple to the stream.
- RETURN_IF_ERROR(SpillPartition());
+ RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS));
if (partition->is_spilled()) {
return AppendSpilledRow<AGGREGATED_ROWS>(partition, row);
}
}
}
-template<bool AGGREGATED_ROWS>
-Status PartitionedAggregationNode::AppendSpilledRow(Partition* __restrict__ partition,
- TupleRow* __restrict__ row) {
- DCHECK(!is_streaming_preagg_);
- DCHECK(partition->is_spilled());
- BufferedTupleStream* stream = AGGREGATED_ROWS ?
- partition->aggregated_row_stream.get() :
- partition->unaggregated_row_stream.get();
- return AppendSpilledRow(stream, row);
-}
-
Status PartitionedAggregationNode::ProcessBatchStreaming(bool needs_serialize,
TPrefetchMode::type prefetch_mode, RowBatch* in_batch, RowBatch* out_batch,
HashTableCtx* __restrict__ ht_ctx, int remaining_capacity[PARTITION_FANOUT]) {
@@ -230,6 +221,7 @@ bool PartitionedAggregationNode::TryAddToHashTable(
DCHECK(remaining_capacity != NULL);
DCHECK_EQ(hash_tbl, partition->hash_tbl.get());
DCHECK_GE(*remaining_capacity, 0);
+ if (hash_tbl == nullptr) return false; // Hash table was not created - pass through.
bool found;
// This is called from ProcessBatchStreaming() so the rows are not aggregated.
HashTable::Iterator it = hash_tbl->FindBuildRowBucket(ht_ctx, &found);
[09/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-builder.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder.h b/be/src/exec/partitioned-hash-join-builder.h
index e0393b5..912613d 100644
--- a/be/src/exec/partitioned-hash-join-builder.h
+++ b/be/src/exec/partitioned-hash-join-builder.h
@@ -26,8 +26,9 @@
#include "exec/data-sink.h"
#include "exec/filter-context.h"
#include "exec/hash-table.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.h"
+#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/bufferpool/buffer-pool.h"
+#include "runtime/bufferpool/suballocator.h"
#include "gen-cpp/PlanNodes_types.h"
@@ -56,7 +57,7 @@ class ScalarExprEvaluator;
/// RepartitionBuildInput() to repartition a level n partition into multiple level n + 1
/// partitions.
///
-/// Both the PartitionedHashJoinNode and the builder share a BufferedBlockMgr client
+/// Both the PartitionedHashJoinNode and the builder share a BufferPool client
/// and the corresponding reservations. Different stages of the spilling algorithm
/// require different mixes of build and probe buffers and hash tables, so we can
/// share the reservation to minimize the combined memory requirement. Initial probe-side
@@ -72,7 +73,8 @@ class PhjBuilder : public DataSink {
class Partition;
PhjBuilder(int join_node_id, TJoinOp::type join_op, const RowDescriptor* probe_row_desc,
- const RowDescriptor* build_row_desc, RuntimeState* state);
+ const RowDescriptor* build_row_desc, RuntimeState* state,
+ BufferPool::ClientHandle* buffer_pool_client, int64_t spillable_buffer_size);
Status InitExprsAndFilters(RuntimeState* state,
const std::vector<TEqJoinCondition>& eq_join_conjuncts,
@@ -101,7 +103,7 @@ class PhjBuilder : public DataSink {
/// Transfer ownership of the probe streams to the caller. One stream was allocated per
/// spilled partition in FlushFinal(). The probe streams are empty but prepared for
/// writing with a write buffer allocated.
- std::vector<std::unique_ptr<BufferedTupleStream>> TransferProbeStreams();
+ std::vector<std::unique_ptr<BufferedTupleStreamV2>> TransferProbeStreams();
/// Clears the current list of hash partitions. Called after probing of the partitions
/// is done. The partitions are not closed or destroyed, since they may be spilled or
@@ -122,7 +124,7 @@ class PhjBuilder : public DataSink {
/// 'input_probe_rows' for reading in "delete_on_read" mode, so that the probe phase
/// has enough buffers preallocated to execute successfully.
Status RepartitionBuildInput(Partition* input_partition, int level,
- BufferedTupleStream* input_probe_rows) WARN_UNUSED_RESULT;
+ BufferedTupleStreamV2* input_probe_rows) WARN_UNUSED_RESULT;
/// Returns the largest build row count out of the current hash partitions.
int64_t LargestPartitionRows() const;
@@ -132,7 +134,6 @@ class PhjBuilder : public DataSink {
bool HashTableStoresNulls() const;
/// Accessor functions, mainly required to expose state to PartitionedHashJoinNode.
- inline BufferedBlockMgr::Client* block_mgr_client() const { return block_mgr_client_; }
inline bool non_empty_build() const { return non_empty_build_; }
inline const std::vector<bool>& is_not_distinct_from() const {
return is_not_distinct_from_;
@@ -200,24 +201,27 @@ class PhjBuilder : public DataSink {
/// Spills this partition, the partition's stream is unpinned with 'mode' and
/// its hash table is destroyed if it was built.
- Status Spill(BufferedTupleStream::UnpinMode mode) WARN_UNUSED_RESULT;
+ Status Spill(BufferedTupleStreamV2::UnpinMode mode) WARN_UNUSED_RESULT;
bool ALWAYS_INLINE IsClosed() const { return build_rows_ == NULL; }
- BufferedTupleStream* ALWAYS_INLINE build_rows() { return build_rows_.get(); }
+ BufferedTupleStreamV2* ALWAYS_INLINE build_rows() { return build_rows_.get(); }
HashTable* ALWAYS_INLINE hash_tbl() const { return hash_tbl_.get(); }
bool ALWAYS_INLINE is_spilled() const { return is_spilled_; }
int ALWAYS_INLINE level() const { return level_; }
private:
- /// Inserts each row in 'batch' into 'hash_tbl_' using 'ctx'. 'indices' is an array
- /// containing the index of each row's index into the hash table's tuple stream.
+ /// Inserts each row in 'batch' into 'hash_tbl_' using 'ctx'. 'flat_rows' is an array
+ /// containing the rows in the hash table's tuple stream.
/// 'prefetch_mode' is the prefetching mode in use. If it's not PREFETCH_NONE, hash
/// table buckets which the rows hashes to will be prefetched. This parameter is
/// replaced with a constant during codegen time. This function may be replaced with
/// a codegen'd version. Returns true if all rows in 'batch' are successfully
- /// inserted.
+ /// inserted and false otherwise. If inserting failed, 'status' indicates why it
+ /// failed: if 'status' is ok, inserting failed because not enough reservation
+ /// was available and if 'status' is an error, inserting failed because of that error.
bool InsertBatch(TPrefetchMode::type prefetch_mode, HashTableCtx* ctx,
- RowBatch* batch, const std::vector<BufferedTupleStream::RowIdx>& indices);
+ RowBatch* batch, const std::vector<BufferedTupleStreamV2::FlatRowPtr>& flat_rows,
+ Status* status);
const PhjBuilder* parent_;
@@ -235,16 +239,9 @@ class PhjBuilder : public DataSink {
/// Stream of build tuples in this partition. Initially owned by this object but
/// transferred to the parent exec node (via the row batch) when the partition
/// is closed. If NULL, ownership has been transferred and the partition is closed.
- std::unique_ptr<BufferedTupleStream> build_rows_;
+ std::unique_ptr<BufferedTupleStreamV2> build_rows_;
};
- protected:
- /// Init() function inherited from DataSink. Overridden to be a no-op for now.
- /// TODO: Merge with InitExprsAndFilters() once this class becomes a true data sink.
- virtual Status Init(const std::vector<TExpr>& thrift_output_exprs,
- const TDataSink& tsink, RuntimeState* state) override;
-
- private:
/// Computes the minimum number of buffers required to execute the spilling partitioned
/// hash algorithm successfully for any input size (assuming enough disk space is
/// available for spilled rows). The buffers are used for buffering both build and
@@ -255,15 +252,22 @@ class PhjBuilder : public DataSink {
/// For NAAJ, we need 3 additional buffers for 'null_aware_partition_',
/// 'null_aware_probe_partition_' and 'null_probe_rows_'.
int MinRequiredBuffers() const {
- // Must be kept in sync with HashJoinNode.computeResourceProfile() in fe.
+ // Must be kept in sync with HashJoinNode.computeNodeResourceProfile() in fe.
int num_reserved_buffers = PARTITION_FANOUT + 1;
if (join_op_ == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) num_reserved_buffers += 3;
return num_reserved_buffers;
}
+ protected:
+ /// Init() function inherited from DataSink. Overridden to be a no-op for now.
+ /// TODO: Merge with InitExprsAndFilters() once this class becomes a true data sink.
+ virtual Status Init(const std::vector<TExpr>& thrift_output_exprs,
+ const TDataSink& tsink, RuntimeState* state) override;
+
/// Free local allocations made from expr evaluators during hash table construction.
void FreeLocalAllocations() const;
+ private:
/// Create and initialize a set of hash partitions for partitioning level 'level'.
/// The previous hash partitions must have been cleared with ClearHashPartitions().
/// After calling this, batches are added to the new partitions by calling Send().
@@ -284,19 +288,19 @@ class PhjBuilder : public DataSink {
/// partitions. This odd return convention is used to avoid emitting unnecessary code
/// for ~Status in perf-critical code.
bool AppendRow(
- BufferedTupleStream* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
+ BufferedTupleStreamV2* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
/// Slow path for AppendRow() above. It is called when the stream has failed to append
/// the row. We need to find more memory by either switching to IO-buffers, in case the
/// stream still uses small buffers, or spilling a partition. Returns false and sets
/// 'status' if it was unable to append the row, even after spilling partitions.
- bool AppendRowStreamFull(BufferedTupleStream* stream, TupleRow* row,
+ bool AppendRowStreamFull(BufferedTupleStreamV2* stream, TupleRow* row,
Status* status) noexcept WARN_UNUSED_RESULT;
/// Frees memory by spilling one of the hash partitions. The 'mode' argument is passed
/// to the Spill() call for the selected partition. The current policy is to spill the
/// largest partition. Returns non-ok status if we couldn't spill a partition.
- Status SpillPartition(BufferedTupleStream::UnpinMode mode) WARN_UNUSED_RESULT;
+ Status SpillPartition(BufferedTupleStreamV2::UnpinMode mode) WARN_UNUSED_RESULT;
/// Tries to build hash tables for all unspilled hash partitions. Called after
/// FlushFinal() when all build rows have been partitioned and added to the appropriate
@@ -358,14 +362,20 @@ class PhjBuilder : public DataSink {
/// Pool for objects with same lifetime as builder.
ObjectPool pool_;
- /// Client to the buffered block mgr, used to allocate build partition buffers and hash
- /// tables. When probing, the spilling algorithm keeps some build partitions in memory
- /// while using memory for probe buffers for spilled partitions. To support dynamically
- /// dividing memory between build and probe, this client is owned by the builder but
- /// shared with the PartitionedHashJoinNode.
+ /// Client to the buffer pool, used to allocate build partition buffers and hash tables.
+ /// When probing, the spilling algorithm keeps some build partitions in memory while
+ /// using memory for probe buffers for spilled partitions. To support dynamically
+ /// dividing memory between build and probe, this client is shared between the builder
+ /// and the PartitionedHashJoinNode.
/// TODO: this approach to sharing will not work for spilling broadcast joins with a
/// 1:N relationship from builders to join nodes.
- BufferedBlockMgr::Client* block_mgr_client_;
+ BufferPool::ClientHandle* buffer_pool_client_;
+
+ /// The size of buffers to use in the build and probe streams.
+ const int64_t spillable_buffer_size_;
+
+ /// Allocator for hash table memory.
+ boost::scoped_ptr<Suballocator> ht_allocator_;
/// If true, the build side has at least one row.
bool non_empty_build_;
@@ -454,7 +464,7 @@ class PhjBuilder : public DataSink {
///
/// Because of this, at the end of the build phase, we always have sufficient memory
/// to execute the probe phase of the algorithm without spilling more partitions.
- std::vector<std::unique_ptr<BufferedTupleStream>> spilled_partition_probe_streams_;
+ std::vector<std::unique_ptr<BufferedTupleStreamV2>> spilled_partition_probe_streams_;
/// END: Members that must be Reset()
/////////////////////////////////////////
@@ -469,7 +479,7 @@ class PhjBuilder : public DataSink {
ProcessBuildBatchFn process_build_batch_fn_level0_;
typedef bool (*InsertBatchFn)(Partition*, TPrefetchMode::type, HashTableCtx*, RowBatch*,
- const std::vector<BufferedTupleStream::RowIdx>&);
+ const std::vector<BufferedTupleStreamV2::FlatRowPtr>&, Status*);
/// Jitted Partition::InsertBatch() function pointers. NULL if codegen is disabled.
InsertBatchFn insert_batch_fn_;
InsertBatchFn insert_batch_fn_level0_;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-node-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node-ir.cc b/be/src/exec/partitioned-hash-join-node-ir.cc
index 2c951d1..b890eb9 100644
--- a/be/src/exec/partitioned-hash-join-node-ir.cc
+++ b/be/src/exec/partitioned-hash-join-node-ir.cc
@@ -313,7 +313,7 @@ bool IR_ALWAYS_INLINE PartitionedHashJoinNode::NextProbeRow(
// The partition is not in memory, spill the probe row and move to the next row.
// Skip the current row if we manage to append to the spilled partition's BTS.
// Otherwise, we need to bail out and report the failure.
- BufferedTupleStream* probe_rows = probe_partition->probe_rows();
+ BufferedTupleStreamV2* probe_rows = probe_partition->probe_rows();
if (UNLIKELY(!AppendProbeRow(probe_rows, current_probe_row_, status))) {
DCHECK(!status->ok());
return false;
@@ -438,9 +438,8 @@ int PartitionedHashJoinNode::ProcessProbeBatch(TPrefetchMode::type prefetch_mode
}
inline bool PartitionedHashJoinNode::AppendProbeRow(
- BufferedTupleStream* stream, TupleRow* row, Status* status) {
- DCHECK(stream->has_write_block());
- DCHECK(!stream->using_small_buffers());
+ BufferedTupleStreamV2* stream, TupleRow* row, Status* status) {
+ DCHECK(stream->has_write_iterator());
DCHECK(!stream->is_pinned());
return stream->AddRow(row, status);
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.cc b/be/src/exec/partitioned-hash-join-node.cc
index a5c9897..2db9e00 100644
--- a/be/src/exec/partitioned-hash-join-node.cc
+++ b/be/src/exec/partitioned-hash-join-node.cc
@@ -27,8 +27,7 @@
#include "exprs/scalar-expr.h"
#include "exprs/scalar-expr-evaluator.h"
#include "exprs/slot-ref.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
#include "runtime/mem-tracker.h"
#include "runtime/row-batch.h"
#include "runtime/runtime-state.h"
@@ -47,9 +46,15 @@ static const string PREPARE_FOR_READ_FAILED_ERROR_MSG =
"successfully.";
using namespace impala;
-using namespace llvm;
-using namespace strings;
-using std::unique_ptr;
+using llvm::BasicBlock;
+using llvm::ConstantInt;
+using llvm::Function;
+using llvm::GlobalValue;
+using llvm::LLVMContext;
+using llvm::PointerType;
+using llvm::Type;
+using llvm::Value;
+using strings::Substitute;
PartitionedHashJoinNode::PartitionedHashJoinNode(
ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs)
@@ -77,8 +82,9 @@ Status PartitionedHashJoinNode::Init(const TPlanNode& tnode, RuntimeState* state
// TODO: allow PhjBuilder to be the sink of a separate fragment. For now, PhjBuilder is
// owned by this node, but duplicates some state (exprs, etc) in anticipation of it
// being separated out further.
- builder_.reset(
- new PhjBuilder(id(), join_op_, child(0)->row_desc(), child(1)->row_desc(), state));
+ builder_.reset(new PhjBuilder(id(), join_op_, child(0)->row_desc(),
+ child(1)->row_desc(), state, &buffer_pool_client_,
+ resource_profile_.spillable_buffer_size));
RETURN_IF_ERROR(
builder_->InitExprsAndFilters(state, eq_join_conjuncts, tnode.runtime_filters));
@@ -177,6 +183,11 @@ Status PartitionedHashJoinNode::Open(RuntimeState* state) {
}
Status PartitionedHashJoinNode::AcquireResourcesForBuild(RuntimeState* state) {
+ DCHECK_GE(resource_profile_.min_reservation,
+ resource_profile_.spillable_buffer_size * builder_->MinRequiredBuffers());
+ if (!buffer_pool_client_.is_registered()) {
+ RETURN_IF_ERROR(ClaimBufferReservation(state));
+ }
if (join_op_ == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) {
// Initialize these partitions before doing the build so that the build does not
// use the reservation intended for them.
@@ -254,12 +265,10 @@ void PartitionedHashJoinNode::Close(RuntimeState* state) {
PartitionedHashJoinNode::ProbePartition::ProbePartition(RuntimeState* state,
PartitionedHashJoinNode* parent, PhjBuilder::Partition* build_partition,
- unique_ptr<BufferedTupleStream> probe_rows)
- : parent_(parent),
- build_partition_(build_partition),
+ unique_ptr<BufferedTupleStreamV2> probe_rows)
+ : build_partition_(build_partition),
probe_rows_(std::move(probe_rows)) {
- DCHECK(probe_rows_->has_write_block());
- DCHECK(!probe_rows_->using_small_buffers());
+ DCHECK(probe_rows_->has_write_iterator());
DCHECK(!probe_rows_->is_pinned());
}
@@ -270,10 +279,7 @@ PartitionedHashJoinNode::ProbePartition::~ProbePartition() {
Status PartitionedHashJoinNode::ProbePartition::PrepareForRead() {
bool got_read_buffer;
RETURN_IF_ERROR(probe_rows_->PrepareForRead(true, &got_read_buffer));
- if (!got_read_buffer) {
- return parent_->mem_tracker()->MemLimitExceeded(parent_->runtime_state_,
- Substitute(PREPARE_FOR_READ_FAILED_ERROR_MSG, parent_->id_));
- }
+ DCHECK(got_read_buffer) << "Accounted in min reservation";
return Status::OK();
}
@@ -322,7 +328,7 @@ Status PartitionedHashJoinNode::NextSpilledProbeRowBatch(
probe_batch_pos_ = -1;
return Status::OK();
}
- BufferedTupleStream* probe_rows = input_partition_->probe_rows();
+ BufferedTupleStreamV2* probe_rows = input_partition_->probe_rows();
if (LIKELY(probe_rows->rows_returned() < probe_rows->num_rows())) {
// Continue from the current probe stream.
bool eos = false;
@@ -414,12 +420,11 @@ Status PartitionedHashJoinNode::PrepareSpilledPartitionForProbe(
ht_ctx_->set_level(next_partition_level);
// Spill to free memory from hash tables and pinned streams for use in new partitions.
- RETURN_IF_ERROR(build_partition->Spill(BufferedTupleStream::UNPIN_ALL));
+ RETURN_IF_ERROR(build_partition->Spill(BufferedTupleStreamV2::UNPIN_ALL));
// Temporarily free up the probe buffer to use when repartitioning.
- RETURN_IF_ERROR(
- input_partition_->probe_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL));
- DCHECK_EQ(build_partition->build_rows()->blocks_pinned(), 0) << NodeDebugString();
- DCHECK_EQ(input_partition_->probe_rows()->blocks_pinned(), 0) << NodeDebugString();
+ input_partition_->probe_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+ DCHECK_EQ(build_partition->build_rows()->BytesPinned(false), 0) << NodeDebugString();
+ DCHECK_EQ(input_partition_->probe_rows()->BytesPinned(false), 0) << NodeDebugString();
int64_t num_input_rows = build_partition->build_rows()->num_rows();
RETURN_IF_ERROR(builder_->RepartitionBuildInput(
build_partition, next_partition_level, input_partition_->probe_rows()));
@@ -430,7 +435,8 @@ Status PartitionedHashJoinNode::PrepareSpilledPartitionForProbe(
"more rows than the input";
if (UNLIKELY(num_input_rows == largest_partition_rows)) {
return Status(TErrorCode::PARTITIONED_HASH_JOIN_REPARTITION_FAILS, id_,
- next_partition_level, num_input_rows);
+ next_partition_level, num_input_rows, NodeDebugString(),
+ buffer_pool_client_.DebugString());
}
RETURN_IF_ERROR(PrepareForProbe());
@@ -816,18 +822,18 @@ static Status NullAwareAntiJoinError(bool build) {
Status PartitionedHashJoinNode::InitNullAwareProbePartition() {
RuntimeState* state = runtime_state_;
- unique_ptr<BufferedTupleStream> probe_rows = std::make_unique<BufferedTupleStream>(
- state, child(0)->row_desc(), state->block_mgr(), builder_->block_mgr_client(),
- false /* use_initial_small_buffers */, false /* read_write */);
- Status status = probe_rows->Init(id(), runtime_profile(), false);
+ unique_ptr<BufferedTupleStreamV2> probe_rows = make_unique<BufferedTupleStreamV2>(
+ state, child(0)->row_desc(), &buffer_pool_client_,
+ resource_profile_.spillable_buffer_size,
+ resource_profile_.spillable_buffer_size);
+ // TODO: this should be pinned if spilling is disabled.
+ Status status = probe_rows->Init(id(), false);
if (!status.ok()) goto error;
bool got_buffer;
status = probe_rows->PrepareForWrite(&got_buffer);
if (!status.ok()) goto error;
- if (!got_buffer) {
- status = state->block_mgr()->MemLimitTooLowError(builder_->block_mgr_client(), id());
- goto error;
- }
+ DCHECK(got_buffer)
+ << "Accounted in min reservation" << buffer_pool_client_.DebugString();
null_aware_probe_partition_.reset(new ProbePartition(
state, this, builder_->null_aware_partition(), std::move(probe_rows)));
return Status::OK();
@@ -841,15 +847,15 @@ error:
Status PartitionedHashJoinNode::InitNullProbeRows() {
RuntimeState* state = runtime_state_;
- null_probe_rows_ = std::make_unique<BufferedTupleStream>(state, child(0)->row_desc(),
- state->block_mgr(), builder_->block_mgr_client(),
- false /* use_initial_small_buffers */, false /* read_write */);
- RETURN_IF_ERROR(null_probe_rows_->Init(id(), runtime_profile(), false));
+ null_probe_rows_ = make_unique<BufferedTupleStreamV2>(state, child(0)->row_desc(),
+ &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+ resource_profile_.spillable_buffer_size);
+ // TODO: we shouldn't start with this unpinned if spilling is disabled.
+ RETURN_IF_ERROR(null_probe_rows_->Init(id(), false));
bool got_buffer;
RETURN_IF_ERROR(null_probe_rows_->PrepareForWrite(&got_buffer));
- if (!got_buffer) {
- return state->block_mgr()->MemLimitTooLowError(builder_->block_mgr_client(), id());
- }
+ DCHECK(got_buffer)
+ << "Accounted in min reservation" << buffer_pool_client_.DebugString();
return Status::OK();
}
@@ -860,8 +866,8 @@ Status PartitionedHashJoinNode::PrepareNullAwarePartition() {
DCHECK_EQ(probe_batch_pos_, -1);
DCHECK_EQ(probe_batch_->num_rows(), 0);
- BufferedTupleStream* build_stream = builder_->null_aware_partition()->build_rows();
- BufferedTupleStream* probe_stream = null_aware_probe_partition_->probe_rows();
+ BufferedTupleStreamV2* build_stream = builder_->null_aware_partition()->build_rows();
+ BufferedTupleStreamV2* probe_stream = null_aware_probe_partition_->probe_rows();
if (build_stream->num_rows() == 0) {
// There were no build rows. Nothing to do. Just prepare to output the null
@@ -874,7 +880,7 @@ Status PartitionedHashJoinNode::PrepareNullAwarePartition() {
// Bring the entire spilled build stream into memory and read into a single batch.
bool got_rows;
- RETURN_IF_ERROR(build_stream->GetRows(&nulls_build_batch_, &got_rows));
+ RETURN_IF_ERROR(build_stream->GetRows(mem_tracker(), &nulls_build_batch_, &got_rows));
if (!got_rows) return NullAwareAntiJoinError(true);
// Initialize the streams for read.
@@ -898,7 +904,7 @@ Status PartitionedHashJoinNode::OutputNullAwareProbeRows(RuntimeState* state,
int num_join_conjuncts = other_join_conjuncts_.size();
DCHECK(probe_batch_ != NULL);
- BufferedTupleStream* probe_stream = null_aware_probe_partition_->probe_rows();
+ BufferedTupleStreamV2* probe_stream = null_aware_probe_partition_->probe_rows();
if (probe_batch_pos_ == probe_batch_->num_rows()) {
probe_batch_pos_ = 0;
probe_batch_->TransferResourceOwnership(out_batch);
@@ -946,7 +952,8 @@ Status PartitionedHashJoinNode::PrepareForProbe() {
DCHECK(probe_hash_partitions_.empty());
// Initialize the probe partitions, providing them with probe streams.
- vector<unique_ptr<BufferedTupleStream>> probe_streams = builder_->TransferProbeStreams();
+ vector<unique_ptr<BufferedTupleStreamV2>> probe_streams =
+ builder_->TransferProbeStreams();
probe_hash_partitions_.resize(PARTITION_FANOUT);
for (int i = 0; i < PARTITION_FANOUT; ++i) {
PhjBuilder::Partition* build_partition = builder_->hash_partition(i);
@@ -982,16 +989,16 @@ Status PartitionedHashJoinNode::PrepareForProbe() {
}
void PartitionedHashJoinNode::CreateProbePartition(
- int partition_idx, unique_ptr<BufferedTupleStream> probe_rows) {
+ int partition_idx, unique_ptr<BufferedTupleStreamV2> probe_rows) {
DCHECK_GE(partition_idx, 0);
DCHECK_LT(partition_idx, probe_hash_partitions_.size());
DCHECK(probe_hash_partitions_[partition_idx] == NULL);
- probe_hash_partitions_[partition_idx] = std::make_unique<ProbePartition>(runtime_state_,
+ probe_hash_partitions_[partition_idx] = make_unique<ProbePartition>(runtime_state_,
this, builder_->hash_partition(partition_idx), std::move(probe_rows));
}
Status PartitionedHashJoinNode::EvaluateNullProbe(
- RuntimeState* state, BufferedTupleStream* build) {
+ RuntimeState* state, BufferedTupleStreamV2* build) {
if (null_probe_rows_ == NULL || null_probe_rows_->num_rows() == 0) {
return Status::OK();
}
@@ -1000,10 +1007,10 @@ Status PartitionedHashJoinNode::EvaluateNullProbe(
// Bring both the build and probe side into memory and do a pairwise evaluation.
bool got_rows;
scoped_ptr<RowBatch> build_rows;
- RETURN_IF_ERROR(build->GetRows(&build_rows, &got_rows));
+ RETURN_IF_ERROR(build->GetRows(mem_tracker(), &build_rows, &got_rows));
if (!got_rows) return NullAwareAntiJoinError(true);
scoped_ptr<RowBatch> probe_rows;
- RETURN_IF_ERROR(null_probe_rows_->GetRows(&probe_rows, &got_rows));
+ RETURN_IF_ERROR(null_probe_rows_->GetRows(mem_tracker(), &probe_rows, &got_rows));
if (!got_rows) return NullAwareAntiJoinError(false);
ScalarExprEvaluator* const* join_conjunct_evals = other_join_conjunct_evals_.data();
@@ -1060,11 +1067,9 @@ Status PartitionedHashJoinNode::CleanUpHashPartitions(
// can recurse the algorithm and create new hash partitions from spilled partitions.
// TODO: we shouldn't need to unpin the build stream if we stop spilling
// while probing.
- RETURN_IF_ERROR(
- build_partition->build_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL));
- DCHECK_EQ(build_partition->build_rows()->blocks_pinned(), 0);
- RETURN_IF_ERROR(
- probe_partition->probe_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL));
+ build_partition->build_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+ DCHECK_EQ(build_partition->build_rows()->BytesPinned(false), 0);
+ probe_partition->probe_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
if (probe_partition->probe_rows()->num_rows() != 0
|| NeedToProcessUnmatchedBuildRows()) {
@@ -1102,9 +1107,9 @@ Status PartitionedHashJoinNode::CleanUpHashPartitions(
// Just finished evaluating the null probe rows with all the non-spilled build
// partitions. Unpin this now to free this memory for repartitioning.
- if (null_probe_rows_ != NULL)
- RETURN_IF_ERROR(
- null_probe_rows_->UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
+ if (null_probe_rows_ != NULL) {
+ null_probe_rows_->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
+ }
builder_->ClearHashPartitions();
probe_hash_partitions_.clear();
@@ -1165,10 +1170,10 @@ string PartitionedHashJoinNode::NodeDebugString() const {
ss << " Probe hash partition " << i << ": ";
if (probe_partition != NULL) {
ss << "probe ptr=" << probe_partition;
- BufferedTupleStream* probe_rows = probe_partition->probe_rows();
+ BufferedTupleStreamV2* probe_rows = probe_partition->probe_rows();
if (probe_rows != NULL) {
- ss << " Probe Rows: " << probe_rows->num_rows()
- << " (Blocks pinned: " << probe_rows->blocks_pinned() << ")";
+ ss << " Probe Rows: " << probe_rows->num_rows()
+ << " (Bytes pinned: " << probe_rows->BytesPinned(false) << ")";
}
}
ss << endl;
@@ -1189,12 +1194,15 @@ string PartitionedHashJoinNode::NodeDebugString() const {
}
}
if (input_partition_ != NULL) {
- DCHECK(input_partition_->build_partition()->build_rows() != NULL);
DCHECK(input_partition_->probe_rows() != NULL);
- ss << "InputPartition: " << input_partition_.get() << endl
- << " Spilled Build Rows: "
- << input_partition_->build_partition()->build_rows()->num_rows() << endl
- << " Spilled Probe Rows: " << input_partition_->probe_rows()->num_rows() << endl;
+ ss << "InputPartition: " << input_partition_.get() << endl;
+ PhjBuilder::Partition* build_partition = input_partition_->build_partition();
+ if (build_partition->IsClosed()) {
+ ss << " Build Partition Closed" << endl;
+ } else {
+ ss << " Build Rows: " << build_partition->build_rows()->num_rows() << endl;
+ }
+ ss << " Probe Rows: " << input_partition_->probe_rows()->num_rows() << endl;
} else {
ss << "InputPartition: NULL" << endl;
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.h b/be/src/exec/partitioned-hash-join-node.h
index 73e0dd5..b3f663e 100644
--- a/be/src/exec/partitioned-hash-join-node.h
+++ b/be/src/exec/partitioned-hash-join-node.h
@@ -15,28 +15,24 @@
// specific language governing permissions and limitations
// under the License.
-
#ifndef IMPALA_EXEC_PARTITIONED_HASH_JOIN_NODE_H
#define IMPALA_EXEC_PARTITIONED_HASH_JOIN_NODE_H
-#include <boost/scoped_ptr.hpp>
-#include <boost/thread.hpp>
#include <list>
#include <memory>
#include <string>
+#include <boost/scoped_ptr.hpp>
+#include <boost/thread.hpp>
#include "exec/blocking-join-node.h"
#include "exec/exec-node.h"
#include "exec/partitioned-hash-join-builder.h"
-#include "runtime/buffered-block-mgr.h"
#include "gen-cpp/Types_types.h"
namespace impala {
class BloomFilter;
-class BufferedBlockMgr;
-class BufferedTupleStream;
class MemPool;
class RowBatch;
class RuntimeFilter;
@@ -100,8 +96,6 @@ class TupleRow;
/// NULLs into several different streams, which are processed in a separate step to
/// produce additional output rows. The NAAJ algorithm is documented in more detail in
/// header comments for the null aware functions and data structures.
-///
-/// TODO: don't copy tuple rows so often.
class PartitionedHashJoinNode : public BlockingJoinNode {
public:
PartitionedHashJoinNode(ObjectPool* pool, const TPlanNode& tnode,
@@ -168,7 +162,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
/// Creates an initialized probe partition at 'partition_idx' in
/// 'probe_hash_partitions_'.
void CreateProbePartition(
- int partition_idx, std::unique_ptr<BufferedTupleStream> probe_rows);
+ int partition_idx, std::unique_ptr<BufferedTupleStreamV2> probe_rows);
/// Append the probe row 'row' to 'stream'. The stream must be unpinned and must have
/// a write buffer allocated, so this will succeed unless an error is encountered.
@@ -176,7 +170,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
/// return convention is used to avoid emitting unnecessary code for ~Status in perf-
/// critical code.
bool AppendProbeRow(
- BufferedTupleStream* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
+ BufferedTupleStreamV2* stream, TupleRow* row, Status* status) WARN_UNUSED_RESULT;
/// Probes the hash table for rows matching the current probe row and appends
/// all the matching build rows (with probe row) to output batch. Returns true
@@ -331,7 +325,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
/// conjuncts pass (i.e. there is a match).
/// This is used for NAAJ, when there are NULL probe rows.
Status EvaluateNullProbe(
- RuntimeState* state, BufferedTupleStream* build) WARN_UNUSED_RESULT;
+ RuntimeState* state, BufferedTupleStreamV2* build) WARN_UNUSED_RESULT;
/// Prepares to output NULLs on the probe side for NAAJ. Before calling this,
/// matched_null_probe_ should have been fully evaluated.
@@ -478,7 +472,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
/// For NAAJ, this stream contains all probe rows that had NULL on the hash table
/// conjuncts. Must be unique_ptr so we can release it and transfer to output batches.
- std::unique_ptr<BufferedTupleStream> null_probe_rows_;
+ std::unique_ptr<BufferedTupleStreamV2> null_probe_rows_;
/// For each row in null_probe_rows_, true if this row has matched any build row
/// (i.e. the resulting joined row passes other_join_conjuncts).
@@ -510,7 +504,7 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
/// that has been prepared for writing with an I/O-sized write buffer.
ProbePartition(RuntimeState* state, PartitionedHashJoinNode* parent,
PhjBuilder::Partition* build_partition,
- std::unique_ptr<BufferedTupleStream> probe_rows);
+ std::unique_ptr<BufferedTupleStreamV2> probe_rows);
~ProbePartition();
/// Prepare to read the probe rows. Allocates the first read block, so reads will
@@ -523,21 +517,19 @@ class PartitionedHashJoinNode : public BlockingJoinNode {
/// resources if 'batch' is NULL. Idempotent.
void Close(RowBatch* batch);
- BufferedTupleStream* ALWAYS_INLINE probe_rows() { return probe_rows_.get(); }
+ BufferedTupleStreamV2* ALWAYS_INLINE probe_rows() { return probe_rows_.get(); }
PhjBuilder::Partition* build_partition() { return build_partition_; }
inline bool IsClosed() const { return probe_rows_ == NULL; }
private:
- PartitionedHashJoinNode* parent_;
-
/// The corresponding build partition. Not NULL. Owned by PhjBuilder.
PhjBuilder::Partition* build_partition_;
/// Stream of probe tuples in this partition. Initially owned by this object but
/// transferred to the parent exec node (via the row batch) when the partition
/// is complete. If NULL, ownership was transferred and the partition is closed.
- std::unique_ptr<BufferedTupleStream> probe_rows_;
+ std::unique_ptr<BufferedTupleStreamV2> probe_rows_;
};
/// For the below codegen'd functions, xxx_fn_level0_ uses CRC hashing when available
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-node.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.inline.h b/be/src/exec/partitioned-hash-join-node.inline.h
index a53b40e..3441aac 100644
--- a/be/src/exec/partitioned-hash-join-node.inline.h
+++ b/be/src/exec/partitioned-hash-join-node.inline.h
@@ -20,7 +20,7 @@
#include "exec/partitioned-hash-join-node.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
namespace impala {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/sort-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/sort-node.cc b/be/src/exec/sort-node.cc
index fd42124..440f809 100644
--- a/be/src/exec/sort-node.cc
+++ b/be/src/exec/sort-node.cc
@@ -52,9 +52,12 @@ Status SortNode::Prepare(RuntimeState* state) {
SCOPED_TIMER(runtime_profile_->total_time_counter());
RETURN_IF_ERROR(ExecNode::Prepare(state));
less_than_.reset(new TupleRowComparator(ordering_exprs_, is_asc_order_, nulls_first_));
- sorter_.reset(new Sorter(*less_than_, sort_tuple_exprs_,
- &row_descriptor_, mem_tracker(), runtime_profile(), state));
+ sorter_.reset(
+ new Sorter(*less_than_, sort_tuple_exprs_, &row_descriptor_, mem_tracker(),
+ &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+ runtime_profile(), state, id(), true));
RETURN_IF_ERROR(sorter_->Prepare(pool_, expr_mem_pool()));
+ DCHECK_GE(resource_profile_.min_reservation, sorter_->ComputeMinReservation());
AddCodegenDisabledMessage(state);
return Status::OK();
}
@@ -69,9 +72,13 @@ void SortNode::Codegen(RuntimeState* state) {
Status SortNode::Open(RuntimeState* state) {
SCOPED_TIMER(runtime_profile_->total_time_counter());
- // Open the child before consuming resources in this node.
- RETURN_IF_ERROR(child(0)->Open(state));
RETURN_IF_ERROR(ExecNode::Open(state));
+ RETURN_IF_ERROR(child(0)->Open(state));
+ // Claim reservation after the child has been opened to reduce the peak reservation
+ // requirement.
+ if (!buffer_pool_client_.is_registered()) {
+ RETURN_IF_ERROR(ClaimBufferReservation(state));
+ }
RETURN_IF_ERROR(less_than_->Open(pool_, state, expr_mem_pool()));
RETURN_IF_ERROR(sorter_->Open());
RETURN_IF_CANCELLED(state);
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/sort-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/sort-node.h b/be/src/exec/sort-node.h
index 8b3de11..a11d424 100644
--- a/be/src/exec/sort-node.h
+++ b/be/src/exec/sort-node.h
@@ -20,13 +20,12 @@
#include "exec/exec-node.h"
#include "runtime/sorter.h"
-#include "runtime/buffered-block-mgr.h"
namespace impala {
/// Node that implements a full sort of its input with a fixed memory budget, spilling
/// to disk if the input is larger than available memory.
-/// Uses Sorter and BufferedBlockMgr for the external sort implementation.
+/// Uses Sorter for the external sort implementation.
/// Input rows to SortNode are materialized by the Sorter into a single tuple
/// using the expressions specified in sort_tuple_exprs_.
/// In GetNext(), SortNode passes in the output batch to the sorter instance created
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index 2de0f2e..92af968 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -24,8 +24,6 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime")
set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime")
add_library(Runtime
- buffered-block-mgr.cc
- buffered-tuple-stream.cc
buffered-tuple-stream-v2.cc
client-cache.cc
coordinator.cc
@@ -45,6 +43,7 @@ add_library(Runtime
hbase-table.cc
hbase-table-factory.cc
hdfs-fs-cache.cc
+ initial-reservations.cc
lib-cache.cc
mem-tracker.cc
mem-pool.cc
@@ -83,7 +82,6 @@ ADD_BE_TEST(string-buffer-test)
ADD_BE_TEST(data-stream-test)
ADD_BE_TEST(timestamp-test)
ADD_BE_TEST(disk-io-mgr-test)
-ADD_BE_TEST(buffered-block-mgr-test)
ADD_BE_TEST(parallel-executor-test)
ADD_BE_TEST(raw-value-test)
ADD_BE_TEST(string-compare-test)
@@ -93,7 +91,6 @@ ADD_BE_TEST(thread-resource-mgr-test)
ADD_BE_TEST(mem-tracker-test)
ADD_BE_TEST(multi-precision-test)
ADD_BE_TEST(decimal-test)
-ADD_BE_TEST(buffered-tuple-stream-test)
ADD_BE_TEST(buffered-tuple-stream-v2-test)
ADD_BE_TEST(hdfs-fs-cache-test)
ADD_BE_TEST(tmp-file-mgr-test)
[02/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test b/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
index 8e8ddc0..54ce9b0 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/fk-pk-join-detection.test
@@ -5,7 +5,7 @@ on ss_customer_sk = c_customer_sk
where c_salutation = 'Mrs.'
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=180.46MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=180.46MB mem-reservation=8.50MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -13,7 +13,7 @@ PLAN-ROOT SINK
| hash predicates: ss_customer_sk = c_customer_sk
| fk/pk conjuncts: ss_customer_sk = c_customer_sk
| runtime filters: RF000 <- c_customer_sk
-| mem-estimate=4.46MB mem-reservation=136.00MB
+| mem-estimate=4.46MB mem-reservation=8.50MB spill-buffer=512.00KB
| tuple-ids=0,1 row-size=355B cardinality=529700
|
|--01:SCAN HDFS [tpcds.customer]
@@ -43,7 +43,7 @@ on ss_customer_sk = c_customer_sk
where c_salutation = 'Mrs.'
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=180.46MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=180.46MB mem-reservation=8.50MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -51,7 +51,7 @@ PLAN-ROOT SINK
| hash predicates: ss_customer_sk = c_customer_sk
| fk/pk conjuncts: ss_customer_sk = c_customer_sk
| other predicates: c_salutation = 'Mrs.'
-| mem-estimate=4.46MB mem-reservation=136.00MB
+| mem-estimate=4.46MB mem-reservation=8.50MB spill-buffer=512.00KB
| tuple-ids=0,1N row-size=355B cardinality=2880404
|
|--01:SCAN HDFS [tpcds.customer]
@@ -80,7 +80,7 @@ on ss_customer_sk = c_customer_sk
where c_salutation = 'Mrs.'
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=180.46MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=180.46MB mem-reservation=8.50MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -88,7 +88,7 @@ PLAN-ROOT SINK
| hash predicates: ss_customer_sk = c_customer_sk
| fk/pk conjuncts: ss_customer_sk = c_customer_sk
| runtime filters: RF000 <- c_customer_sk
-| mem-estimate=4.46MB mem-reservation=136.00MB
+| mem-estimate=4.46MB mem-reservation=8.50MB spill-buffer=512.00KB
| tuple-ids=0N,1 row-size=355B cardinality=529700
|
|--01:SCAN HDFS [tpcds.customer]
@@ -117,7 +117,7 @@ on ss_item_sk = sr_item_sk and ss_ticket_number = sr_ticket_number
where sr_return_quantity < 10
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=210.65MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=210.65MB mem-reservation=4.25MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -125,7 +125,7 @@ PLAN-ROOT SINK
| hash predicates: ss_item_sk = sr_item_sk, ss_ticket_number = sr_ticket_number
| fk/pk conjuncts: ss_item_sk = sr_item_sk, ss_ticket_number = sr_ticket_number
| runtime filters: RF000 <- sr_item_sk, RF001 <- sr_ticket_number
-| mem-estimate=2.65MB mem-reservation=136.00MB
+| mem-estimate=2.65MB mem-reservation=4.25MB spill-buffer=256.00KB
| tuple-ids=0,1 row-size=188B cardinality=211838
|
|--01:SCAN HDFS [tpcds.store_returns]
@@ -153,7 +153,7 @@ tpcds.store_sales inner join tpcds.web_sales
on ss_sold_time_sk = ws_sold_time_sk
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=396.67MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=396.67MB mem-reservation=34.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -161,7 +161,7 @@ PLAN-ROOT SINK
| hash predicates: ss_sold_time_sk = ws_sold_time_sk
| fk/pk conjuncts: none
| runtime filters: RF000 <- ws_sold_time_sk
-| mem-estimate=108.67MB mem-reservation=136.00MB
+| mem-estimate=108.67MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=244B cardinality=44136418
|
|--01:SCAN HDFS [tpcds.web_sales]
@@ -188,7 +188,7 @@ on a.d_date_sk = b.d_date_sk
where a.d_holiday = "Y"
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=107.62MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=107.62MB mem-reservation=17.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -196,7 +196,7 @@ PLAN-ROOT SINK
| hash predicates: b.d_date_sk = a.d_date_sk
| fk/pk conjuncts: b.d_date_sk = a.d_date_sk
| runtime filters: RF000 <- a.d_date_sk
-| mem-estimate=11.62MB mem-reservation=136.00MB
+| mem-estimate=11.62MB mem-reservation=17.00MB spill-buffer=1.00MB
| tuple-ids=1,0 row-size=606B cardinality=36525
|
|--00:SCAN HDFS [tpcds.date_dim a]
@@ -229,7 +229,7 @@ where ss_item_sk = sr_item_sk and ss_ticket_number = sr_ticket_number
and d1.d_fy_week_seq = 1000
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=352.73MB mem-reservation=544.00MB
+| Per-Host Resources: mem-estimate=352.73MB mem-reservation=4.25MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -237,7 +237,7 @@ PLAN-ROOT SINK
| hash predicates: ss_addr_sk = c_current_addr_sk
| fk/pk conjuncts: none
| runtime filters: RF000 <- c_current_addr_sk
-| mem-estimate=429.69KB mem-reservation=136.00MB
+| mem-estimate=429.69KB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=1,0,3,4,2 row-size=60B cardinality=19358
|
|--02:SCAN HDFS [tpcds.customer]
@@ -252,7 +252,7 @@ PLAN-ROOT SINK
| hash predicates: sr_returned_date_sk = d2.d_date_sk
| fk/pk conjuncts: sr_returned_date_sk = d2.d_date_sk
| runtime filters: RF001 <- d2.d_date_sk
-| mem-estimate=313.88KB mem-reservation=136.00MB
+| mem-estimate=313.88KB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=1,0,3,4 row-size=56B cardinality=8131
|
|--04:SCAN HDFS [tpcds.date_dim d2]
@@ -267,14 +267,14 @@ PLAN-ROOT SINK
| hash predicates: sr_item_sk = ss_item_sk, sr_ticket_number = ss_ticket_number
| fk/pk conjuncts: sr_item_sk = ss_item_sk, sr_ticket_number = ss_ticket_number
| runtime filters: RF002 <- ss_item_sk, RF003 <- ss_ticket_number
-| mem-estimate=380.02KB mem-reservation=136.00MB
+| mem-estimate=380.02KB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=1,0,3 row-size=52B cardinality=8131
|
|--05:HASH JOIN [INNER JOIN]
| | hash predicates: ss_sold_date_sk = d1.d_date_sk
| | fk/pk conjuncts: ss_sold_date_sk = d1.d_date_sk
| | runtime filters: RF004 <- d1.d_date_sk
-| | mem-estimate=62B mem-reservation=136.00MB
+| | mem-estimate=62B mem-reservation=1.06MB spill-buffer=64.00KB
| | tuple-ids=0,3 row-size=32B cardinality=11055
| |
| |--03:SCAN HDFS [tpcds.date_dim d1]
@@ -311,7 +311,7 @@ tpcds.store_sales inner join tpcds.customer
on ss_customer_sk % 10 = c_customer_sk / 100
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=202.79MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=202.79MB mem-reservation=34.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -319,7 +319,7 @@ PLAN-ROOT SINK
| hash predicates: ss_customer_sk % 10 = c_customer_sk / 100
| fk/pk conjuncts: assumed fk/pk
| runtime filters: RF000 <- c_customer_sk / 100
-| mem-estimate=26.79MB mem-reservation=136.00MB
+| mem-estimate=26.79MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=355B cardinality=2880404
|
|--01:SCAN HDFS [tpcds.customer]
@@ -346,7 +346,7 @@ tpcds.store_sales inner join tpcds_seq_snap.customer
on ss_customer_sk = c_customer_sk
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=2.17GB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=2.17GB mem-reservation=34.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -354,7 +354,7 @@ PLAN-ROOT SINK
| hash predicates: ss_customer_sk = c_customer_sk
| fk/pk conjuncts: assumed fk/pk
| runtime filters: RF000 <- c_customer_sk
-| mem-estimate=2.00GB mem-reservation=136.00MB
+| mem-estimate=2.00GB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=8B cardinality=2880404
|
|--01:SCAN HDFS [tpcds_seq_snap.customer]
@@ -380,7 +380,7 @@ tpcds_seq_snap.store_sales inner join tpcds.customer
on ss_customer_sk = c_customer_sk
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=176.42MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=176.42MB mem-reservation=1.06MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -388,7 +388,7 @@ PLAN-ROOT SINK
| hash predicates: ss_customer_sk = c_customer_sk
| fk/pk conjuncts: assumed fk/pk
| runtime filters: RF000 <- c_customer_sk
-| mem-estimate=429.69KB mem-reservation=136.00MB
+| mem-estimate=429.69KB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=0,1 row-size=8B cardinality=unavailable
|
|--01:SCAN HDFS [tpcds.customer]
@@ -416,7 +416,7 @@ tpcds.store_sales inner join
on ss_sold_time_sk = ws_sold_time_sk
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=298.00MB mem-reservation=400.00MB
+| Per-Host Resources: mem-estimate=298.00MB mem-reservation=2.12MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -424,12 +424,12 @@ PLAN-ROOT SINK
| hash predicates: ss_sold_time_sk = ws_sold_time_sk
| fk/pk conjuncts: none
| runtime filters: RF000 <- ws_sold_time_sk
-| mem-estimate=170.89KB mem-reservation=136.00MB
+| mem-estimate=170.89KB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=0,2 row-size=104B cardinality=2440073
|
|--02:AGGREGATE [FINALIZE]
| | group by: ws_sold_time_sk
-| | mem-estimate=10.00MB mem-reservation=264.00MB
+| | mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
| | tuple-ids=2 row-size=4B cardinality=39771
| |
| 01:SCAN HDFS [tpcds.web_sales]
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
index f22e359..8bd09be 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test
@@ -40,7 +40,7 @@ order by cnt, bigint_col
limit 10
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=144.00MB mem-reservation=264.00MB
+| Per-Host Resources: mem-estimate=144.00MB mem-reservation=34.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -52,7 +52,7 @@ PLAN-ROOT SINK
01:AGGREGATE [FINALIZE]
| output: count(int_col)
| group by: bigint_col
-| mem-estimate=128.00MB mem-reservation=264.00MB
+| mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=16B cardinality=unavailable
|
00:SCAN HDFS [functional_parquet.alltypes]
@@ -78,7 +78,7 @@ PLAN-ROOT SINK
| tuple-ids=2 row-size=16B cardinality=10
|
F01:PLAN FRAGMENT [HASH(bigint_col)] hosts=3 instances=9
-Per-Host Resources: mem-estimate=384.00MB mem-reservation=792.00MB
+Per-Host Resources: mem-estimate=384.00MB mem-reservation=102.00MB
02:TOP-N [LIMIT=10]
| order by: count(int_col) ASC, bigint_col ASC
| mem-estimate=160B mem-reservation=0B
@@ -87,7 +87,7 @@ Per-Host Resources: mem-estimate=384.00MB mem-reservation=792.00MB
04:AGGREGATE [FINALIZE]
| output: count:merge(int_col)
| group by: bigint_col
-| mem-estimate=128.00MB mem-reservation=264.00MB
+| mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=16B cardinality=unavailable
|
03:EXCHANGE [HASH(bigint_col)]
@@ -99,7 +99,7 @@ Per-Host Resources: mem-estimate=432.00MB mem-reservation=0B
01:AGGREGATE [STREAMING]
| output: count(int_col)
| group by: bigint_col
-| mem-estimate=128.00MB mem-reservation=0B
+| mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=16B cardinality=unavailable
|
00:SCAN HDFS [functional_parquet.alltypes, RANDOM]
@@ -119,7 +119,7 @@ from functional_parquet.alltypes
where id < 10
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=16.00MB mem-reservation=40.00MB
+| Per-Host Resources: mem-estimate=16.00MB mem-reservation=10.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -128,12 +128,12 @@ PLAN-ROOT SINK
| partition by: int_col
| order by: id ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=4,3 row-size=16B cardinality=unavailable
|
01:SORT
| order by: int_col ASC NULLS FIRST, id ASC
-| mem-estimate=0B mem-reservation=24.00MB
+| mem-estimate=0B mem-reservation=6.00MB spill-buffer=2.00MB
| tuple-ids=4 row-size=8B cardinality=unavailable
|
00:SCAN HDFS [functional_parquet.alltypes]
@@ -157,18 +157,18 @@ PLAN-ROOT SINK
| tuple-ids=4,3 row-size=16B cardinality=unavailable
|
F01:PLAN FRAGMENT [HASH(int_col)] hosts=3 instances=9
-Per-Host Resources: mem-estimate=0B mem-reservation=120.00MB
+Per-Host Resources: mem-estimate=0B mem-reservation=30.00MB
02:ANALYTIC
| functions: row_number()
| partition by: int_col
| order by: id ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=4,3 row-size=16B cardinality=unavailable
|
01:SORT
| order by: int_col ASC NULLS FIRST, id ASC
-| mem-estimate=0B mem-reservation=24.00MB
+| mem-estimate=0B mem-reservation=6.00MB spill-buffer=2.00MB
| tuple-ids=4 row-size=8B cardinality=unavailable
|
03:EXCHANGE [HASH(int_col)]
@@ -313,7 +313,7 @@ from tpch_nested_parquet.customer c, c.c_orders o1, c.c_orders o2
where o1.o_orderkey = o2.o_orderkey + 2 and o1.o_orderkey < 5
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=88.00MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=88.00MB mem-reservation=1.06MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -324,7 +324,7 @@ PLAN-ROOT SINK
|--06:HASH JOIN [INNER JOIN]
| | hash predicates: o1.o_orderkey = o2.o_orderkey + 2
| | fk/pk conjuncts: assumed fk/pk
-| | mem-estimate=0B mem-reservation=136.00MB
+| | mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
| | tuple-ids=1,0,2 row-size=286B cardinality=10
| |
| |--04:UNNEST [c.c_orders o2]
@@ -366,7 +366,7 @@ PLAN-ROOT SINK
| tuple-ids=1,0,2 row-size=286B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=9
-Per-Host Resources: mem-estimate=264.00MB mem-reservation=408.00MB
+Per-Host Resources: mem-estimate=264.00MB mem-reservation=3.19MB
01:SUBPLAN
| mem-estimate=0B mem-reservation=0B
| tuple-ids=1,0,2 row-size=286B cardinality=1500000
@@ -374,7 +374,7 @@ Per-Host Resources: mem-estimate=264.00MB mem-reservation=408.00MB
|--06:HASH JOIN [INNER JOIN]
| | hash predicates: o1.o_orderkey = o2.o_orderkey + 2
| | fk/pk conjuncts: assumed fk/pk
-| | mem-estimate=0B mem-reservation=136.00MB
+| | mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
| | tuple-ids=1,0,2 row-size=286B cardinality=10
| |
| |--04:UNNEST [c.c_orders o2]
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test b/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
index 0de7109..4165e70 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test
@@ -14,7 +14,7 @@ PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: count(*)
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
@@ -44,7 +44,7 @@ PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: count(*)
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
@@ -75,7 +75,7 @@ PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: count(*)
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=8B cardinality=1
|
00:SCAN HDFS [functional_parquet.alltypes]
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
index f3dd19a..90a318e 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
@@ -353,18 +353,18 @@ select l_orderkey, count(*)
from tpch_parquet.lineitem
group by l_orderkey
---- PLAN
-Per-Host Resource Reservation: Memory=264.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=106.24MB
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=106.24MB mem-reservation=264.00MB
+| Per-Host Resources: mem-estimate=106.24MB mem-reservation=34.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:AGGREGATE [FINALIZE]
| output: count(*)
| group by: l_orderkey
-| mem-estimate=26.24MB mem-reservation=264.00MB
+| mem-estimate=26.24MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=16B cardinality=1563438
|
00:SCAN HDFS [tpch_parquet.lineitem]
@@ -375,7 +375,7 @@ PLAN-ROOT SINK
mem-estimate=80.00MB mem-reservation=0B
tuple-ids=0 row-size=8B cardinality=6001215
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=264.00MB
+Per-Host Resource Reservation: Memory=8.50MB
Per-Host Resource Estimates: Memory=116.24MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -388,11 +388,11 @@ PLAN-ROOT SINK
| tuple-ids=1 row-size=16B cardinality=1563438
|
F01:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=10.00MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=10.00MB mem-reservation=8.50MB
03:AGGREGATE [FINALIZE]
| output: count:merge(*)
| group by: l_orderkey
-| mem-estimate=10.00MB mem-reservation=264.00MB
+| mem-estimate=10.00MB mem-reservation=8.50MB spill-buffer=512.00KB
| tuple-ids=1 row-size=16B cardinality=1563438
|
02:EXCHANGE [HASH(l_orderkey)]
@@ -404,7 +404,7 @@ Per-Host Resources: mem-estimate=106.24MB mem-reservation=0B
01:AGGREGATE [STREAMING]
| output: count(*)
| group by: l_orderkey
-| mem-estimate=26.24MB mem-reservation=0B
+| mem-estimate=26.24MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=16B cardinality=1563438
|
00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -415,7 +415,7 @@ Per-Host Resources: mem-estimate=106.24MB mem-reservation=0B
mem-estimate=80.00MB mem-reservation=0B
tuple-ids=0 row-size=8B cardinality=6001215
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=528.00MB
+Per-Host Resource Reservation: Memory=8.50MB
Per-Host Resource Estimates: Memory=232.48MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -428,11 +428,11 @@ PLAN-ROOT SINK
| tuple-ids=1 row-size=16B cardinality=1563438
|
F01:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=20.00MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=20.00MB mem-reservation=8.50MB
03:AGGREGATE [FINALIZE]
| output: count:merge(*)
| group by: l_orderkey
-| mem-estimate=10.00MB mem-reservation=264.00MB
+| mem-estimate=10.00MB mem-reservation=4.25MB spill-buffer=256.00KB
| tuple-ids=1 row-size=16B cardinality=1563438
|
02:EXCHANGE [HASH(l_orderkey)]
@@ -444,7 +444,7 @@ Per-Host Resources: mem-estimate=212.48MB mem-reservation=0B
01:AGGREGATE [STREAMING]
| output: count(*)
| group by: l_orderkey
-| mem-estimate=26.24MB mem-reservation=0B
+| mem-estimate=26.24MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=16B cardinality=1563438
|
00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -468,7 +468,7 @@ PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum_init_zero(tpch_parquet.lineitem.parquet-stats: num_rows)
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=8B cardinality=1
|
00:SCAN HDFS [tpch_parquet.lineitem]
@@ -489,7 +489,7 @@ PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: count:merge(*)
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=8B cardinality=1
|
02:EXCHANGE [UNPARTITIONED]
@@ -500,7 +500,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
Per-Host Resources: mem-estimate=90.00MB mem-reservation=0B
01:AGGREGATE
| output: sum_init_zero(tpch_parquet.lineitem.parquet-stats: num_rows)
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=8B cardinality=1
|
00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -521,7 +521,7 @@ PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: count:merge(*)
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=8B cardinality=1
|
02:EXCHANGE [UNPARTITIONED]
@@ -532,7 +532,7 @@ F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
Per-Host Resources: mem-estimate=180.00MB mem-reservation=0B
01:AGGREGATE
| output: sum_init_zero(tpch_parquet.lineitem.parquet-stats: num_rows)
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=1 row-size=8B cardinality=1
|
00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -548,17 +548,17 @@ select *
from tpch_parquet.lineitem
order by l_comment
---- PLAN
-Per-Host Resource Reservation: Memory=48.00MB
-Per-Host Resource Estimates: Memory=240.00MB
+Per-Host Resource Reservation: Memory=12.00MB
+Per-Host Resource Estimates: Memory=120.00MB
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=240.00MB mem-reservation=48.00MB
+| Per-Host Resources: mem-estimate=120.00MB mem-reservation=12.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:SORT
| order by: l_comment ASC
-| mem-estimate=160.00MB mem-reservation=48.00MB
+| mem-estimate=40.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=263B cardinality=6001215
|
00:SCAN HDFS [tpch_parquet.lineitem]
@@ -569,8 +569,8 @@ PLAN-ROOT SINK
mem-estimate=80.00MB mem-reservation=0B
tuple-ids=0 row-size=263B cardinality=6001215
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=48.00MB
-Per-Host Resource Estimates: Memory=240.00MB
+Per-Host Resource Reservation: Memory=12.00MB
+Per-Host Resource Estimates: Memory=120.00MB
F01:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=0B mem-reservation=0B
@@ -583,10 +583,10 @@ PLAN-ROOT SINK
| tuple-ids=1 row-size=263B cardinality=6001215
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=240.00MB mem-reservation=48.00MB
+Per-Host Resources: mem-estimate=120.00MB mem-reservation=12.00MB
01:SORT
| order by: l_comment ASC
-| mem-estimate=160.00MB mem-reservation=48.00MB
+| mem-estimate=40.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=263B cardinality=6001215
|
00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -597,8 +597,8 @@ Per-Host Resources: mem-estimate=240.00MB mem-reservation=48.00MB
mem-estimate=80.00MB mem-reservation=0B
tuple-ids=0 row-size=263B cardinality=6001215
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=96.00MB
-Per-Host Resource Estimates: Memory=480.00MB
+Per-Host Resource Reservation: Memory=24.00MB
+Per-Host Resource Estimates: Memory=240.00MB
F01:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
| Per-Host Resources: mem-estimate=0B mem-reservation=0B
@@ -611,10 +611,10 @@ PLAN-ROOT SINK
| tuple-ids=1 row-size=263B cardinality=6001215
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=480.00MB mem-reservation=96.00MB
+Per-Host Resources: mem-estimate=240.00MB mem-reservation=24.00MB
01:SORT
| order by: l_comment ASC
-| mem-estimate=160.00MB mem-reservation=48.00MB
+| mem-estimate=40.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=263B cardinality=6001215
|
00:SCAN HDFS [tpch_parquet.lineitem, RANDOM]
@@ -714,11 +714,11 @@ Per-Host Resources: mem-estimate=160.05MB mem-reservation=0B
select *
from tpch.lineitem inner join tpch.orders on l_orderkey = o_orderkey
---- PLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=476.41MB
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=476.41MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=476.41MB mem-reservation=34.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -726,7 +726,7 @@ PLAN-ROOT SINK
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=300.41MB mem-reservation=136.00MB
+| mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
|--01:SCAN HDFS [tpch.orders]
@@ -746,7 +746,7 @@ PLAN-ROOT SINK
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=263B cardinality=6001215
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=476.41MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -759,12 +759,12 @@ PLAN-ROOT SINK
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=388.41MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=388.41MB mem-reservation=34.00MB
02:HASH JOIN [INNER JOIN, BROADCAST]
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=300.41MB mem-reservation=136.00MB
+| mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
|--03:EXCHANGE [BROADCAST]
@@ -790,7 +790,7 @@ Per-Host Resources: mem-estimate=388.41MB mem-reservation=136.00MB
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=263B cardinality=6001215
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
Per-Host Resource Estimates: Memory=952.83MB
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -803,13 +803,13 @@ PLAN-ROOT SINK
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=776.83MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=776.83MB mem-reservation=68.00MB
02:HASH JOIN [INNER JOIN, BROADCAST]
| hash-table-id=00
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=300.41MB mem-reservation=136.00MB
+| mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
|--F03:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -846,11 +846,11 @@ Per-Host Resources: mem-estimate=776.83MB mem-reservation=272.00MB
select *
from tpch.lineitem inner join /* +shuffle */ tpch.orders on l_orderkey = o_orderkey
---- PLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=476.41MB
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=476.41MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=476.41MB mem-reservation=34.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -858,7 +858,7 @@ PLAN-ROOT SINK
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=300.41MB mem-reservation=136.00MB
+| mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
|--01:SCAN HDFS [tpch.orders]
@@ -878,7 +878,7 @@ PLAN-ROOT SINK
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=263B cardinality=6001215
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=34.00MB
Per-Host Resource Estimates: Memory=276.14MB
F03:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -891,12 +891,12 @@ PLAN-ROOT SINK
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=100.14MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=100.14MB mem-reservation=34.00MB
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=100.14MB mem-reservation=136.00MB
+| mem-estimate=100.14MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
|--04:EXCHANGE [HASH(o_orderkey)]
@@ -928,7 +928,7 @@ Per-Host Resources: mem-estimate=88.00MB mem-reservation=0B
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=263B cardinality=6001215
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
Per-Host Resource Estimates: Memory=452.14MB
F03:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -941,13 +941,13 @@ PLAN-ROOT SINK
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=100.14MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=100.14MB mem-reservation=68.00MB
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash-table-id=00
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=50.07MB mem-reservation=136.00MB
+| mem-estimate=50.07MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1 row-size=454B cardinality=5757710
|
|--F04:PLAN FRAGMENT [HASH(l_orderkey)] hosts=2 instances=4
@@ -1151,24 +1151,24 @@ PLAN-ROOT SINK
select max(tinyint_col) over(partition by int_col)
from functional.alltypes
---- PLAN
-Per-Host Resource Reservation: Memory=40.00MB
-Per-Host Resource Estimates: Memory=24.00MB
+Per-Host Resource Reservation: Memory=10.00MB
+Per-Host Resource Estimates: Memory=18.00MB
Codegen disabled by planner
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=24.00MB mem-reservation=40.00MB
+| Per-Host Resources: mem-estimate=18.00MB mem-reservation=10.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
02:ANALYTIC
| functions: max(tinyint_col)
| partition by: int_col
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=3,2 row-size=6B cardinality=7300
|
01:SORT
| order by: int_col ASC NULLS FIRST
-| mem-estimate=8.00MB mem-reservation=24.00MB
+| mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
| tuple-ids=3 row-size=5B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -1179,8 +1179,8 @@ PLAN-ROOT SINK
mem-estimate=16.00MB mem-reservation=0B
tuple-ids=0 row-size=5B cardinality=7300
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=40.00MB
-Per-Host Resource Estimates: Memory=24.00MB
+Per-Host Resource Reservation: Memory=10.00MB
+Per-Host Resource Estimates: Memory=18.00MB
Codegen disabled by planner
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1193,16 +1193,16 @@ PLAN-ROOT SINK
| tuple-ids=3,2 row-size=6B cardinality=7300
|
F01:PLAN FRAGMENT [HASH(int_col)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=8.00MB mem-reservation=40.00MB
+Per-Host Resources: mem-estimate=2.00MB mem-reservation=10.00MB
02:ANALYTIC
| functions: max(tinyint_col)
| partition by: int_col
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=3,2 row-size=6B cardinality=7300
|
01:SORT
| order by: int_col ASC NULLS FIRST
-| mem-estimate=8.00MB mem-reservation=24.00MB
+| mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
| tuple-ids=3 row-size=5B cardinality=7300
|
03:EXCHANGE [HASH(int_col)]
@@ -1219,8 +1219,8 @@ Per-Host Resources: mem-estimate=16.00MB mem-reservation=0B
mem-estimate=16.00MB mem-reservation=0B
tuple-ids=0 row-size=5B cardinality=7300
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=80.00MB
-Per-Host Resource Estimates: Memory=48.00MB
+Per-Host Resource Reservation: Memory=20.00MB
+Per-Host Resource Estimates: Memory=36.00MB
Codegen disabled by planner
F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1233,16 +1233,16 @@ PLAN-ROOT SINK
| tuple-ids=3,2 row-size=6B cardinality=7300
|
F01:PLAN FRAGMENT [HASH(int_col)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=16.00MB mem-reservation=80.00MB
+Per-Host Resources: mem-estimate=4.00MB mem-reservation=20.00MB
02:ANALYTIC
| functions: max(tinyint_col)
| partition by: int_col
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=3,2 row-size=6B cardinality=7300
|
01:SORT
| order by: int_col ASC NULLS FIRST
-| mem-estimate=8.00MB mem-reservation=24.00MB
+| mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
| tuple-ids=3 row-size=5B cardinality=7300
|
03:EXCHANGE [HASH(int_col)]
@@ -1266,11 +1266,11 @@ select *, row_number() over (order by o_totalprice) rnum_price,
row_number() over (order by o_orderpriority) rnum_priority
from tpch_parquet.orders
---- PLAN
-Per-Host Resource Reservation: Memory=144.00MB
-Per-Host Resource Estimates: Memory=160.00MB
+Per-Host Resource Reservation: Memory=36.00MB
+Per-Host Resource Estimates: Memory=58.00MB
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=160.00MB mem-reservation=144.00MB
+| Per-Host Resources: mem-estimate=58.00MB mem-reservation=36.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -1278,36 +1278,36 @@ PLAN-ROOT SINK
| functions: row_number()
| order by: o_orderpriority ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=10,5 row-size=215B cardinality=1500000
|
05:SORT
| order by: o_orderpriority ASC
-| mem-estimate=80.00MB mem-reservation=48.00MB
+| mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=10 row-size=207B cardinality=1500000
|
04:ANALYTIC
| functions: row_number()
| order by: o_orderdate ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=8,4 row-size=207B cardinality=1500000
|
03:SORT
| order by: o_orderdate ASC
-| mem-estimate=80.00MB mem-reservation=48.00MB
+| mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=8 row-size=199B cardinality=1500000
|
02:ANALYTIC
| functions: row_number()
| order by: o_totalprice ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=6,3 row-size=199B cardinality=1500000
|
01:SORT
| order by: o_totalprice ASC
-| mem-estimate=80.00MB mem-reservation=48.00MB
+| mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=6 row-size=191B cardinality=1500000
|
00:SCAN HDFS [tpch_parquet.orders]
@@ -1318,11 +1318,11 @@ PLAN-ROOT SINK
mem-estimate=40.00MB mem-reservation=0B
tuple-ids=0 row-size=191B cardinality=1500000
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=176.00MB
-Per-Host Resource Estimates: Memory=280.00MB
+Per-Host Resource Reservation: Memory=44.00MB
+Per-Host Resource Estimates: Memory=94.00MB
F01:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=160.00MB mem-reservation=128.00MB
+| Per-Host Resources: mem-estimate=36.00MB mem-reservation=32.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -1330,31 +1330,31 @@ PLAN-ROOT SINK
| functions: row_number()
| order by: o_orderpriority ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=10,5 row-size=215B cardinality=1500000
|
05:SORT
| order by: o_orderpriority ASC
-| mem-estimate=80.00MB mem-reservation=48.00MB
+| mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=10 row-size=207B cardinality=1500000
|
04:ANALYTIC
| functions: row_number()
| order by: o_orderdate ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=8,4 row-size=207B cardinality=1500000
|
03:SORT
| order by: o_orderdate ASC
-| mem-estimate=80.00MB mem-reservation=48.00MB
+| mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=8 row-size=199B cardinality=1500000
|
02:ANALYTIC
| functions: row_number()
| order by: o_totalprice ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=6,3 row-size=199B cardinality=1500000
|
07:MERGING-EXCHANGE [UNPARTITIONED]
@@ -1363,10 +1363,10 @@ PLAN-ROOT SINK
| tuple-ids=6 row-size=191B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=2
-Per-Host Resources: mem-estimate=120.00MB mem-reservation=48.00MB
+Per-Host Resources: mem-estimate=58.00MB mem-reservation=12.00MB
01:SORT
| order by: o_totalprice ASC
-| mem-estimate=80.00MB mem-reservation=48.00MB
+| mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=6 row-size=191B cardinality=1500000
|
00:SCAN HDFS [tpch_parquet.orders, RANDOM]
@@ -1377,11 +1377,11 @@ Per-Host Resources: mem-estimate=120.00MB mem-reservation=48.00MB
mem-estimate=40.00MB mem-reservation=0B
tuple-ids=0 row-size=191B cardinality=1500000
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=224.00MB
-Per-Host Resource Estimates: Memory=400.00MB
+Per-Host Resource Reservation: Memory=56.00MB
+Per-Host Resource Estimates: Memory=152.00MB
F01:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=160.00MB mem-reservation=128.00MB
+| Per-Host Resources: mem-estimate=36.00MB mem-reservation=32.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -1389,31 +1389,31 @@ PLAN-ROOT SINK
| functions: row_number()
| order by: o_orderpriority ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=10,5 row-size=215B cardinality=1500000
|
05:SORT
| order by: o_orderpriority ASC
-| mem-estimate=80.00MB mem-reservation=48.00MB
+| mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=10 row-size=207B cardinality=1500000
|
04:ANALYTIC
| functions: row_number()
| order by: o_orderdate ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=8,4 row-size=207B cardinality=1500000
|
03:SORT
| order by: o_orderdate ASC
-| mem-estimate=80.00MB mem-reservation=48.00MB
+| mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=8 row-size=199B cardinality=1500000
|
02:ANALYTIC
| functions: row_number()
| order by: o_totalprice ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=6,3 row-size=199B cardinality=1500000
|
07:MERGING-EXCHANGE [UNPARTITIONED]
@@ -1422,10 +1422,10 @@ PLAN-ROOT SINK
| tuple-ids=6 row-size=191B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
-Per-Host Resources: mem-estimate=240.00MB mem-reservation=96.00MB
+Per-Host Resources: mem-estimate=116.00MB mem-reservation=24.00MB
01:SORT
| order by: o_totalprice ASC
-| mem-estimate=80.00MB mem-reservation=48.00MB
+| mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=6 row-size=191B cardinality=1500000
|
00:SCAN HDFS [tpch_parquet.orders, RANDOM]
@@ -1449,11 +1449,11 @@ select l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
from tpch_parquet.lineitem join tpch_parquet.orders on l_orderkey = o_orderkey
where l_shipmode = 'F'
---- PLAN
-Per-Host Resource Reservation: Memory=400.00MB
+Per-Host Resource Reservation: Memory=51.00MB
Per-Host Resource Estimates: Memory=135.17MB
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=135.17MB mem-reservation=400.00MB
+| Per-Host Resources: mem-estimate=135.17MB mem-reservation=51.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -1466,7 +1466,7 @@ PLAN-ROOT SINK
| | hash predicates: l_orderkey = o_orderkey
| | fk/pk conjuncts: l_orderkey = o_orderkey
| | runtime filters: RF002 <- o_orderkey
-| | mem-estimate=12.59MB mem-reservation=136.00MB
+| | mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
| | tuple-ids=5,6 row-size=99B cardinality=822530
| |
| |--09:SCAN HDFS [tpch_parquet.orders]
@@ -1493,7 +1493,7 @@ PLAN-ROOT SINK
| | hash predicates: l_orderkey = o_orderkey
| | fk/pk conjuncts: l_orderkey = o_orderkey
| | runtime filters: RF001 <- o_orderkey
-| | mem-estimate=10.20MB mem-reservation=136.00MB
+| | mem-estimate=10.20MB mem-reservation=17.00MB spill-buffer=1.00MB
| | tuple-ids=3,4 row-size=103B cardinality=1151542
| |
| |--06:SCAN HDFS [tpch_parquet.orders]
@@ -1518,14 +1518,14 @@ PLAN-ROOT SINK
|
04:AGGREGATE [FINALIZE]
| group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-| mem-estimate=42.58MB mem-reservation=264.00MB
+| mem-estimate=42.58MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=2 row-size=70B cardinality=575772
|
03:HASH JOIN [INNER JOIN]
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=12.59MB mem-reservation=136.00MB
+| mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
| tuple-ids=0,1 row-size=86B cardinality=575772
|
|--02:SCAN HDFS [tpch_parquet.orders]
@@ -1548,7 +1548,7 @@ PLAN-ROOT SINK
mem-estimate=80.00MB mem-reservation=0B
tuple-ids=0 row-size=78B cardinality=600122
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=400.00MB
+Per-Host Resource Reservation: Memory=38.25MB
Per-Host Resource Estimates: Memory=339.36MB
F09:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1561,7 +1561,7 @@ PLAN-ROOT SINK
| tuple-ids=7 row-size=70B cardinality=2549844
|
F08:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=92.59MB mem-reservation=34.00MB
00:UNION
| pass-through-operands: 14
| mem-estimate=0B mem-reservation=0B
@@ -1571,7 +1571,7 @@ Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
| | hash predicates: l_orderkey = o_orderkey
| | fk/pk conjuncts: l_orderkey = o_orderkey
| | runtime filters: RF002 <- o_orderkey
-| | mem-estimate=12.59MB mem-reservation=136.00MB
+| | mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
| | tuple-ids=5,6 row-size=99B cardinality=822530
| |
| |--16:EXCHANGE [BROADCAST]
@@ -1604,7 +1604,7 @@ Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
| | hash predicates: l_orderkey = o_orderkey
| | fk/pk conjuncts: l_orderkey = o_orderkey
| | runtime filters: RF001 <- o_orderkey
-| | mem-estimate=10.20MB mem-reservation=136.00MB
+| | mem-estimate=10.20MB mem-reservation=17.00MB spill-buffer=1.00MB
| | tuple-ids=3,4 row-size=103B cardinality=1151542
| |
| |--15:EXCHANGE [BROADCAST]
@@ -1635,7 +1635,7 @@ Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
|
14:AGGREGATE [FINALIZE]
| group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-| mem-estimate=42.58MB mem-reservation=264.00MB
+| mem-estimate=42.58MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=2 row-size=70B cardinality=575772
|
13:EXCHANGE [HASH(l_orderkey,l_partkey,l_suppkey,l_linenumber,l_comment)]
@@ -1643,17 +1643,17 @@ Per-Host Resources: mem-estimate=92.59MB mem-reservation=264.00MB
| tuple-ids=2 row-size=70B cardinality=575772
|
F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=46.78MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=46.78MB mem-reservation=4.25MB
04:AGGREGATE [STREAMING]
| group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-| mem-estimate=42.58MB mem-reservation=0B
+| mem-estimate=42.58MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=2 row-size=70B cardinality=575772
|
03:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=4.20MB mem-reservation=136.00MB
+| mem-estimate=4.20MB mem-reservation=4.25MB spill-buffer=256.00KB
| tuple-ids=0,1 row-size=86B cardinality=575772
|
|--12:EXCHANGE [HASH(o_orderkey)]
@@ -1688,7 +1688,7 @@ Per-Host Resources: mem-estimate=80.00MB mem-reservation=0B
mem-estimate=80.00MB mem-reservation=0B
tuple-ids=0 row-size=78B cardinality=600122
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=800.00MB
+Per-Host Resource Reservation: Memory=72.25MB
Per-Host Resource Estimates: Memory=674.53MB
F09:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1701,7 +1701,7 @@ PLAN-ROOT SINK
| tuple-ids=7 row-size=70B cardinality=2549844
|
F08:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=185.18MB mem-reservation=68.00MB
00:UNION
| pass-through-operands: 14
| mem-estimate=0B mem-reservation=0B
@@ -1712,7 +1712,7 @@ Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
| | hash predicates: l_orderkey = o_orderkey
| | fk/pk conjuncts: l_orderkey = o_orderkey
| | runtime filters: RF002 <- o_orderkey
-| | mem-estimate=12.59MB mem-reservation=136.00MB
+| | mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
| | tuple-ids=5,6 row-size=99B cardinality=822530
| |
| |--F11:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -1753,7 +1753,7 @@ Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
| | hash predicates: l_orderkey = o_orderkey
| | fk/pk conjuncts: l_orderkey = o_orderkey
| | runtime filters: RF001 <- o_orderkey
-| | mem-estimate=10.20MB mem-reservation=136.00MB
+| | mem-estimate=10.20MB mem-reservation=17.00MB spill-buffer=1.00MB
| | tuple-ids=3,4 row-size=103B cardinality=1151542
| |
| |--F10:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -1791,7 +1791,7 @@ Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
|
14:AGGREGATE [FINALIZE]
| group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-| mem-estimate=42.58MB mem-reservation=264.00MB
+| mem-estimate=42.58MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=2 row-size=70B cardinality=575772
|
13:EXCHANGE [HASH(l_orderkey,l_partkey,l_suppkey,l_linenumber,l_comment)]
@@ -1799,10 +1799,10 @@ Per-Host Resources: mem-estimate=185.18MB mem-reservation=528.00MB
| tuple-ids=2 row-size=70B cardinality=575772
|
F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=89.35MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=89.35MB mem-reservation=4.25MB
04:AGGREGATE [STREAMING]
| group by: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_comment
-| mem-estimate=42.58MB mem-reservation=0B
+| mem-estimate=42.58MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=2 row-size=70B cardinality=575772
|
03:HASH JOIN [INNER JOIN, PARTITIONED]
@@ -1810,7 +1810,7 @@ Per-Host Resources: mem-estimate=89.35MB mem-reservation=272.00MB
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF000 <- o_orderkey
-| mem-estimate=2.10MB mem-reservation=136.00MB
+| mem-estimate=2.10MB mem-reservation=2.12MB spill-buffer=128.00KB
| tuple-ids=0,1 row-size=86B cardinality=575772
|
|--F12:PLAN FRAGMENT [HASH(l_orderkey)] hosts=2 instances=4
@@ -1888,11 +1888,11 @@ order by
o_orderdate
limit 100
---- PLAN
-Per-Host Resource Reservation: Memory=672.00MB
+Per-Host Resource Reservation: Memory=80.75MB
Per-Host Resource Estimates: Memory=391.29MB
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=391.29MB mem-reservation=672.00MB
+| Per-Host Resources: mem-estimate=391.29MB mem-reservation=80.75MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -1904,20 +1904,20 @@ PLAN-ROOT SINK
08:AGGREGATE [FINALIZE]
| output: sum(l_quantity)
| group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-| mem-estimate=60.40MB mem-reservation=264.00MB
+| mem-estimate=60.40MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=6 row-size=100B cardinality=575772
|
07:HASH JOIN [LEFT SEMI JOIN]
| hash predicates: o_orderkey = l_orderkey
| runtime filters: RF000 <- l_orderkey
-| mem-estimate=3.94MB mem-reservation=136.00MB
+| mem-estimate=3.94MB mem-reservation=4.25MB spill-buffer=256.00KB
| tuple-ids=2,1,0 row-size=108B cardinality=575772
|
|--04:AGGREGATE [FINALIZE]
| | output: sum(l_quantity)
| | group by: l_orderkey
| | having: sum(l_quantity) > 300
-| | mem-estimate=10.00MB mem-reservation=264.00MB
+| | mem-estimate=10.00MB mem-reservation=4.25MB spill-buffer=256.00KB
| | tuple-ids=4 row-size=24B cardinality=156344
| |
| 03:SCAN HDFS [tpch.lineitem]
@@ -1932,7 +1932,7 @@ PLAN-ROOT SINK
| hash predicates: o_custkey = c_custkey
| fk/pk conjuncts: o_custkey = c_custkey
| runtime filters: RF001 <- c_custkey
-| mem-estimate=6.61MB mem-reservation=136.00MB
+| mem-estimate=6.61MB mem-reservation=8.50MB spill-buffer=512.00KB
| tuple-ids=2,1,0 row-size=108B cardinality=5757710
|
|--00:SCAN HDFS [tpch.customer]
@@ -1947,7 +1947,7 @@ PLAN-ROOT SINK
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF002 <- o_orderkey
-| mem-estimate=78.68MB mem-reservation=136.00MB
+| mem-estimate=78.68MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=2,1 row-size=66B cardinality=5757710
|
|--01:SCAN HDFS [tpch.orders]
@@ -1968,7 +1968,7 @@ PLAN-ROOT SINK
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=2 row-size=16B cardinality=6001215
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=936.00MB
+Per-Host Resource Reservation: Memory=82.88MB
Per-Host Resource Estimates: Memory=500.32MB
F07:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -1983,7 +1983,7 @@ PLAN-ROOT SINK
| tuple-ids=7 row-size=100B cardinality=100
|
F06:PLAN FRAGMENT [HASH(c_name,c_custkey,o_orderkey,o_orderdate,o_totalprice)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=60.41MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=60.41MB mem-reservation=34.00MB
09:TOP-N [LIMIT=100]
| order by: o_totalprice DESC, o_orderdate ASC
| mem-estimate=9.77KB mem-reservation=0B
@@ -1992,7 +1992,7 @@ Per-Host Resources: mem-estimate=60.41MB mem-reservation=264.00MB
16:AGGREGATE [FINALIZE]
| output: sum:merge(l_quantity)
| group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-| mem-estimate=60.40MB mem-reservation=264.00MB
+| mem-estimate=60.40MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=6 row-size=100B cardinality=575772
|
15:EXCHANGE [HASH(c_name,c_custkey,o_orderkey,o_orderdate,o_totalprice)]
@@ -2000,24 +2000,24 @@ Per-Host Resources: mem-estimate=60.41MB mem-reservation=264.00MB
| tuple-ids=6 row-size=100B cardinality=575772
|
F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=104.55MB mem-reservation=672.00MB
+Per-Host Resources: mem-estimate=104.55MB mem-reservation=48.88MB
08:AGGREGATE [STREAMING]
| output: sum(l_quantity)
| group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-| mem-estimate=60.40MB mem-reservation=0B
+| mem-estimate=60.40MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=6 row-size=100B cardinality=575772
|
07:HASH JOIN [LEFT SEMI JOIN, PARTITIONED]
| hash predicates: o_orderkey = l_orderkey
| runtime filters: RF000 <- l_orderkey
-| mem-estimate=1.31MB mem-reservation=136.00MB
+| mem-estimate=1.31MB mem-reservation=2.12MB spill-buffer=128.00KB
| tuple-ids=2,1,0 row-size=108B cardinality=575772
|
|--14:AGGREGATE [FINALIZE]
| | output: sum:merge(l_quantity)
| | group by: l_orderkey
| | having: sum(l_quantity) > 300
-| | mem-estimate=10.00MB mem-reservation=264.00MB
+| | mem-estimate=10.00MB mem-reservation=4.25MB spill-buffer=256.00KB
| | tuple-ids=4 row-size=24B cardinality=156344
| |
| 13:EXCHANGE [HASH(l_orderkey)]
@@ -2029,7 +2029,7 @@ Per-Host Resources: mem-estimate=104.55MB mem-reservation=672.00MB
| 04:AGGREGATE [STREAMING]
| | output: sum(l_quantity)
| | group by: l_orderkey
-| | mem-estimate=39.36MB mem-reservation=0B
+| | mem-estimate=39.36MB mem-reservation=0B spill-buffer=2.00MB
| | tuple-ids=4 row-size=24B cardinality=1563438
| |
| 03:SCAN HDFS [tpch.lineitem, RANDOM]
@@ -2044,7 +2044,7 @@ Per-Host Resources: mem-estimate=104.55MB mem-reservation=672.00MB
| hash predicates: o_custkey = c_custkey
| fk/pk conjuncts: o_custkey = c_custkey
| runtime filters: RF001 <- c_custkey
-| mem-estimate=6.61MB mem-reservation=136.00MB
+| mem-estimate=6.61MB mem-reservation=8.50MB spill-buffer=512.00KB
| tuple-ids=2,1,0 row-size=108B cardinality=5757710
|
|--12:EXCHANGE [BROADCAST]
@@ -2065,7 +2065,7 @@ Per-Host Resources: mem-estimate=104.55MB mem-reservation=672.00MB
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF002 <- o_orderkey
-| mem-estimate=26.23MB mem-reservation=136.00MB
+| mem-estimate=26.23MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=2,1 row-size=66B cardinality=5757710
|
|--11:EXCHANGE [HASH(o_orderkey)]
@@ -2098,7 +2098,7 @@ Per-Host Resources: mem-estimate=88.00MB mem-reservation=0B
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=2 row-size=16B cardinality=6001215
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=1.31GB
+Per-Host Resource Reservation: Memory=121.12MB
Per-Host Resource Estimates: Memory=953.10MB
F07:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -2113,7 +2113,7 @@ PLAN-ROOT SINK
| tuple-ids=7 row-size=100B cardinality=100
|
F06:PLAN FRAGMENT [HASH(c_name,c_custkey,o_orderkey,o_orderdate,o_totalprice)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=120.82MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=120.82MB mem-reservation=68.00MB
09:TOP-N [LIMIT=100]
| order by: o_totalprice DESC, o_orderdate ASC
| mem-estimate=9.77KB mem-reservation=0B
@@ -2122,7 +2122,7 @@ Per-Host Resources: mem-estimate=120.82MB mem-reservation=528.00MB
16:AGGREGATE [FINALIZE]
| output: sum:merge(l_quantity)
| group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-| mem-estimate=60.40MB mem-reservation=264.00MB
+| mem-estimate=60.40MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=6 row-size=100B cardinality=575772
|
15:EXCHANGE [HASH(c_name,c_custkey,o_orderkey,o_orderdate,o_totalprice)]
@@ -2130,18 +2130,18 @@ Per-Host Resources: mem-estimate=120.82MB mem-reservation=528.00MB
| tuple-ids=6 row-size=100B cardinality=575772
|
F02:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
+Per-Host Resources: mem-estimate=161.56MB mem-reservation=53.12MB
08:AGGREGATE [STREAMING]
| output: sum(l_quantity)
| group by: c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
-| mem-estimate=60.40MB mem-reservation=0B
+| mem-estimate=60.40MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=6 row-size=100B cardinality=575772
|
07:HASH JOIN [LEFT SEMI JOIN, PARTITIONED]
| hash-table-id=00
| hash predicates: o_orderkey = l_orderkey
| runtime filters: RF000 <- l_orderkey
-| mem-estimate=671.79KB mem-reservation=136.00MB
+| mem-estimate=671.79KB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=2,1,0 row-size=108B cardinality=575772
|
|--F08:PLAN FRAGMENT [HASH(l_orderkey)] hosts=3 instances=6
@@ -2155,7 +2155,7 @@ Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
| | output: sum:merge(l_quantity)
| | group by: l_orderkey
| | having: sum(l_quantity) > 300
-| | mem-estimate=10.00MB mem-reservation=264.00MB
+| | mem-estimate=10.00MB mem-reservation=4.25MB spill-buffer=256.00KB
| | tuple-ids=4 row-size=24B cardinality=156344
| |
| 13:EXCHANGE [HASH(l_orderkey)]
@@ -2167,7 +2167,7 @@ Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
| 04:AGGREGATE [STREAMING]
| | output: sum(l_quantity)
| | group by: l_orderkey
-| | mem-estimate=39.36MB mem-reservation=0B
+| | mem-estimate=39.36MB mem-reservation=0B spill-buffer=2.00MB
| | tuple-ids=4 row-size=24B cardinality=1563438
| |
| 03:SCAN HDFS [tpch.lineitem, RANDOM]
@@ -2183,7 +2183,7 @@ Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
| hash predicates: o_custkey = c_custkey
| fk/pk conjuncts: o_custkey = c_custkey
| runtime filters: RF001 <- c_custkey
-| mem-estimate=6.61MB mem-reservation=136.00MB
+| mem-estimate=6.61MB mem-reservation=8.50MB spill-buffer=512.00KB
| tuple-ids=2,1,0 row-size=108B cardinality=5757710
|
|--F09:PLAN FRAGMENT [HASH(l_orderkey)] hosts=1 instances=2
@@ -2212,7 +2212,7 @@ Per-Host Resources: mem-estimate=161.56MB mem-reservation=816.00MB
| hash predicates: l_orderkey = o_orderkey
| fk/pk conjuncts: l_orderkey = o_orderkey
| runtime filters: RF002 <- o_orderkey
-| mem-estimate=13.11MB mem-reservation=136.00MB
+| mem-estimate=13.11MB mem-reservation=17.00MB spill-buffer=1.00MB
| tuple-ids=2,1 row-size=66B cardinality=5757710
|
|--F10:PLAN FRAGMENT [HASH(l_orderkey)] hosts=2 instances=4
@@ -2390,19 +2390,19 @@ from tpch_nested_parquet.customer c,
join c.c_orders o2 on o1.o_orderkey = o2.o_orderkey
order by o1.o_orderkey limit 100) v
---- PLAN
-Per-Host Resource Reservation: Memory=664.00MB
+Per-Host Resource Reservation: Memory=69.06MB
Per-Host Resource Estimates: Memory=344.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
tpch_nested_parquet.customer
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=344.00MB mem-reservation=664.00MB
+| Per-Host Resources: mem-estimate=344.00MB mem-reservation=69.06MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
09:AGGREGATE [FINALIZE]
| group by: c_name, o1.o_orderkey, o2.o_orderstatus
-| mem-estimate=128.00MB mem-reservation=264.00MB
+| mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=6 row-size=58B cardinality=1500000
|
01:SUBPLAN
@@ -2425,13 +2425,13 @@ PLAN-ROOT SINK
| |
| 06:AGGREGATE [FINALIZE]
| | group by: o1.o_orderkey, o2.o_orderstatus
-| | mem-estimate=128.00MB mem-reservation=264.00MB
+| | mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| | tuple-ids=3 row-size=24B cardinality=10
| |
| 05:HASH JOIN [INNER JOIN]
| | hash predicates: o1.o_orderkey = o2.o_orderkey
| | fk/pk conjuncts: assumed fk/pk
-| | mem-estimate=0B mem-reservation=136.00MB
+| | mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
| | tuple-ids=1,2 row-size=32B cardinality=10
| |
| |--04:UNNEST [c.c_orders o2]
@@ -2452,7 +2452,7 @@ PLAN-ROOT SINK
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=66B cardinality=150000
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=664.00MB
+Per-Host Resource Reservation: Memory=69.06MB
Per-Host Resource Estimates: Memory=472.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
tpch_nested_parquet.customer
@@ -2467,10 +2467,10 @@ PLAN-ROOT SINK
| tuple-ids=6 row-size=58B cardinality=1500000
|
F01:PLAN FRAGMENT [HASH(c_name,v.o_orderkey,v.o_orderstatus)] hosts=3 instances=3
-Per-Host Resources: mem-estimate=128.00MB mem-reservation=264.00MB
+Per-Host Resources: mem-estimate=128.00MB mem-reservation=34.00MB
11:AGGREGATE [FINALIZE]
| group by: c_name, v.o_orderkey, v.o_orderstatus
-| mem-estimate=128.00MB mem-reservation=264.00MB
+| mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=6 row-size=58B cardinality=1500000
|
10:EXCHANGE [HASH(c_name,v.o_orderkey,v.o_orderstatus)]
@@ -2478,10 +2478,10 @@ Per-Host Resources: mem-estimate=128.00MB mem-reservation=264.00MB
| tuple-ids=6 row-size=58B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=344.00MB mem-reservation=400.00MB
+Per-Host Resources: mem-estimate=344.00MB mem-reservation=35.06MB
09:AGGREGATE [STREAMING]
| group by: c_name, o1.o_orderkey, o2.o_orderstatus
-| mem-estimate=128.00MB mem-reservation=0B
+| mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=6 row-size=58B cardinality=1500000
|
01:SUBPLAN
@@ -2504,13 +2504,13 @@ Per-Host Resources: mem-estimate=344.00MB mem-reservation=400.00MB
| |
| 06:AGGREGATE [FINALIZE]
| | group by: o1.o_orderkey, o2.o_orderstatus
-| | mem-estimate=128.00MB mem-reservation=264.00MB
+| | mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| | tuple-ids=3 row-size=24B cardinality=10
| |
| 05:HASH JOIN [INNER JOIN]
| | hash predicates: o1.o_orderkey = o2.o_orderkey
| | fk/pk conjuncts: assumed fk/pk
-| | mem-estimate=0B mem-reservation=136.00MB
+| | mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
| | tuple-ids=1,2 row-size=32B cardinality=10
| |
| |--04:UNNEST [c.c_orders o2]
@@ -2531,7 +2531,7 @@ Per-Host Resources: mem-estimate=344.00MB mem-reservation=400.00MB
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=66B cardinality=150000
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=1.30GB
+Per-Host Resource Reservation: Memory=138.12MB
Per-Host Resource Estimates: Memory=944.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
tpch_nested_parquet.customer
@@ -2546,10 +2546,10 @@ PLAN-ROOT SINK
| tuple-ids=6 row-size=58B cardinality=1500000
|
F01:PLAN FRAGMENT [HASH(c_name,v.o_orderkey,v.o_orderstatus)] hosts=3 instances=6
-Per-Host Resources: mem-estimate=256.00MB mem-reservation=528.00MB
+Per-Host Resources: mem-estimate=256.00MB mem-reservation=68.00MB
11:AGGREGATE [FINALIZE]
| group by: c_name, v.o_orderkey, v.o_orderstatus
-| mem-estimate=128.00MB mem-reservation=264.00MB
+| mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=6 row-size=58B cardinality=1500000
|
10:EXCHANGE [HASH(c_name,v.o_orderkey,v.o_orderstatus)]
@@ -2557,10 +2557,10 @@ Per-Host Resources: mem-estimate=256.00MB mem-reservation=528.00MB
| tuple-ids=6 row-size=58B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=688.00MB mem-reservation=800.00MB
+Per-Host Resources: mem-estimate=688.00MB mem-reservation=70.12MB
09:AGGREGATE [STREAMING]
| group by: c_name, o1.o_orderkey, o2.o_orderstatus
-| mem-estimate=128.00MB mem-reservation=0B
+| mem-estimate=128.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=6 row-size=58B cardinality=1500000
|
01:SUBPLAN
@@ -2583,13 +2583,13 @@ Per-Host Resources: mem-estimate=688.00MB mem-reservation=800.00MB
| |
| 06:AGGREGATE [FINALIZE]
| | group by: o1.o_orderkey, o2.o_orderstatus
-| | mem-estimate=128.00MB mem-reservation=264.00MB
+| | mem-estimate=128.00MB mem-reservation=34.00MB spill-buffer=2.00MB
| | tuple-ids=3 row-size=24B cardinality=10
| |
| 05:HASH JOIN [INNER JOIN]
| | hash predicates: o1.o_orderkey = o2.o_orderkey
| | fk/pk conjuncts: assumed fk/pk
-| | mem-estimate=0B mem-reservation=136.00MB
+| | mem-estimate=0B mem-reservation=1.06MB spill-buffer=64.00KB
| | tuple-ids=1,2 row-size=32B cardinality=10
| |
| |--04:UNNEST [c.c_orders o2]
@@ -2619,13 +2619,13 @@ from tpch_nested_parquet.customer c,
row_number() over (order by o_orderpriority) rnum_priority
from c.c_orders) v;
---- PLAN
-Per-Host Resource Reservation: Memory=192.00MB
-Per-Host Resource Estimates: Memory=136.00MB
+Per-Host Resource Reservation: Memory=48.00MB
+Per-Host Resource Estimates: Memory=94.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
tpch_nested_parquet.customer
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=136.00MB mem-reservation=192.00MB
+| Per-Host Resources: mem-estimate=94.00MB mem-reservation=48.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -2646,36 +2646,36 @@ PLAN-ROOT SINK
| | functions: row_number()
| | order by: o_orderpriority ASC
| | window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| | mem-estimate=0B mem-reservation=16.00MB
+| | mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| | tuple-ids=12,7 row-size=164B cardinality=10
| |
| 08:SORT
| | order by: o_orderpriority ASC
-| | mem-estimate=16.00MB mem-reservation=48.00MB
+| | mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| | tuple-ids=12 row-size=156B cardinality=10
| |
| 07:ANALYTIC
| | functions: row_number()
| | order by: o_orderdate ASC
| | window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| | mem-estimate=0B mem-reservation=16.00MB
+| | mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| | tuple-ids=10,6 row-size=156B cardinality=10
| |
| 06:SORT
| | order by: o_orderdate ASC
-| | mem-estimate=16.00MB mem-reservation=48.00MB
+| | mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| | tuple-ids=10 row-size=148B cardinality=10
| |
| 05:ANALYTIC
| | functions: row_number()
| | order by: o_totalprice ASC
| | window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| | mem-estimate=0B mem-reservation=16.00MB
+| | mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| | tuple-ids=8,5 row-size=148B cardinality=10
| |
| 04:SORT
| | order by: o_totalprice ASC
-| | mem-estimate=16.00MB mem-reservation=48.00MB
+| | mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| | tuple-ids=8 row-size=140B cardinality=10
| |
| 03:UNNEST [c.c_orders]
@@ -2691,8 +2691,8 @@ PLAN-ROOT SINK
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=254B cardinality=150000
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=192.00MB
-Per-Host Resource Estimates: Memory=136.00MB
+Per-Host Resource Reservation: Memory=48.00MB
+Per-Host Resource Estimates: Memory=94.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
tpch_nested_parquet.customer
@@ -2706,7 +2706,7 @@ PLAN-ROOT SINK
| tuple-ids=12,7,0 row-size=418B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3
-Per-Host Resources: mem-estimate=136.00MB mem-reservation=192.00MB
+Per-Host Resources: mem-estimate=94.00MB mem-reservation=48.00MB
01:SUBPLAN
| mem-estimate=0B mem-reservation=0B
| tuple-ids=12,7,0 row-size=418B cardinality=1500000
@@ -2724,36 +2724,36 @@ Per-Host Resources: mem-estimate=136.00MB mem-reservation=192.00MB
| | functions: row_number()
| | order by: o_orderpriority ASC
| | window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| | mem-estimate=0B mem-reservation=16.00MB
+| | mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| | tuple-ids=12,7 row-size=164B cardinality=10
| |
| 08:SORT
| | order by: o_orderpriority ASC
-| | mem-estimate=16.00MB mem-reservation=48.00MB
+| | mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| | tuple-ids=12 row-size=156B cardinality=10
| |
| 07:ANALYTIC
| | functions: row_number()
| | order by: o_orderdate ASC
| | window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| | mem-estimate=0B mem-reservation=16.00MB
+| | mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| | tuple-ids=10,6 row-size=156B cardinality=10
| |
| 06:SORT
| | order by: o_orderdate ASC
-| | mem-estimate=16.00MB mem-reservation=48.00MB
+| | mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| | tuple-ids=10 row-size=148B cardinality=10
| |
| 05:ANALYTIC
| | functions: row_number()
| | order by: o_totalprice ASC
| | window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| | mem-estimate=0B mem-reservation=16.00MB
+| | mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| | tuple-ids=8,5 row-size=148B cardinality=10
| |
| 04:SORT
| | order by: o_totalprice ASC
-| | mem-estimate=16.00MB mem-reservation=48.00MB
+| | mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| | tuple-ids=8 row-size=140B cardinality=10
| |
| 03:UNNEST [c.c_orders]
@@ -2769,8 +2769,8 @@ Per-Host Resources: mem-estimate=136.00MB mem-reservation=192.00MB
mem-estimate=88.00MB mem-reservation=0B
tuple-ids=0 row-size=254B cardinality=150000
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=384.00MB
-Per-Host Resource Estimates: Memory=272.00MB
+Per-Host Resource Reservation: Memory=96.00MB
+Per-Host Resource Estimates: Memory=188.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
tpch_nested_parquet.customer
@@ -2784,7 +2784,7 @@ PLAN-ROOT SINK
| tuple-ids=12,7,0 row-size=418B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=6
-Per-Host Resources: mem-estimate=272.00MB mem-reservation=384.00MB
+Per-Host Resources: mem-estimate=188.00MB mem-reservation=96.00MB
01:SUBPLAN
| mem-estimate=0B mem-reservation=0B
| tuple-ids=12,7,0 row-size=418B cardinality=1500000
@@ -2802,36 +2802,36 @@ Per-Host Resources: mem-estimate=272.00MB mem-reservation=384.00MB
| | functions: row_number()
| | order by: o_orderpriority ASC
| | window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| | mem-estimate=0B mem-reservation=16.00MB
+| | mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| | tuple-ids=12,7 row-size=164B cardinality=10
| |
| 08:SORT
| | order by: o_orderpriority ASC
-| | mem-estimate=16.00MB mem-reservation=48.00MB
+| | mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| | tuple-ids=12 row-size=156B cardinality=10
| |
| 07:ANALYTIC
| | functions: row_number()
| | order by: o_orderdate ASC
| | window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| | mem-estimate=0B mem-reservation=16.00MB
+| | mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| | tuple-ids=10,6 row-size=156B cardinality=10
| |
| 06:SORT
| | order by: o_orderdate ASC
-| | mem-estimate=16.00MB mem-reservation=48.00MB
+| | mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| | tuple-ids=10 row-size=148B cardinality=10
| |
| 05:ANALYTIC
| | functions: row_number()
| | order by: o_totalprice ASC
| | window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| | mem-estimate=0B mem-reservation=16.00MB
+| | mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| | tuple-ids=8,5 row-size=148B cardinality=10
| |
| 04:SORT
| | order by: o_totalprice ASC
-| | mem-estimate=16.00MB mem-reservation=48.00MB
+| | mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| | tuple-ids=8 row-size=140B cardinality=10
| |
| 03:UNNEST [c.c_orders]
@@ -2861,11 +2861,11 @@ join (
) v2 on v2.k3 = t2.o_orderkey
) v1 on v1.k3 = t1.o_orderkey
---- PLAN
-Per-Host Resource Reservation: Memory=272.00MB
+Per-Host Resource Reservation: Memory=68.00MB
Per-Host Resource Estimates: Memory=172.59MB
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=172.59MB mem-reservation=272.00MB
+| Per-Host Resources: mem-estimate=172.59MB mem-reservation=68.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -2873,21 +2873,21 @@ PLAN-ROOT SINK
| hash predicates: t1.o_orderkey = t3.o_orderkey
| fk/pk conjuncts: t1.o_orderkey = t3.o_orderkey
| runtime filters: RF000 <- t3.o_orderkey
-| mem-estimate=37.77MB mem-reservation=136.00MB
+| mem-estimate=37.77MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
|
|--05:HASH JOIN [INNER JOIN]
| | hash predicates: t2.o_orderkey = t3.o_orderkey
| | fk/pk conjuncts: t2.o_orderkey = t3.o_orderkey
| | runtime filters: RF001 <- t3.o_orderkey
-| | mem-estimate=25.18MB mem-reservation=136.00MB
+| | mem-estimate=25.18MB mem-reservation=34.00MB spill-buffer=2.00MB
| | tuple-ids=1,2,3 row-size=24B cardinality=1500000
| |
| |--04:HASH JOIN [INNER JOIN]
| | | hash predicates: t3.o_orderkey = t4.o_orderkey
| | | fk/pk conjuncts: t3.o_orderkey = t4.o_orderkey
| | | runtime filters: RF002 <- t4.o_orderkey
-| | | mem-estimate=12.59MB mem-reservation=136.00MB
+| | | mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
| | | tuple-ids=2,3 row-size=16B cardinality=1500000
| | |
| | |--03:SCAN HDFS [tpch_parquet.orders t4]
@@ -2925,7 +2925,7 @@ PLAN-ROOT SINK
mem-estimate=40.00MB mem-reservation=0B
tuple-ids=0 row-size=191B cardinality=1500000
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=408.00MB
+Per-Host Resource Reservation: Memory=59.50MB
Per-Host Resource Estimates: Memory=216.65MB
F05:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -2938,12 +2938,12 @@ PLAN-ROOT SINK
| tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=2
-Per-Host Resources: mem-estimate=77.77MB mem-reservation=136.00MB
+Per-Host Resources: mem-estimate=77.77MB mem-reservation=34.00MB
06:HASH JOIN [INNER JOIN, BROADCAST]
| hash predicates: t1.o_orderkey = t3.o_orderkey
| fk/pk conjuncts: t1.o_orderkey = t3.o_orderkey
| runtime filters: RF000 <- t3.o_orderkey
-| mem-estimate=37.77MB mem-reservation=136.00MB
+| mem-estimate=37.77MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
|
|--10:EXCHANGE [BROADCAST]
@@ -2951,19 +2951,19 @@ Per-Host Resources: mem-estimate=77.77MB mem-reservation=136.00MB
| | tuple-ids=1,2,3 row-size=24B cardinality=1500000
| |
| F04:PLAN FRAGMENT [HASH(t3.o_orderkey)] hosts=2 instances=2
-| Per-Host Resources: mem-estimate=18.88MB mem-reservation=272.00MB
+| Per-Host Resources: mem-estimate=18.88MB mem-reservation=25.50MB
| 05:HASH JOIN [INNER JOIN, PARTITIONED]
| | hash predicates: t2.o_orderkey = t3.o_orderkey
| | fk/pk conjuncts: t2.o_orderkey = t3.o_orderkey
| | runtime filters: RF001 <- t3.o_orderkey
-| | mem-estimate=12.59MB mem-reservation=136.00MB
+| | mem-estimate=12.59MB mem-reservation=17.00MB spill-buffer=1.00MB
| | tuple-ids=1,2,3 row-size=24B cardinality=1500000
| |
| |--04:HASH JOIN [INNER JOIN, PARTITIONED]
| | | hash predicates: t3.o_orderkey = t4.o_orderkey
| | | fk/pk conjuncts: t3.o_orderkey = t4.o_orderkey
| | | runtime filters: RF002 <- t4.o_orderkey
-| | | mem-estimate=6.29MB mem-reservation=136.00MB
+| | | mem-estimate=6.29MB mem-reservation=8.50MB spill-buffer=512.00KB
| | | tuple-ids=2,3 row-size=16B cardinality=1500000
| | |
| | |--08:EXCHANGE [HASH(t4.o_orderkey)]
@@ -3019,7 +3019,7 @@ Per-Host Resources: mem-estimate=77.77MB mem-reservation=136.00MB
mem-estimate=40.00MB mem-reservation=0B
tuple-ids=0 row-size=191B cardinality=1500000
---- PARALLELPLANS
-Per-Host Resource Reservation: Memory=816.00MB
+Per-Host Resource Reservation: Memory=93.50MB
Per-Host Resource Estimates: Memory=414.41MB
F05:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
@@ -3032,13 +3032,13 @@ PLAN-ROOT SINK
| tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
|
F00:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
-Per-Host Resources: mem-estimate=155.53MB mem-reservation=272.00MB
+Per-Host Resources: mem-estimate=155.53MB mem-reservation=68.00MB
06:HASH JOIN [INNER JOIN, BROADCAST]
| hash-table-id=00
| hash predicates: t1.o_orderkey = t3.o_orderkey
| fk/pk conjuncts: t1.o_orderkey = t3.o_orderkey
| runtime filters: RF000 <- t3.o_orderkey
-| mem-estimate=37.77MB mem-reservation=136.00MB
+| mem-estimate=37.77MB mem-reservation=34.00MB spill-buffer=2.00MB
| tuple-ids=0,1,2,3 row-size=215B cardinality=1500000
|
|--F06:PLAN FRAGMENT [RANDOM] hosts=2 instances=4
@@ -3053,13 +3053,13 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=272.00MB
| | tuple-ids=1,2,3 row-size=24B cardinality=1500000
| |
| F04:PLAN FRAGMENT [HASH(t3.o_orderkey)] hosts=2 instances=4
-| Per-Host Resources: mem-estimate=18.88MB mem-reservation=544.00MB
+| Per-Host Resources: mem-estimate=18.88MB mem-reservation=25.50MB
| 05:HASH JOIN [INNER JOIN, PARTITIONED]
| | hash-table-id=01
| | hash predicates: t2.o_orderkey = t3.o_orderkey
| | fk/pk conjuncts: t2.o_orderkey = t3.o_orderkey
| | runtime filters: RF001 <- t3.o_orderkey
-| | mem-estimate=6.29MB mem-reservation=136.00MB
+| | mem-estimate=6.29MB mem-reservation=8.50MB spill-buffer=512.00KB
| | tuple-ids=1,2,3 row-size=24B cardinality=1500000
| |
| |--F07:PLAN FRAGMENT [HASH(t3.o_orderkey)] hosts=2 instances=4
@@ -3074,7 +3074,7 @@ Per-Host Resources: mem-estimate=155.53MB mem-reservation=272.00MB
| | | hash predicates: t3.o_orderkey = t4.o_orderkey
| | | fk/pk conjuncts: t3.o_orderkey = t4.o_orderkey
| | | runtime filters: RF002 <- t4.o_orderkey
-| | | mem-estimate=3.15MB mem-reservation=136.00MB
+| | | mem-estimate=3.15MB mem-reservation=4.25MB spill-buffer=256.00KB
| | | tuple-ids=2,3 row-size=16B cardinality=1500000
| | |
| | |--F08:PLAN FRAGMENT [HASH(t3.o_orderkey)] hosts=2 instances=4
@@ -3387,12 +3387,12 @@ sum(smallint_col) over (partition by tinyint_col order by smallint_col
rows between 1 following and 2 following)
from functional.alltypesagg
---- PLAN
-Per-Host Resource Reservation: Memory=72.00MB
-Per-Host Resource Estimates: Memory=24.00MB
+Per-Host Resource Reservation: Memory=18.00MB
+Per-Host Resource Estimates: Memory=18.00MB
Codegen disabled by planner
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=24.00MB mem-reservation=72.00MB
+| Per-Host Resources: mem-estimate=18.00MB mem-reservation=18.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -3401,7 +3401,7 @@ PLAN-ROOT SINK
| partition by: tinyint_col
| order by: smallint_col ASC
| window: ROWS BETWEEN 1 FOLLOWING AND 2 FOLLOWING
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=5,2,3,4 row-size=27B cardinality=11000
|
03:ANALYTIC
@@ -3409,7 +3409,7 @@ PLAN-ROOT SINK
| partition by: tinyint_col
| order by: smallint_col ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=5,2,3 row-size=19B cardinality=11000
|
02:ANALYTIC
@@ -3417,12 +3417,12 @@ PLAN-ROOT SINK
| partition by: tinyint_col
| order by: smallint_col ASC
| window: RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=5,2 row-size=11B cardinality=11000
|
01:SORT
| order by: tinyint_col ASC NULLS FIRST, smallint_col ASC
-| mem-estimate=8.00MB mem-reservation=24.00MB
+| mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
| tuple-ids=5 row-size=3B cardinality=11000
|
00:SCAN HDFS [functional.alltypesagg]
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test b/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test
index 3e5fb05..66f8167 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test
@@ -2,14 +2,14 @@
select * from functional.alltypes order by random()
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+| Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:SORT
| order by: random() ASC
| materialized: random()
-| mem-estimate=16.00MB mem-reservation=48.00MB
+| mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=105B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -24,14 +24,14 @@ PLAN-ROOT SINK
select * from functional.alltypes order by abs(id) + abs(id)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+| Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:SORT
| order by: abs(id) + abs(id) ASC
| materialized: abs(id) + abs(id)
-| mem-estimate=16.00MB mem-reservation=48.00MB
+| mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=105B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -46,13 +46,13 @@ PLAN-ROOT SINK
select * from functional.alltypes order by tinyint_col + 1
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+| Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:SORT
| order by: tinyint_col + 1 ASC
-| mem-estimate=16.00MB mem-reservation=48.00MB
+| mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=97B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -68,14 +68,14 @@ select * from functional.alltypes
order by dayofweek(timestamp_col), true, id + 1, string_col = date_string_col, id = tinyint_col
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+| Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:SORT
| order by: dayofweek(timestamp_col) ASC, TRUE ASC, id + 1 ASC, string_col = date_string_col ASC, id = tinyint_col ASC
| materialized: dayofweek(timestamp_col), string_col = date_string_col
-| mem-estimate=16.00MB mem-reservation=48.00MB
+| mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=102B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -91,7 +91,7 @@ select last_value(id) over (order by to_date(timestamp_col), bool_col is null)
from functional.alltypes
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=144.00MB mem-reservation=64.00MB
+| Per-Host Resources: mem-estimate=130.00MB mem-reservation=16.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -99,13 +99,13 @@ PLAN-ROOT SINK
| functions: last_value(id)
| order by: to_date(timestamp_col) ASC, bool_col IS NULL ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=3,2 row-size=41B cardinality=7300
|
01:SORT
| order by: to_date(timestamp_col) ASC, bool_col IS NULL ASC
| materialized: to_date(timestamp_col)
-| mem-estimate=16.00MB mem-reservation=48.00MB
+| mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=3 row-size=37B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -143,14 +143,14 @@ PLAN-ROOT SINK
select * from functional.alltypes order by TestFn(double_col)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+| Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:SORT
| order by: default.testfn(double_col) ASC
| materialized: default.testfn(double_col)
-| mem-estimate=16.00MB mem-reservation=48.00MB
+| mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=101B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -165,14 +165,14 @@ PLAN-ROOT SINK
select concat(date_string_col, string_col) c from functional.alltypes order by c
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=144.00MB mem-reservation=48.00MB
+| Per-Host Resources: mem-estimate=130.00MB mem-reservation=12.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:SORT
| order by: concat(date_string_col, string_col) ASC
| materialized: concat(date_string_col, string_col)
-| mem-estimate=16.00MB mem-reservation=48.00MB
+| mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=16B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
[03/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/sorter.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/sorter.h b/be/src/runtime/sorter.h
index 80c5558..cafab72 100644
--- a/be/src/runtime/sorter.h
+++ b/be/src/runtime/sorter.h
@@ -20,7 +20,7 @@
#include <deque>
-#include "runtime/buffered-block-mgr.h"
+#include "runtime/bufferpool/buffer-pool.h"
#include "util/tuple-row-compare.h"
namespace impala {
@@ -31,8 +31,7 @@ class RowBatch;
/// Sorter contains the external sort implementation. Its purpose is to sort arbitrarily
/// large input data sets with a fixed memory budget by spilling data to disk if
-/// necessary. BufferedBlockMgr is used to allocate and manage blocks of data to be
-/// sorted.
+/// necessary.
//
/// The client API for Sorter is as follows:
/// AddBatch() is used to add input rows to be sorted. Multiple tuples in an input row are
@@ -52,20 +51,20 @@ class RowBatch;
/// GetNext() is used to retrieve sorted rows. It can be called multiple times.
/// AddBatch()/AddBatchNoSpill(), InputDone() and GetNext() must be called in that order.
//
-/// Batches of input rows are collected into a sequence of pinned BufferedBlockMgr blocks
-/// called a run. The maximum size of a run is determined by the number of blocks that
+/// Batches of input rows are collected into a sequence of pinned BufferPool pages
+/// called a run. The maximum size of a run is determined by the number of pages that
/// can be pinned by the Sorter. After the run is full, it is sorted in memory, unpinned
/// and the next run is constructed. The variable-length column data (e.g. string slots)
-/// in the materialized sort tuples are stored in a separate sequence of blocks from the
-/// tuples themselves. When the blocks containing tuples in a run are unpinned, the
+/// in the materialized sort tuples are stored in a separate sequence of pages from the
+/// tuples themselves. When the pages containing tuples in a run are unpinned, the
/// var-len slot pointers are converted to offsets from the start of the first var-len
-/// data block. When a block is read back, these offsets are converted back to pointers.
+/// data page. When a page is read back, these offsets are converted back to pointers.
/// The in-memory sorter sorts the fixed-length tuples in-place. The output rows have the
/// same schema as the materialized sort tuples.
//
/// After the input is consumed, the sorter is left with one or more sorted runs. If
/// there are multiple runs, the runs are merged using SortedRunMerger. At least one
-/// block per run (two if there are var-length slots) must be pinned in memory during
+/// page per run (two if there are var-length slots) must be pinned in memory during
/// a merge, so multiple merges may be necessary if the number of runs is too large.
/// First a series of intermediate merges are performed, until the number of runs is
/// small enough to do a single final merge that returns batches of sorted rows to the
@@ -73,7 +72,7 @@ class RowBatch;
///
/// If there is a single sorted run (i.e. no merge required), only tuple rows are
/// copied into the output batch supplied by GetNext(), and the data itself is left in
-/// pinned blocks held by the sorter.
+/// pinned pages held by the sorter.
///
/// When merges are performed, one input batch is created to hold tuple rows for each
/// input run, and one batch is created to hold deep copied rows (i.e. ptrs + data) from
@@ -84,7 +83,7 @@ class RowBatch;
/// During a merge, one row batch is created for each input run, and one batch is created
/// for the output of the merge (if is not the final merge). It is assumed that the memory
/// for these batches have already been accounted for in the memory budget for the sort.
-/// That is, the memory for these batches does not come out of the block buffer manager.
+/// That is, the memory for these batches does not come out of the buffer pool.
//
/// TODO: Not necessary to actually copy var-len data - instead take ownership of the
/// var-length data in the input batch. Copying can be deferred until a run is unpinned.
@@ -96,17 +95,23 @@ class Sorter {
/// 'sort_tuple_exprs' are the slot exprs used to materialize the tuples to be
/// sorted. 'compare_less_than' is a comparator for the sort tuples (returns true if
/// lhs < rhs). 'merge_batch_size_' is the size of the batches created to provide rows
- /// to the merger and retrieve rows from an intermediate merger. 'enable_spilling'
- /// should be set to false to reduce the number of requested buffers if the caller will
- /// use AddBatchNoSpill().
+ /// to the merger and retrieve rows from an intermediate merger. 'node_id' is the ID of
+ /// the exec node using the sorter for error reporting. 'enable_spilling' should be set
+ /// to false to reduce the number of requested buffers if the caller will use
+ /// AddBatchNoSpill().
+ ///
+ /// The Sorter assumes that it has exclusive use of the client's
+ /// reservations for sorting, and may increase the size of the client's reservation.
+ /// The caller is responsible for ensuring that the minimum reservation (returned from
+ /// ComputeMinReservation()) is available.
Sorter(const TupleRowComparator& compare_less_than,
const std::vector<ScalarExpr*>& sort_tuple_exprs, RowDescriptor* output_row_desc,
- MemTracker* mem_tracker, RuntimeProfile* profile, RuntimeState* state,
- bool enable_spilling = true);
-
+ MemTracker* mem_tracker, BufferPool::ClientHandle* client, int64_t page_len,
+ RuntimeProfile* profile, RuntimeState* state, int node_id,
+ bool enable_spilling);
~Sorter();
- /// Initial set-up of the sorter for execution. Registers with the block mgr.
+ /// Initial set-up of the sorter for execution.
/// The evaluators for 'sort_tuple_exprs_' will be created and stored in 'obj_pool'.
/// All allocation from the evaluators will be from 'expr_mem_pool'.
Status Prepare(ObjectPool* obj_pool, MemPool* expr_mem_pool) WARN_UNUSED_RESULT;
@@ -143,24 +148,29 @@ class Sorter {
/// Close the Sorter and free resources.
void Close(RuntimeState* state);
+ /// Compute the minimum amount of buffer memory in bytes required to execute a
+ /// sort with the current sorter.
+ int64_t ComputeMinReservation();
+
private:
+ class Page;
class Run;
class TupleIterator;
class TupleSorter;
/// Create a SortedRunMerger from sorted runs in 'sorted_runs_' and assign it to
/// 'merger_'. Attempts to set up merger with 'max_num_runs' runs but may set it
- /// up with fewer if it cannot pin the initial blocks of all of the runs. Fails
+ /// up with fewer if it cannot pin the initial pages of all of the runs. Fails
/// if it cannot merge at least two runs. The runs to be merged are removed from
/// 'sorted_runs_'. The Sorter sets the 'deep_copy_input' flag to true for the
- /// merger, since the blocks containing input run data will be deleted as input
+ /// merger, since the pages containing input run data will be deleted as input
/// runs are read.
Status CreateMerger(int max_num_runs) WARN_UNUSED_RESULT;
/// Repeatedly replaces multiple smaller runs in sorted_runs_ with a single larger
/// merged run until there are few enough runs to be merged with a single merger.
/// Returns when 'merger_' is set up to merge the final runs.
- /// At least 1 (2 if var-len slots) block from each sorted run must be pinned for
+ /// At least 1 (2 if var-len slots) page from each sorted run must be pinned for
/// a merge. If the number of sorted runs is too large, merge sets of smaller runs
/// into large runs until a final merge can be performed. An intermediate row batch
/// containing deep copied rows is used for the output of each intermediate merge.
@@ -177,6 +187,9 @@ class Sorter {
/// Helper that cleans up all runs in the sorter.
void CleanupAllRuns();
+ /// ID of the ExecNode that owns the sorter, used for error reporting.
+ const int node_id_;
+
/// Runtime state instance used to check for cancellation. Not owned.
RuntimeState* const state_;
@@ -184,11 +197,11 @@ class Sorter {
const TupleRowComparator& compare_less_than_;
boost::scoped_ptr<TupleSorter> in_mem_tuple_sorter_;
- /// Block manager object used to allocate, pin and release runs. Not owned by Sorter.
- BufferedBlockMgr* block_mgr_;
+ /// Client used to allocate pages from the buffer pool. Not owned.
+ BufferPool::ClientHandle* const buffer_pool_client_;
- /// Handle to block mgr to make allocations from.
- BufferedBlockMgr::Client* block_mgr_client_;
+ /// The length of page to use.
+ const int64_t page_len_;
/// True if the tuples to be sorted have var-length slots.
bool has_var_len_slots_;
@@ -211,7 +224,7 @@ class Sorter {
/// BEGIN: Members that must be Reset()
/// The current unsorted run that is being collected. Is sorted and added to
- /// sorted_runs_ after it is full (i.e. number of blocks allocated == max available
+ /// sorted_runs_ after it is full (i.e. number of pages allocated == max available
/// buffers) or after the input is complete. Owned and placed in obj_pool_.
/// When it is added to sorted_runs_, it is set to NULL.
Run* unsorted_run_;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/test-env.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/test-env.cc b/be/src/runtime/test-env.cc
index 37b4363..23dfa4c 100644
--- a/be/src/runtime/test-env.cc
+++ b/be/src/runtime/test-env.cc
@@ -20,7 +20,6 @@
#include <limits>
#include <memory>
-#include "runtime/buffered-block-mgr.h"
#include "runtime/query-exec-mgr.h"
#include "runtime/tmp-file-mgr.h"
#include "runtime/query-state.h"
@@ -38,8 +37,8 @@ scoped_ptr<MetricGroup> TestEnv::static_metrics_;
TestEnv::TestEnv()
: have_tmp_file_mgr_args_(false),
- buffer_pool_min_buffer_len_(-1),
- buffer_pool_capacity_(-1) {}
+ buffer_pool_min_buffer_len_(64 * 1024),
+ buffer_pool_capacity_(0) {}
Status TestEnv::Init() {
if (static_metrics_ == NULL) {
@@ -59,9 +58,7 @@ Status TestEnv::Init() {
} else {
RETURN_IF_ERROR(tmp_file_mgr()->Init(metrics()));
}
- if (buffer_pool_min_buffer_len_ != -1 && buffer_pool_capacity_ != -1) {
- exec_env_->InitBufferPool(buffer_pool_min_buffer_len_, buffer_pool_capacity_);
- }
+ exec_env_->InitBufferPool(buffer_pool_min_buffer_len_, buffer_pool_capacity_);
return Status::OK();
}
@@ -88,6 +85,7 @@ void TestEnv::TearDownQueries() {
for (RuntimeState* runtime_state : runtime_states_) runtime_state->ReleaseResources();
runtime_states_.clear();
for (QueryState* query_state : query_states_) {
+ query_state->ReleaseInitialReservationRefcount();
exec_env_->query_exec_mgr()->ReleaseQueryState(query_state);
}
query_states_.clear();
@@ -137,17 +135,4 @@ Status TestEnv::CreateQueryState(
*runtime_state = rs;
return Status::OK();
}
-
-Status TestEnv::CreateQueryStateWithBlockMgr(int64_t query_id, int max_buffers,
- int block_size, const TQueryOptions* query_options, RuntimeState** runtime_state) {
- RETURN_IF_ERROR(CreateQueryState(query_id, query_options, runtime_state));
-
- shared_ptr<BufferedBlockMgr> mgr;
- RETURN_IF_ERROR(BufferedBlockMgr::Create(*runtime_state,
- (*runtime_state)->query_state()->query_mem_tracker(),
- (*runtime_state)->runtime_profile(), tmp_file_mgr(),
- CalculateMemLimit(max_buffers, block_size), block_size, &mgr));
- (*runtime_state)->set_block_mgr(mgr);
- return Status::OK();
-}
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/test-env.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/test-env.h b/be/src/runtime/test-env.h
index f314452..e721510 100644
--- a/be/src/runtime/test-env.h
+++ b/be/src/runtime/test-env.h
@@ -55,13 +55,8 @@ class TestEnv {
Status CreateQueryState(
int64_t query_id, const TQueryOptions* query_options, RuntimeState** runtime_state);
- /// Same as CreateQueryState() but also creates a BufferedBlockMgr with the provided
- /// parameters. If 'max_buffers' is -1, there is no limit, otherwise the limit is
- /// max_buffers * block_size.
- Status CreateQueryStateWithBlockMgr(int64_t query_id, int max_buffers, int block_size,
- const TQueryOptions* query_options, RuntimeState** runtime_state);
- /// Destroy all query states and associated RuntimeStates, BufferedBlockMgrs,
- /// etc, that were created since the last TearDownQueries() call.
+ /// Destroy all query states and associated RuntimeStates, etc, that were created since
+ /// the last TearDownQueries() call.
void TearDownQueries();
/// Calculate memory limit accounting for overflow and negative values.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/tmp-file-mgr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr-test.cc b/be/src/runtime/tmp-file-mgr-test.cc
index c94ba1d..343ec93 100644
--- a/be/src/runtime/tmp-file-mgr-test.cc
+++ b/be/src/runtime/tmp-file-mgr-test.cc
@@ -145,6 +145,12 @@ class TmpFileMgrTest : public ::testing::Test {
return bytes_allocated;
}
+ /// Helpers to call WriteHandle methods.
+ void Cancel(TmpFileMgr::WriteHandle* handle) { handle->Cancel(); }
+ void WaitForWrite(TmpFileMgr::WriteHandle* handle) {
+ handle->WaitForWrite();
+ }
+
// Write callback, which signals 'cb_cv_' and increments 'cb_counter_'.
void SignalCallback(Status write_status) {
{
@@ -481,8 +487,8 @@ TEST_F(TmpFileMgrTest, TestEncryptionDuringCancellation) {
string file_path = handle->TmpFilePath();
// Cancel the write - prior to the IMPALA-4820 fix decryption could race with the write.
- handle->Cancel();
- handle->WaitForWrite();
+ Cancel(handle.get());
+ WaitForWrite(handle.get());
ASSERT_OK(file_group.RestoreData(move(handle), data_mem_range));
WaitForCallbacks(1);
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/tmp-file-mgr.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr.h b/be/src/runtime/tmp-file-mgr.h
index c71c370..ba7210d 100644
--- a/be/src/runtime/tmp-file-mgr.h
+++ b/be/src/runtime/tmp-file-mgr.h
@@ -281,21 +281,9 @@ class TmpFileMgr {
DCHECK(read_range_ == nullptr);
}
- /// Cancels any in-flight writes or reads. Reads are cancelled synchronously and
- /// writes are cancelled asynchronously. After Cancel() is called, writes are not
- /// retried. The write callback may be called with a CANCELLED status (unless
- /// it succeeded or encountered a different error first).
- /// TODO: IMPALA-3200: make this private once BufferedBlockMgr doesn't need it.
- void Cancel();
-
/// Cancel any in-flight read synchronously.
void CancelRead();
- /// Blocks until the write completes either successfully or unsuccessfully.
- /// May return before the write callback has been called.
- /// TODO: IMPALA-3200: make this private once BufferedBlockMgr doesn't need it.
- void WaitForWrite();
-
/// Path of temporary file backing the block. Intended for use in testing.
/// Returns empty string if no backing file allocated.
std::string TmpFilePath() const;
@@ -307,6 +295,7 @@ class TmpFileMgr {
private:
friend class FileGroup;
+ friend class TmpFileMgrTest;
WriteHandle(RuntimeProfile::Counter* encryption_timer, WriteDoneCallback cb);
@@ -327,6 +316,16 @@ class TmpFileMgr {
/// then calls 'cb_'.
void WriteComplete(const Status& write_status);
+ /// Cancels any in-flight writes or reads. Reads are cancelled synchronously and
+ /// writes are cancelled asynchronously. After Cancel() is called, writes are not
+ /// retried. The write callback may be called with a CANCELLED status (unless
+ /// it succeeded or encountered a different error first).
+ void Cancel();
+
+ /// Blocks until the write completes either successfully or unsuccessfully.
+ /// May return before the write callback has been called.
+ void WaitForWrite();
+
/// Encrypts the data in 'buffer' in-place and computes 'hash_'.
Status EncryptAndHash(MemRange buffer) WARN_UNUSED_RESULT;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/service/client-request-state.cc
----------------------------------------------------------------------
diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc
index 6be04f6..bf0f9b4 100644
--- a/be/src/service/client-request-state.cc
+++ b/be/src/service/client-request-state.cc
@@ -399,9 +399,9 @@ Status ClientRequestState::ExecQueryOrDmlRequest(
ss << query_exec_request.per_host_mem_estimate;
summary_profile_.AddInfoString(PER_HOST_MEM_KEY, ss.str());
}
- if (query_exec_request.__isset.per_host_min_reservation) {
+ if (query_exec_request.query_ctx.__isset.per_host_min_reservation) {
stringstream ss;
- ss << query_exec_request.per_host_min_reservation;
+ ss << query_exec_request.query_ctx.per_host_min_reservation;
summary_profile_.AddInfoString(PER_HOST_MEMORY_RESERVATION_KEY, ss.str());
}
if (!query_exec_request.query_ctx.__isset.parent_query_id &&
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/service/query-options.cc
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 8dcd7af..c123902 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -261,10 +261,10 @@ Status impala::SetQueryOption(const string& key, const string& value,
case TImpalaQueryOptions::QUERY_TIMEOUT_S:
query_options->__set_query_timeout_s(atoi(value.c_str()));
break;
- case TImpalaQueryOptions::MAX_BLOCK_MGR_MEMORY: {
+ case TImpalaQueryOptions::BUFFER_POOL_LIMIT: {
int64_t mem;
- RETURN_IF_ERROR(ParseMemValue(value, "block mgr memory limit", &mem));
- query_options->__set_max_block_mgr_memory(mem);
+ RETURN_IF_ERROR(ParseMemValue(value, "buffer pool limit", &mem));
+ query_options->__set_buffer_pool_limit(mem);
break;
}
case TImpalaQueryOptions::APPX_COUNT_DISTINCT: {
@@ -505,6 +505,28 @@ Status impala::SetQueryOption(const string& key, const string& value,
query_options->__set_disable_codegen_rows_threshold(val);
break;
}
+ case TImpalaQueryOptions::DEFAULT_SPILLABLE_BUFFER_SIZE: {
+ int64_t buffer_size_bytes;
+ RETURN_IF_ERROR(
+ ParseMemValue(value, "Spillable buffer size", &buffer_size_bytes));
+ if (!BitUtil::IsPowerOf2(buffer_size_bytes)) {
+ return Status(
+ Substitute("Buffer size must be a power of two: $0", buffer_size_bytes));
+ }
+ query_options->__set_default_spillable_buffer_size(buffer_size_bytes);
+ break;
+ }
+ case TImpalaQueryOptions::MIN_SPILLABLE_BUFFER_SIZE: {
+ int64_t buffer_size_bytes;
+ RETURN_IF_ERROR(
+ ParseMemValue(value, "Spillable buffer size", &buffer_size_bytes));
+ if (!BitUtil::IsPowerOf2(buffer_size_bytes)) {
+ return Status(
+ Substitute("Buffer size must be a power of two: $0", buffer_size_bytes));
+ }
+ query_options->__set_min_spillable_buffer_size(buffer_size_bytes);
+ break;
+ }
default:
// We hit this DCHECK(false) if we forgot to add the corresponding entry here
// when we add a new query option.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/service/query-options.h
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index 603c783..8d6af02 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -35,7 +35,7 @@ class TQueryOptions;
// the DCHECK.
#define QUERY_OPTS_TABLE\
DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\
- TImpalaQueryOptions::DISABLE_CODEGEN_ROWS_THRESHOLD + 1);\
+ TImpalaQueryOptions::MIN_SPILLABLE_BUFFER_SIZE + 1);\
QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\
QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR)\
QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\
@@ -62,7 +62,7 @@ class TQueryOptions;
QUERY_OPT_FN(v_cpu_cores, V_CPU_CORES)\
QUERY_OPT_FN(rm_initial_mem, RM_INITIAL_MEM)\
QUERY_OPT_FN(query_timeout_s, QUERY_TIMEOUT_S)\
- QUERY_OPT_FN(max_block_mgr_memory, MAX_BLOCK_MGR_MEMORY)\
+ QUERY_OPT_FN(buffer_pool_limit, BUFFER_POOL_LIMIT)\
QUERY_OPT_FN(appx_count_distinct, APPX_COUNT_DISTINCT)\
QUERY_OPT_FN(disable_unsafe_spills, DISABLE_UNSAFE_SPILLS)\
QUERY_OPT_FN(seq_compression_mode, SEQ_COMPRESSION_MODE)\
@@ -93,6 +93,8 @@ class TQueryOptions;
QUERY_OPT_FN(parquet_read_statistics, PARQUET_READ_STATISTICS)\
QUERY_OPT_FN(default_join_distribution_mode, DEFAULT_JOIN_DISTRIBUTION_MODE)\
QUERY_OPT_FN(disable_codegen_rows_threshold, DISABLE_CODEGEN_ROWS_THRESHOLD)\
+ QUERY_OPT_FN(default_spillable_buffer_size, DEFAULT_SPILLABLE_BUFFER_SIZE)\
+ QUERY_OPT_FN(min_spillable_buffer_size, MIN_SPILLABLE_BUFFER_SIZE)\
;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/util/bloom-filter.h
----------------------------------------------------------------------
diff --git a/be/src/util/bloom-filter.h b/be/src/util/bloom-filter.h
index 5ebd9b5..913b331 100644
--- a/be/src/util/bloom-filter.h
+++ b/be/src/util/bloom-filter.h
@@ -28,7 +28,7 @@
#include "common/compiler-util.h"
#include "gen-cpp/ImpalaInternalService_types.h"
#include "gutil/macros.h"
-#include "runtime/buffered-block-mgr.h"
+#include "util/cpu-info.h"
#include "util/hash-util.h"
namespace impala {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/util/static-asserts.cc
----------------------------------------------------------------------
diff --git a/be/src/util/static-asserts.cc b/be/src/util/static-asserts.cc
index cf12e36..7662906 100644
--- a/be/src/util/static-asserts.cc
+++ b/be/src/util/static-asserts.cc
@@ -18,7 +18,6 @@
#include <boost/static_assert.hpp>
#include "common/hdfs.h"
-#include "runtime/buffered-tuple-stream.h"
#include "runtime/string-value.h"
#include "runtime/timestamp-value.h"
#include "udf/udf.h"
@@ -37,7 +36,6 @@ class UnusedClass {
BOOST_STATIC_ASSERT(sizeof(boost::gregorian::date) == 4);
BOOST_STATIC_ASSERT(sizeof(hdfsFS) == sizeof(void*));
BOOST_STATIC_ASSERT(sizeof(hdfsFile) == sizeof(void*));
- BOOST_STATIC_ASSERT(sizeof(BufferedTupleStream::RowIdx) == sizeof(void*));
// If the memory layout of any of these types changes, it will be necessary to change
// LlvmCodeGen::GetUdfValType(), and we may also run into calling convention problems
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/Frontend.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/Frontend.thrift b/common/thrift/Frontend.thrift
index 79da0d6..3a88915 100644
--- a/common/thrift/Frontend.thrift
+++ b/common/thrift/Frontend.thrift
@@ -389,23 +389,11 @@ struct TQueryExecRequest {
// Estimated per-host peak memory consumption in bytes. Used for resource management.
8: optional i64 per_host_mem_estimate
- // Minimum query-wide buffer reservation required per host in bytes. This is the peak
- // minimum reservation that may be required by the concurrently-executing operators at
- // any point in query execution. It may be less than the initial reservation total
- // claims (below) if execution of some operators never overlaps, which allows reuse of
- // reservations.
- 9: optional i64 per_host_min_reservation;
-
- // Total of the initial buffer reservations that we expect to be claimed per host.
- // I.e. the sum over all operators in all fragment instances that execute on that host.
- // Measured in bytes.
- 10: optional i64 per_host_initial_reservation_total_claims;
-
// List of replica hosts. Used by the host_idx field of TScanRangeLocation.
- 11: required list<Types.TNetworkAddress> host_list
+ 9: required list<Types.TNetworkAddress> host_list
// Column lineage graph
- 12: optional LineageGraph.TLineageGraph lineage_graph
+ 10: optional LineageGraph.TLineageGraph lineage_graph
}
enum TCatalogOpType {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/ImpalaInternalService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift
index 4aefe55..b477299 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -130,7 +130,7 @@ struct TQueryOptions {
26: optional i32 query_timeout_s = 0
// test hook to cap max memory for spilling operators (to force them to spill).
- 27: optional i64 max_block_mgr_memory
+ 27: optional i64 buffer_pool_limit
// If true, transforms all count(distinct) aggregations into NDV()
28: optional bool appx_count_distinct = 0
@@ -255,6 +255,14 @@ struct TQueryOptions {
// If the number of rows processed per node is below the threshold codegen will be
// automatically disabled by the planner.
57: optional i32 disable_codegen_rows_threshold = 50000
+
+ // The default spillable buffer size in bytes, which may be overridden by the planner.
+ // Defaults to 2MB.
+ 58: optional i64 default_spillable_buffer_size = 2097152;
+
+ // The minimum spillable buffer to use. The planner will not choose a size smaller than
+ // this. Defaults to 64KB.
+ 59: optional i64 min_spillable_buffer_size = 65536;
}
// Impala currently has two types of sessions: Beeswax and HiveServer2
@@ -375,6 +383,18 @@ struct TQueryCtx {
// String containing a timestamp (in UTC) set as the query submission time. It
// represents the same point in time as now_string
17: required string utc_timestamp_string
+
+ // Minimum query-wide buffer reservation required per host in bytes. This is the peak
+ // minimum reservation that may be required by the concurrently-executing operators at
+ // any point in query execution. It may be less than the initial reservation total
+ // claims (below) if execution of some operators never overlaps, which allows reuse of
+ // reservations.
+ 18: optional i64 per_host_min_reservation;
+
+ // Total of the initial buffer reservations that we expect to be claimed per host.
+ // I.e. the sum over all operators in all fragment instances that execute on that host.
+ // Measured in bytes.
+ 19: optional i64 per_host_initial_reservation_total_claims;
}
// Specification of one output destination of a plan fragment
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/ImpalaService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift
index ec82bf1..ced884b 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -159,7 +159,7 @@ enum TImpalaQueryOptions {
QUERY_TIMEOUT_S,
// Test hook for spill to disk operators
- MAX_BLOCK_MGR_MEMORY,
+ BUFFER_POOL_LIMIT,
// Transforms all count(distinct) aggregations into NDV()
APPX_COUNT_DISTINCT,
@@ -279,6 +279,12 @@ enum TImpalaQueryOptions {
// If the number of rows processed per node is below the threshold and disable_codegen
// is unset, codegen will be automatically be disabled by the planner.
DISABLE_CODEGEN_ROWS_THRESHOLD,
+
+ // The default spillable buffer size, in bytes.
+ DEFAULT_SPILLABLE_BUFFER_SIZE,
+
+ // The minimum spillable buffer size, in bytes.
+ MIN_SPILLABLE_BUFFER_SIZE,
}
// The summary of a DML statement.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/PlanNodes.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/PlanNodes.thrift b/common/thrift/PlanNodes.thrift
index c1ff302..468ca44 100644
--- a/common/thrift/PlanNodes.thrift
+++ b/common/thrift/PlanNodes.thrift
@@ -481,6 +481,21 @@ struct TUnnestNode {
1: required Exprs.TExpr collection_expr
}
+// This contains all of the information computed by the plan as part of the resource
+// profile that is needed by the backend to execute.
+struct TBackendResourceProfile {
+ // The minimum reservation for this plan node in bytes.
+ 1: required i64 min_reservation
+
+ // The maximum reservation for this plan node in bytes. MAX_INT64 means effectively
+ // unlimited.
+ 2: required i64 max_reservation
+
+ // The spillable buffer size in bytes to use for this node, chosen by the planner.
+ // Set iff the node uses spillable buffers.
+ 3: optional i64 spillable_buffer_size
+}
+
// This is essentially a union of all messages corresponding to subclasses
// of PlanNode.
struct TPlanNode {
@@ -526,6 +541,9 @@ struct TPlanNode {
// Runtime filters assigned to this plan node
24: optional list<TRuntimeFilterDesc> runtime_filters
+
+ // Resource profile for this plan node.
+ 25: required TBackendResourceProfile resource_profile
}
// A flattened representation of a tree of PlanNodes, obtained by depth-first
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 80e054e..ccd713c 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -240,11 +240,11 @@ error_codes = (
("PARTITIONED_HASH_JOIN_REPARTITION_FAILS", 76, "Cannot perform hash join at node with "
"id $0. Repartitioning did not reduce the size of a spilled partition. Repartitioning "
- "level $1. Number of rows $2."),
+ "level $1. Number of rows $2:\\n$3\\n$4"),
("PARTITIONED_AGG_REPARTITION_FAILS", 77, "Cannot perform aggregation at node with "
"id $0. Repartitioning did not reduce the size of a spilled partition. Repartitioning "
- "level $1. Number of rows $2."),
+ "level $1. Number of rows $2:\\n$3\\n$4"),
("AVRO_TRUNCATED_BLOCK", 78, "File '$0' is corrupt: truncated data block at offset $1"),
@@ -322,10 +322,14 @@ error_codes = (
# TODO: IMPALA-3200: make sure that this references the correct query option.
("MAX_ROW_SIZE", 104, "Row of size $0 could not be materialized in plan node with "
- "id $1. Limit is $2, which can be increased with query option max_row_size"),
+ "id $1. Increase the <TBD> query option (currently $2) to process larger rows."),
("IR_VERIFY_FAILED", 105,
"Failed to verify generated IR function $0, see log for more details."),
+
+ ("MINIMUM_RESERVATION_UNAVAILABLE", 106, "Failed to get minimum memory reservation of "
+ "$0 on daemon $1:$2 for query $3 because it would exceed an applicable query, "
+ "request pool or process memory limit. Memory usage:\\n$4"),
)
import sys
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java b/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java
index 2041090..3c33bf1 100644
--- a/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java
+++ b/fe/src/main/java/org/apache/impala/common/RuntimeEnv.java
@@ -17,8 +17,6 @@
package org.apache.impala.common;
-import org.apache.impala.service.BackendConfig;
-
/**
* Contains runtime-specific parameters such as the number of CPU cores. Currently only
* used in Plan cost estimation. The static RuntimeEnv members can be set so that tests
@@ -33,9 +31,6 @@ public class RuntimeEnv {
// PlanNode.computeResourceProfile(). Currently the backend only support a single
// spillable buffer size, so this is equal to PlanNode.DEFAULT_SPILLABLE_BUFFER_BYTES,
// except in planner tests.
- // TODO: IMPALA-3200: this get from query option
- private long minSpillableBufferBytes_;
-
// Indicates whether this is an environment for testing.
private boolean isTestEnv_;
@@ -48,15 +43,10 @@ public class RuntimeEnv {
*/
public void reset() {
numCores_ = Runtime.getRuntime().availableProcessors();
- minSpillableBufferBytes_ = BackendConfig.INSTANCE.getReadSize();
}
public int getNumCores() { return numCores_; }
public void setNumCores(int numCores) { this.numCores_ = numCores; }
- public long getMinSpillableBufferBytes() { return minSpillableBufferBytes_; }
- public void setMinSpillableBufferBytes(long minSpillableBufferBytes) {
- minSpillableBufferBytes_ = minSpillableBufferBytes;
- }
public void setTestEnv(boolean v) { isTestEnv_ = v; }
public boolean isTestEnv() { return isTestEnv_; }
public boolean isKuduSupported() {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/AggregationNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/AggregationNode.java b/fe/src/main/java/org/apache/impala/planner/AggregationNode.java
index 004c84e..c938f76 100644
--- a/fe/src/main/java/org/apache/impala/planner/AggregationNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/AggregationNode.java
@@ -30,7 +30,6 @@ import org.apache.impala.analysis.Expr;
import org.apache.impala.analysis.FunctionCallExpr;
import org.apache.impala.analysis.SlotId;
import org.apache.impala.common.InternalException;
-import org.apache.impala.common.RuntimeEnv;
import org.apache.impala.thrift.TAggregationNode;
import org.apache.impala.thrift.TExplainLevel;
import org.apache.impala.thrift.TExpr;
@@ -302,24 +301,24 @@ public class AggregationNode extends PlanNode {
// Must be kept in sync with PartitionedAggregationNode::MinRequiredBuffers() in be.
long perInstanceMinBuffers;
+ long bufferSize = queryOptions.getDefault_spillable_buffer_size();
if (aggInfo_.getGroupingExprs().isEmpty() || useStreamingPreagg_) {
perInstanceMinBuffers = 0;
} else {
final int PARTITION_FANOUT = 16;
- long minBuffers = 2 * PARTITION_FANOUT + 1 + (aggInfo_.needsSerialize() ? 1 : 0);
- long bufferSize = getDefaultSpillableBufferBytes();
+ long minBuffers = PARTITION_FANOUT + 1 + (aggInfo_.needsSerialize() ? 1 : 0);
if (perInstanceDataBytes != -1) {
long bytesPerBuffer = perInstanceDataBytes / PARTITION_FANOUT;
// Scale down the buffer size if we think there will be excess free space with the
// default buffer size, e.g. with small dimension tables.
bufferSize = Math.min(bufferSize, Math.max(
- RuntimeEnv.INSTANCE.getMinSpillableBufferBytes(),
+ queryOptions.getMin_spillable_buffer_size(),
BitUtil.roundUpToPowerOf2(bytesPerBuffer)));
}
perInstanceMinBuffers = bufferSize * minBuffers;
}
- nodeResourceProfile_ =
- new ResourceProfile(perInstanceMemEstimate, perInstanceMinBuffers);
+ nodeResourceProfile_ = ResourceProfile.spillableWithMinReservation(
+ perInstanceMemEstimate, perInstanceMinBuffers, bufferSize);
}
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java b/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java
index d4bafcf..0322d88 100644
--- a/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/AnalyticEvalNode.java
@@ -248,8 +248,11 @@ public class AnalyticEvalNode extends PlanNode {
// TODO: come up with estimate based on window
long perInstanceMemEstimate = 0;
+ // Analytic always uses the default spillable buffer size.
+ long bufferSize = queryOptions.getDefault_spillable_buffer_size();
// Must be kept in sync with MIN_REQUIRED_BUFFERS in AnalyticEvalNode in be.
- long perInstanceMinBufferBytes = 2 * getDefaultSpillableBufferBytes();
- nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, perInstanceMinBufferBytes);
+ long perInstanceMinBufferBytes = 2 * bufferSize;
+ nodeResourceProfile_ = ResourceProfile.spillableWithMinReservation(
+ perInstanceMemEstimate, perInstanceMinBufferBytes, bufferSize);
}
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
index 879d9d8..cea9b53 100644
--- a/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/DataSourceScanNode.java
@@ -332,7 +332,7 @@ public class DataSourceScanNode extends ScanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
// TODO: What's a good estimate of memory consumption?
- nodeResourceProfile_ = new ResourceProfile(1024L * 1024L * 1024L, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(1024L * 1024L * 1024L);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java b/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java
index d1369f5..af4f9a6 100644
--- a/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/DataStreamSink.java
@@ -51,7 +51,7 @@ public class DataStreamSink extends DataSink {
@Override
public void computeResourceProfile(TQueryOptions queryOptions) {
- resourceProfile_ = new ResourceProfile(0, 0);
+ resourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java b/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java
index 0d0acc9..3fb8bae 100644
--- a/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/EmptySetNode.java
@@ -62,7 +62,7 @@ public class EmptySetNode extends PlanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
// TODO: add an estimate
- nodeResourceProfile_ = new ResourceProfile(0, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java b/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java
index 478a054..87d2fd2 100644
--- a/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/ExchangeNode.java
@@ -184,7 +184,7 @@ public class ExchangeNode extends PlanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
// TODO: add an estimate
- nodeResourceProfile_ = new ResourceProfile(0, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
index bbecbf1..d56aa98 100644
--- a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
@@ -497,7 +497,7 @@ public class HBaseScanNode extends ScanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
// TODO: What's a good estimate of memory consumption?
- nodeResourceProfile_ = new ResourceProfile(1024L * 1024L * 1024L, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(1024L * 1024L * 1024L);
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java b/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java
index 947665e..28939ed 100644
--- a/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/HBaseTableSink.java
@@ -44,7 +44,7 @@ public class HBaseTableSink extends TableSink {
@Override
public void computeResourceProfile(TQueryOptions queryOptions) {
- resourceProfile_ = new ResourceProfile(0, 0);
+ resourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java b/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
index e828125..5ff17c5 100644
--- a/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HashJoinNode.java
@@ -28,7 +28,6 @@ import org.apache.impala.catalog.Type;
import org.apache.impala.common.AnalysisException;
import org.apache.impala.common.ImpalaException;
import org.apache.impala.common.InternalException;
-import org.apache.impala.common.RuntimeEnv;
import org.apache.impala.thrift.TEqJoinCondition;
import org.apache.impala.thrift.TExplainLevel;
import org.apache.impala.thrift.THashJoinNode;
@@ -223,17 +222,18 @@ public class HashJoinNode extends JoinNode {
long minBuffers = PARTITION_FANOUT + 1
+ (joinOp_ == JoinOperator.NULL_AWARE_LEFT_ANTI_JOIN ? 3 : 0);
- long bufferSize = getDefaultSpillableBufferBytes();
+ long bufferSize = queryOptions.getDefault_spillable_buffer_size();
if (perInstanceDataBytes != -1) {
long bytesPerBuffer = perInstanceDataBytes / PARTITION_FANOUT;
// Scale down the buffer size if we think there will be excess free space with the
// default buffer size, e.g. if the right side is a small dimension table.
bufferSize = Math.min(bufferSize, Math.max(
- RuntimeEnv.INSTANCE.getMinSpillableBufferBytes(),
+ queryOptions.getMin_spillable_buffer_size(),
BitUtil.roundUpToPowerOf2(bytesPerBuffer)));
}
long perInstanceMinBufferBytes = bufferSize * minBuffers;
- nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, perInstanceMinBufferBytes);
+ nodeResourceProfile_ = ResourceProfile.spillableWithMinReservation(
+ perInstanceMemEstimate, perInstanceMinBufferBytes, bufferSize);
}
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 0ba5bc6..bf183be 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -1021,7 +1021,7 @@ public class HdfsScanNode extends ScanNode {
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
Preconditions.checkNotNull(scanRanges_, "Cost estimation requires scan ranges.");
if (scanRanges_.isEmpty()) {
- nodeResourceProfile_ = new ResourceProfile(0, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(0);
return;
}
Preconditions.checkState(0 < numNodes_ && numNodes_ <= scanRanges_.size());
@@ -1075,7 +1075,7 @@ public class HdfsScanNode extends ScanNode {
PrintUtils.printBytes(perHostUpperBound)));
perInstanceMemEstimate = perHostUpperBound;
}
- nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(perInstanceMemEstimate);
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java b/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java
index fed4ffd..46709c0 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java
@@ -99,7 +99,7 @@ public class HdfsTableSink extends TableSink {
PlanNode.checkedMultiply(numPartitionsPerInstance, perPartitionMemReq);
perInstanceMemEstimate = Math.min(perInstanceInputBytes, perInstanceMemReq);
}
- resourceProfile_ = new ResourceProfile(perInstanceMemEstimate, 0);
+ resourceProfile_ = ResourceProfile.noReservation(perInstanceMemEstimate);
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java b/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java
index 69cc133..14acb26 100644
--- a/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/JoinBuildSink.java
@@ -87,6 +87,6 @@ public class JoinBuildSink extends DataSink {
@Override
public void computeResourceProfile(TQueryOptions queryOptions) {
// The memory consumption is counted against the join PlanNode.
- resourceProfile_ = new ResourceProfile(0, 0);
+ resourceProfile_ = ResourceProfile.noReservation(0);
}
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
index 57403e4..37a4e5c 100644
--- a/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/KuduScanNode.java
@@ -270,7 +270,7 @@ public class KuduScanNode extends ScanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
- nodeResourceProfile_ = new ResourceProfile(0, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java b/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java
index b7dcdd8..f75b170 100644
--- a/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/KuduTableSink.java
@@ -60,7 +60,7 @@ public class KuduTableSink extends TableSink {
@Override
public void computeResourceProfile(TQueryOptions queryOptions) {
// TODO: add a memory estimate
- resourceProfile_ = new ResourceProfile(0, 0);
+ resourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java b/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java
index 0ec8e4f..16a3caf 100644
--- a/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/NestedLoopJoinNode.java
@@ -20,9 +20,6 @@ package org.apache.impala.planner;
import java.util.Collections;
import java.util.List;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.impala.analysis.Analyzer;
import org.apache.impala.analysis.BinaryPredicate;
import org.apache.impala.analysis.Expr;
@@ -86,7 +83,7 @@ public class NestedLoopJoinNode extends JoinNode {
perInstanceMemEstimate =
(long) Math.ceil(getChild(1).cardinality_ * getChild(1).avgRowSize_);
}
- nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(perInstanceMemEstimate);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/PlanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/PlanNode.java b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
index 9723c4a..2557f98 100644
--- a/fe/src/main/java/org/apache/impala/planner/PlanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
@@ -33,7 +33,7 @@ import org.apache.impala.common.ImpalaException;
import org.apache.impala.common.PrintUtils;
import org.apache.impala.common.TreeNode;
import org.apache.impala.planner.RuntimeFilterGenerator.RuntimeFilter;
-import org.apache.impala.service.BackendConfig;
+import org.apache.impala.thrift.TBackendResourceProfile;
import org.apache.impala.thrift.TExecStats;
import org.apache.impala.thrift.TExplainLevel;
import org.apache.impala.thrift.TPlan;
@@ -408,6 +408,8 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
msg.addToRuntime_filters(filter.toThrift());
}
msg.setDisable_codegen(disableCodegen_);
+ Preconditions.checkState(nodeResourceProfile_.isValid());
+ msg.resource_profile = nodeResourceProfile_.toThrift();
toThrift(msg);
container.addToNodes(msg);
// For the purpose of the BE consider ExchangeNodes to have no children.
@@ -677,16 +679,6 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
}
/**
- * The default size of buffer used in spilling nodes. Used in
- * computeNodeResourceProfile().
- */
- protected final static long getDefaultSpillableBufferBytes() {
- // BufferedBlockMgr uses --read_size to determine buffer size.
- // TODO: IMPALA-3200: get from query option
- return BackendConfig.INSTANCE.getReadSize();
- }
-
- /**
* The input cardinality is the sum of output cardinalities of its children.
* For scan nodes the input cardinality is the expected number of rows scanned.
*/
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java b/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java
index fba9149..07eb58b 100644
--- a/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java
+++ b/fe/src/main/java/org/apache/impala/planner/PlanRootSink.java
@@ -37,7 +37,7 @@ public class PlanRootSink extends DataSink {
@Override
public void computeResourceProfile(TQueryOptions queryOptions) {
// TODO: add a memory estimate
- resourceProfile_ = new ResourceProfile(0, 0);
+ resourceProfile_ = ResourceProfile.noReservation(0);
}
protected TDataSink toThrift() {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/Planner.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/Planner.java b/fe/src/main/java/org/apache/impala/planner/Planner.java
index 4cfd57e..ed6e8df 100644
--- a/fe/src/main/java/org/apache/impala/planner/Planner.java
+++ b/fe/src/main/java/org/apache/impala/planner/Planner.java
@@ -63,7 +63,7 @@ public class Planner {
public static final long MIN_PER_HOST_MEM_ESTIMATE_BYTES = 10 * 1024 * 1024;
public static final ResourceProfile MIN_PER_HOST_RESOURCES =
- new ResourceProfile(MIN_PER_HOST_MEM_ESTIMATE_BYTES, 0);
+ ResourceProfile.withMinReservation(MIN_PER_HOST_MEM_ESTIMATE_BYTES, 0);
private final PlannerContext ctx_;
@@ -262,9 +262,9 @@ public class Planner {
TQueryExecRequest request, TExplainLevel explainLevel) {
StringBuilder str = new StringBuilder();
boolean hasHeader = false;
- if (request.isSetPer_host_min_reservation()) {
+ if (request.query_ctx.isSetPer_host_min_reservation()) {
str.append(String.format("Per-Host Resource Reservation: Memory=%s\n",
- PrintUtils.printBytes(request.getPer_host_min_reservation()))) ;
+ PrintUtils.printBytes(request.query_ctx.getPer_host_min_reservation())));
hasHeader = true;
}
if (request.isSetPer_host_mem_estimate()) {
@@ -344,7 +344,7 @@ public class Planner {
* per-host resource values in 'request'.
*/
public void computeResourceReqs(List<PlanFragment> planRoots,
- TQueryExecRequest request) {
+ TQueryCtx queryCtx, TQueryExecRequest request) {
Preconditions.checkState(!planRoots.isEmpty());
Preconditions.checkNotNull(request);
TQueryOptions queryOptions = ctx_.getRootAnalyzer().getQueryOptions();
@@ -389,8 +389,8 @@ public class Planner {
perHostPeakResources = MIN_PER_HOST_RESOURCES.max(perHostPeakResources);
request.setPer_host_mem_estimate(perHostPeakResources.getMemEstimateBytes());
- request.setPer_host_min_reservation(perHostPeakResources.getMinReservationBytes());
- request.setPer_host_initial_reservation_total_claims(perHostInitialReservationTotal);
+ queryCtx.setPer_host_min_reservation(perHostPeakResources.getMinReservationBytes());
+ queryCtx.setPer_host_initial_reservation_total_claims(perHostInitialReservationTotal);
if (LOG.isTraceEnabled()) {
LOG.trace("Per-host min buffer : " + perHostPeakResources.getMinReservationBytes());
LOG.trace(
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java b/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java
index 18cde7e..3c13812 100644
--- a/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java
+++ b/fe/src/main/java/org/apache/impala/planner/ResourceProfile.java
@@ -18,6 +18,7 @@
package org.apache.impala.planner;
import org.apache.impala.common.PrintUtils;
+import org.apache.impala.thrift.TBackendResourceProfile;
import org.apache.impala.util.MathUtil;
/**
@@ -35,25 +36,56 @@ public class ResourceProfile {
private final long memEstimateBytes_;
// Minimum buffer reservation required to execute in bytes.
+ // The valid range is [0, maxReservationBytes_].
private final long minReservationBytes_;
- private ResourceProfile(boolean isValid, long memEstimateBytes, long minReservationBytes) {
+ // Maximum buffer reservation allowed for this plan node.
+ // The valid range is [minReservationBytes_, Long.MAX_VALUE].
+ private final long maxReservationBytes_;
+
+ // The spillable buffer size to use in a plan node. Only valid for resource profiles
+ // for spilling PlanNodes. Operations like sum(), max(), etc., produce profiles without
+ // valid spillableBufferBytes_ values. -1 means invalid.
+ private final long spillableBufferBytes_;
+
+ private ResourceProfile(boolean isValid, long memEstimateBytes,
+ long minReservationBytes, long maxReservationBytes, long spillableBufferBytes) {
isValid_ = isValid;
memEstimateBytes_ = memEstimateBytes;
minReservationBytes_ = minReservationBytes;
+ maxReservationBytes_ = maxReservationBytes;
+ spillableBufferBytes_ = spillableBufferBytes;
+ }
+
+ // Create a resource profile with zero min or max reservation.
+ public static ResourceProfile noReservation(long memEstimateBytes) {
+ return new ResourceProfile(true, memEstimateBytes, 0, 0, -1);
+ }
+
+ // Create a resource profile with a minimum reservation (but no maximum).
+ public static ResourceProfile withMinReservation(long memEstimateBytes,
+ long minReservationBytes) {
+ return new ResourceProfile(
+ true, memEstimateBytes, minReservationBytes, Long.MAX_VALUE, -1);
}
- public ResourceProfile(long memEstimateBytes, long minReservationBytes) {
- this(true, memEstimateBytes, minReservationBytes);
+ // Create a resource profile with a minimum reservation (but no maximum) and a
+ // spillable buffer size.
+ public static ResourceProfile spillableWithMinReservation(long memEstimateBytes,
+ long minReservationBytes, long spillableBufferBytes) {
+ return new ResourceProfile(true, memEstimateBytes, minReservationBytes,
+ Long.MAX_VALUE, spillableBufferBytes);
}
public static ResourceProfile invalid() {
- return new ResourceProfile(false, -1, -1);
+ return new ResourceProfile(false, -1, -1, -1, -1);
}
public boolean isValid() { return isValid_; }
public long getMemEstimateBytes() { return memEstimateBytes_; }
public long getMinReservationBytes() { return minReservationBytes_; }
+ public long getMaxReservationBytes() { return maxReservationBytes_; }
+ public long getSpillableBufferBytes() { return spillableBufferBytes_; }
// Return a string with the resource profile information suitable for display in an
// explain plan in a format like: "resource1=value resource2=value"
@@ -63,6 +95,12 @@ public class ResourceProfile {
output.append(isValid_ ? PrintUtils.printBytes(memEstimateBytes_) : "invalid");
output.append(" mem-reservation=");
output.append(isValid_ ? PrintUtils.printBytes(minReservationBytes_) : "invalid");
+ // TODO: output maxReservation_ here if the planner becomes more sophisticated in
+ // choosing it (beyond 0/unlimited).
+ if (isValid_ && spillableBufferBytes_ != -1) {
+ output.append(" spill-buffer=");
+ output.append(PrintUtils.printBytes(spillableBufferBytes_));
+ }
return output.toString();
}
@@ -70,25 +108,39 @@ public class ResourceProfile {
public ResourceProfile max(ResourceProfile other) {
if (!isValid()) return other;
if (!other.isValid()) return this;
- return new ResourceProfile(
+ return new ResourceProfile(true,
Math.max(getMemEstimateBytes(), other.getMemEstimateBytes()),
- Math.max(getMinReservationBytes(), other.getMinReservationBytes()));
+ Math.max(getMinReservationBytes(), other.getMinReservationBytes()),
+ Math.max(getMaxReservationBytes(), other.getMaxReservationBytes()), -1);
}
// Returns a profile with the sum of each value in 'this' and 'other'.
public ResourceProfile sum(ResourceProfile other) {
if (!isValid()) return other;
if (!other.isValid()) return this;
- return new ResourceProfile(
+ return new ResourceProfile(true,
MathUtil.saturatingAdd(getMemEstimateBytes(), other.getMemEstimateBytes()),
- MathUtil.saturatingAdd(getMinReservationBytes(), other.getMinReservationBytes()));
+ MathUtil.saturatingAdd(getMinReservationBytes(),other.getMinReservationBytes()),
+ MathUtil.saturatingAdd(getMaxReservationBytes(), other.getMaxReservationBytes()),
+ -1);
}
// Returns a profile with all values multiplied by 'factor'.
public ResourceProfile multiply(int factor) {
if (!isValid()) return this;
- return new ResourceProfile(
+ return new ResourceProfile(true,
MathUtil.saturatingMultiply(memEstimateBytes_, factor),
- MathUtil.saturatingMultiply(minReservationBytes_, factor));
+ MathUtil.saturatingMultiply(minReservationBytes_, factor),
+ MathUtil.saturatingMultiply(maxReservationBytes_, factor), -1);
+ }
+
+ public TBackendResourceProfile toThrift() {
+ TBackendResourceProfile result = new TBackendResourceProfile();
+ result.setMin_reservation(minReservationBytes_);
+ result.setMax_reservation(maxReservationBytes_);
+ if (spillableBufferBytes_ != -1) {
+ result.setSpillable_buffer_size(spillableBufferBytes_);
+ }
+ return result;
}
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/SelectNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/SelectNode.java b/fe/src/main/java/org/apache/impala/planner/SelectNode.java
index 97dfa5b..3ffc975 100644
--- a/fe/src/main/java/org/apache/impala/planner/SelectNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/SelectNode.java
@@ -84,7 +84,7 @@ public class SelectNode extends PlanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
// TODO: add an estimate
- nodeResourceProfile_ = new ResourceProfile(0, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java b/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
index bed1c9a..bdf3a01 100644
--- a/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
@@ -68,7 +68,7 @@ public class SingularRowSrcNode extends PlanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
// TODO: add an estimate
- nodeResourceProfile_ = new ResourceProfile(0, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/SortNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/SortNode.java b/fe/src/main/java/org/apache/impala/planner/SortNode.java
index aee8fda..75e8034 100644
--- a/fe/src/main/java/org/apache/impala/planner/SortNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/SortNode.java
@@ -29,7 +29,6 @@ import org.apache.impala.analysis.SlotDescriptor;
import org.apache.impala.analysis.SlotRef;
import org.apache.impala.analysis.SortInfo;
import org.apache.impala.common.InternalException;
-import org.apache.impala.service.BackendConfig;
import org.apache.impala.thrift.TExplainLevel;
import org.apache.impala.thrift.TPlanNode;
import org.apache.impala.thrift.TPlanNodeType;
@@ -255,7 +254,7 @@ public class SortNode extends PlanNode {
if (type_ == TSortType.TOPN) {
long perInstanceMemEstimate =
(long) Math.ceil((cardinality_ + offset_) * avgRowSize_);
- nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(perInstanceMemEstimate);
return;
}
@@ -265,44 +264,40 @@ public class SortNode extends PlanNode {
// size sqrt(N) blocks, and we could merge sqrt(N) such runs with sqrt(N) blocks
// of memory.
double fullInputSize = getChild(0).cardinality_ * avgRowSize_;
- boolean hasVarLenSlots = false;
+ boolean usesVarLenBlocks = false;
for (SlotDescriptor slotDesc: info_.getSortTupleDescriptor().getSlots()) {
if (slotDesc.isMaterialized() && !slotDesc.getType().isFixedLengthType()) {
- hasVarLenSlots = true;
+ usesVarLenBlocks = true;
break;
}
}
- // The block size used by the sorter is the same as the configured I/O read size.
- long blockSize = BackendConfig.INSTANCE.getReadSize();
- // The external sorter writes fixed-len and var-len data in separate sequences of
- // blocks on disk and reads from both sequences when merging. This effectively
- // doubles the block size when there are var-len columns present.
- if (hasVarLenSlots) blockSize *= 2;
+ // Sort always uses the default spillable buffer size.
+ long bufferSize = queryOptions.getDefault_spillable_buffer_size();
+ // The external sorter writes fixed-len and var-len data in separate sequences of
+ // pages on disk and reads from both sequences when merging. This effectively
+ // doubles the number of pages required when there are var-len columns present.
+ // Must be kept in sync with ComputeMinReservation() in Sorter in be.
+ int pageMultiplier = usesVarLenBlocks ? 2 : 1;
+ long perInstanceMemEstimate;
+ long perInstanceMinReservation;
if (type_ == TSortType.PARTIAL) {
// The memory limit cannot be less than the size of the required blocks.
- long mem_limit =
- PARTIAL_SORT_MEM_LIMIT > blockSize ? PARTIAL_SORT_MEM_LIMIT : blockSize;
+ long mem_limit = Math.max(PARTIAL_SORT_MEM_LIMIT, bufferSize * pageMultiplier);
// 'fullInputSize' will be negative if stats are missing, just use the limit.
- long perInstanceMemEstimate = fullInputSize < 0 ?
+ perInstanceMemEstimate = fullInputSize < 0 ?
mem_limit :
Math.min((long) Math.ceil(fullInputSize), mem_limit);
- nodeResourceProfile_ = new ResourceProfile(perInstanceMemEstimate, blockSize);
+ perInstanceMinReservation = bufferSize * pageMultiplier;
} else {
- Preconditions.checkState(type_ == TSortType.TOTAL);
- double numInputBlocks = Math.ceil(fullInputSize / blockSize);
- long perInstanceMemEstimate =
- blockSize * (long) Math.ceil(Math.sqrt(numInputBlocks));
-
- // Must be kept in sync with min_buffers_required in Sorter in be.
- long perInstanceMinReservation = 3 * getDefaultSpillableBufferBytes();
- if (info_.getSortTupleDescriptor().hasVarLenSlots()) {
- perInstanceMinReservation *= 2;
- }
- nodeResourceProfile_ =
- new ResourceProfile(perInstanceMemEstimate, perInstanceMinReservation);
+ double numInputBlocks = Math.ceil(fullInputSize / (bufferSize * pageMultiplier));
+ perInstanceMemEstimate =
+ bufferSize * (long) Math.ceil(Math.sqrt(numInputBlocks));
+ perInstanceMinReservation = 3 * bufferSize * pageMultiplier;
}
+ nodeResourceProfile_ = ResourceProfile.spillableWithMinReservation(
+ perInstanceMemEstimate, perInstanceMinReservation, bufferSize);
}
private static String getDisplayName(TSortType type) {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/SubplanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/SubplanNode.java b/fe/src/main/java/org/apache/impala/planner/SubplanNode.java
index c09efe5..e41290e 100644
--- a/fe/src/main/java/org/apache/impala/planner/SubplanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/SubplanNode.java
@@ -95,7 +95,7 @@ public class SubplanNode extends PlanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
// TODO: add an estimate
- nodeResourceProfile_ = new ResourceProfile(0, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/UnionNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/UnionNode.java b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
index 44e2967..302f62d 100644
--- a/fe/src/main/java/org/apache/impala/planner/UnionNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
@@ -131,7 +131,7 @@ public class UnionNode extends PlanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
// TODO: add an estimate
- nodeResourceProfile_ = new ResourceProfile(0, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/UnnestNode.java b/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
index 695ec24..7e0a87e 100644
--- a/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
@@ -76,7 +76,7 @@ public class UnnestNode extends PlanNode {
@Override
public void computeNodeResourceProfile(TQueryOptions queryOptions) {
// TODO: add an estimate
- nodeResourceProfile_ = new ResourceProfile(0, 0);
+ nodeResourceProfile_ = ResourceProfile.noReservation(0);
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/main/java/org/apache/impala/service/Frontend.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/Frontend.java b/fe/src/main/java/org/apache/impala/service/Frontend.java
index 2c71a9b..60e84b4 100644
--- a/fe/src/main/java/org/apache/impala/service/Frontend.java
+++ b/fe/src/main/java/org/apache/impala/service/Frontend.java
@@ -1005,7 +1005,7 @@ public class Frontend {
}
// Compute resource requirements of the final plans.
- planner.computeResourceReqs(planRoots, result);
+ planner.computeResourceReqs(planRoots, queryCtx, result);
// create per-plan exec info;
// also assemble list of names of tables with missing or corrupt stats for
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
index 8289ee8..b0f1e2e 100644
--- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
@@ -425,8 +425,6 @@ public class PlannerTest extends PlannerTestBase {
TQueryOptions options = defaultQueryOptions();
options.setExplain_level(TExplainLevel.EXTENDED);
options.setNum_scanner_threads(1); // Required so that output doesn't vary by machine
- // TODO: IMPALA-3200 - this should become a query option.
- RuntimeEnv.INSTANCE.setMinSpillableBufferBytes(64 * 1024);
runPlannerTestFile("spillable-buffer-sizing", options, false);
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
index ed4f684..f4ae6c3 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
@@ -102,7 +102,7 @@ having 1024 * 1024 * count(*) % 2 = 0
and (sm between 5 and 10)
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=138.00MB mem-reservation=264.00MB
+| Per-Host Resources: mem-estimate=138.00MB mem-reservation=1.06MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -110,7 +110,7 @@ PLAN-ROOT SINK
| output: sum(2 + id), count(*)
| group by: timestamp_col = TIMESTAMP '2016-11-15 00:00:00'
| having: sum(2 + id) <= 10, sum(2 + id) > 1, sum(2 + id) >= 5, 1048576 * count(*) % 2 = 0
-| mem-estimate=10.00MB mem-reservation=264.00MB
+| mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=1 row-size=17B cardinality=0
|
00:SCAN HDFS [functional.alltypes]
@@ -129,7 +129,7 @@ left outer join functional.alltypes b
where round(1.11 + 2.22 + 3.33 + 4.44, 1) < cast(b.double_col as decimal(3, 2))
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=256.02MB mem-reservation=136.00MB
+| Per-Host Resources: mem-estimate=256.02MB mem-reservation=1.06MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -138,7 +138,7 @@ PLAN-ROOT SINK
| fk/pk conjuncts: assumed fk/pk
| other join predicates: a.int_col <= b.bigint_col + 97, a.int_col >= 0 + b.bigint_col
| other predicates: CAST(b.double_col AS DECIMAL(3,2)) > 11.1
-| mem-estimate=15.68KB mem-reservation=136.00MB
+| mem-estimate=15.68KB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=0,1N row-size=28B cardinality=7300
|
|--01:SCAN HDFS [functional.alltypes b]
@@ -203,7 +203,7 @@ group by timestamp_col = cast('2015-11-15' as timestamp) + interval 1 year
having 1024 * 1024 * count(*) % 2 = 0
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=138.00MB mem-reservation=528.00MB
+| Per-Host Resources: mem-estimate=138.00MB mem-reservation=2.12MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -211,13 +211,13 @@ PLAN-ROOT SINK
| output: sum(2 + id), count:merge(*)
| group by: timestamp_col = TIMESTAMP '2016-11-15 00:00:00'
| having: 1048576 * count(*) % 2 = 0
-| mem-estimate=10.00MB mem-reservation=264.00MB
+| mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=2 row-size=17B cardinality=0
|
01:AGGREGATE
| output: count(*)
| group by: timestamp_col = TIMESTAMP '2016-11-15 00:00:00', 2 + id
-| mem-estimate=10.00MB mem-reservation=264.00MB
+| mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=1 row-size=17B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -234,20 +234,20 @@ from functional.alltypes
having 1024 * 1024 * count(*) % 2 = 0
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=138.00MB mem-reservation=264.00MB
+| Per-Host Resources: mem-estimate=138.00MB mem-reservation=1.06MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
02:AGGREGATE [FINALIZE]
| output: sum(2 + id), count:merge(*)
| having: 1048576 * zeroifnull(count(*)) % 2 = 0
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=2 row-size=16B cardinality=0
|
01:AGGREGATE
| output: count(*)
| group by: 2 + id
-| mem-estimate=10.00MB mem-reservation=264.00MB
+| mem-estimate=10.00MB mem-reservation=1.06MB spill-buffer=64.00KB
| tuple-ids=1 row-size=16B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -265,7 +265,7 @@ select first_value(1 + 1 + int_col - (1 - 1)) over
from functional.alltypes
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=144.00MB mem-reservation=64.00MB
+| Per-Host Resources: mem-estimate=130.00MB mem-reservation=16.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
@@ -274,13 +274,13 @@ PLAN-ROOT SINK
| partition by: concat('ab', string_col)
| order by: greatest(20, bigint_col) ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
-| mem-estimate=0B mem-reservation=16.00MB
+| mem-estimate=0B mem-reservation=4.00MB spill-buffer=2.00MB
| tuple-ids=3,2 row-size=61B cardinality=7300
|
01:SORT
| order by: concat('ab', string_col) ASC NULLS FIRST, greatest(20, bigint_col) ASC
| materialized: concat('ab', string_col), greatest(20, bigint_col)
-| mem-estimate=16.00MB mem-reservation=48.00MB
+| mem-estimate=2.00MB mem-reservation=12.00MB spill-buffer=2.00MB
| tuple-ids=3 row-size=53B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -296,13 +296,13 @@ select int_col from functional.alltypes
order by id * abs((factorial(5) / power(2, 4)))
---- PLAN
F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-| Per-Host Resources: mem-estimate=136.00MB mem-reservation=24.00MB
+| Per-Host Resources: mem-estimate=130.00MB mem-reservation=6.00MB
PLAN-ROOT SINK
| mem-estimate=0B mem-reservation=0B
|
01:SORT
| order by: id * 7.5 ASC
-| mem-estimate=8.00MB mem-reservation=24.00MB
+| mem-estimate=2.00MB mem-reservation=6.00MB spill-buffer=2.00MB
| tuple-ids=1 row-size=8B cardinality=7300
|
00:SCAN HDFS [functional.alltypes]
@@ -347,7 +347,7 @@ PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum(id + 10 + 20 + 30)
-| mem-estimate=10.00MB mem-reservation=0B
+| mem-estimate=10.00MB mem-reservation=0B spill-buffer=2.00MB
| tuple-ids=4 row-size=8B cardinality=1
|
00:SCAN HDFS [functional.alltypes]
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test b/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test
index e64691a..d367424 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test
@@ -78,7 +78,7 @@ select count(*)
from functional.alltypes t1
join functional.alltypestiny t2 on t1.id = t2.id
---- DISTRIBUTEDPLAN
-Per-Host Resource Reservation: Memory=136.00MB
+Per-Host Resource Reservation: Memory=1.06MB
Per-Host Resource Estimates: Memory=180.00MB
Codegen disabled by planner
[07/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-block-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-block-mgr.cc b/be/src/runtime/buffered-block-mgr.cc
deleted file mode 100644
index e4737c2..0000000
--- a/be/src/runtime/buffered-block-mgr.cc
+++ /dev/null
@@ -1,1254 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/mem-pool.h"
-#include "runtime/mem-tracker.h"
-#include "runtime/runtime-state.h"
-#include "runtime/tmp-file-mgr.h"
-#include "util/debug-util.h"
-#include "util/disk-info.h"
-#include "util/filesystem-util.h"
-#include "util/impalad-metrics.h"
-#include "util/runtime-profile-counters.h"
-#include "util/uid-util.h"
-
-#include <gutil/strings/substitute.h>
-
-#include "common/names.h"
-
-using namespace strings; // for Substitute
-
-namespace impala {
-
-BufferedBlockMgr::BlockMgrsMap BufferedBlockMgr::query_to_block_mgrs_;
-SpinLock BufferedBlockMgr::static_block_mgrs_lock_;
-
-
-struct BufferedBlockMgr::Client {
- Client(const string& debug_info, BufferedBlockMgr* mgr, int num_reserved_buffers,
- bool tolerates_oversubscription, MemTracker* tracker,
- RuntimeState* state)
- : debug_info_(debug_info),
- mgr_(mgr),
- state_(state),
- tracker_(tracker),
- query_tracker_(mgr_->mem_tracker_->parent()),
- num_reserved_buffers_(num_reserved_buffers),
- tolerates_oversubscription_(tolerates_oversubscription),
- num_tmp_reserved_buffers_(0),
- num_pinned_buffers_(0),
- logged_large_allocation_warning_(false) {
- DCHECK(tracker != NULL);
- }
-
- /// A string that will be printed to identify the client, e.g. which exec node it
- /// belongs to.
- string debug_info_;
-
- /// Unowned.
- BufferedBlockMgr* mgr_;
-
- /// Unowned.
- RuntimeState* state_;
-
- /// Tracker for this client. Unowned.
- /// When the client gets a buffer, we update the consumption on this tracker. However,
- /// we don't want to transfer the buffer from the block mgr to the client (i.e. release
- /// from the block mgr), since the block mgr is where the block mem usage limit is
- /// enforced. Even when we give a buffer to a client, the buffer is still owned and
- /// counts against the block mgr tracker (i.e. there is a fixed pool of buffers
- /// regardless of if they are in the block mgr or the clients).
- MemTracker* tracker_;
-
- /// This is the common ancestor between the block mgr tracker and the client tracker.
- /// When memory is transferred to the client, we want it to stop at this tracker.
- MemTracker* query_tracker_;
-
- /// Number of buffers reserved by this client.
- int num_reserved_buffers_;
-
- /// If false, return MEM_LIMIT_EXCEEDED when a reserved buffer cannot be allocated.
- /// If true, return Status::OK() as with a non-reserved buffer.
- bool tolerates_oversubscription_;
-
- /// Number of buffers temporarily reserved.
- int num_tmp_reserved_buffers_;
-
- /// Number of buffers pinned by this client.
- int num_pinned_buffers_;
-
- /// Whether a warning about a large allocation has been made for this client. Used
- /// to avoid producing excessive log messages.
- bool logged_large_allocation_warning_;
-
- void PinBuffer(BufferDescriptor* buffer) {
- DCHECK(buffer != NULL);
- if (buffer->len == mgr_->max_block_size()) {
- ++num_pinned_buffers_;
- tracker_->ConsumeLocal(buffer->len, query_tracker_);
- }
- }
-
- void UnpinBuffer(BufferDescriptor* buffer) {
- DCHECK(buffer != NULL);
- if (buffer->len == mgr_->max_block_size()) {
- DCHECK_GT(num_pinned_buffers_, 0);
- --num_pinned_buffers_;
- tracker_->ReleaseLocal(buffer->len, query_tracker_);
- }
- }
-
- string DebugString() const {
- stringstream ss;
- ss << "Client " << this << endl
- << " " << debug_info_ << endl
- << " num_reserved_buffers=" << num_reserved_buffers_ << endl
- << " num_tmp_reserved_buffers=" << num_tmp_reserved_buffers_ << endl
- << " num_pinned_buffers=" << num_pinned_buffers_;
- return ss.str();
- }
-};
-
-// BufferedBlockMgr::Block methods.
-BufferedBlockMgr::Block::Block(BufferedBlockMgr* block_mgr)
- : buffer_desc_(NULL),
- block_mgr_(block_mgr),
- client_(NULL),
- valid_data_len_(0),
- num_rows_(0) {}
-
-Status BufferedBlockMgr::Block::Pin(bool* pinned, Block* release_block, bool unpin) {
- return block_mgr_->PinBlock(this, pinned, release_block, unpin);
-}
-
-Status BufferedBlockMgr::Block::Unpin() {
- return block_mgr_->UnpinBlock(this);
-}
-
-void BufferedBlockMgr::Block::Delete() {
- block_mgr_->DeleteBlock(this);
-}
-
-void BufferedBlockMgr::Block::Init() {
- // No locks are taken because the block is new or has previously been deleted.
- is_pinned_ = false;
- in_write_ = false;
- is_deleted_ = false;
- valid_data_len_ = 0;
- client_ = NULL;
- num_rows_ = 0;
-}
-
-bool BufferedBlockMgr::Block::Validate() const {
- if (is_deleted_ && (is_pinned_ || (!in_write_ && buffer_desc_ != NULL))) {
- LOG(ERROR) << "Deleted block in use - " << DebugString();
- return false;
- }
-
- if (buffer_desc_ == NULL && (is_pinned_ || in_write_)) {
- LOG(ERROR) << "Block without buffer in use - " << DebugString();
- return false;
- }
-
- if (buffer_desc_ == NULL && block_mgr_->unpinned_blocks_.Contains(this)) {
- LOG(ERROR) << "Unpersisted block without buffer - " << DebugString();
- return false;
- }
-
- if (buffer_desc_ != NULL && (buffer_desc_->block != this)) {
- LOG(ERROR) << "Block buffer inconsistency - " << DebugString();
- return false;
- }
-
- return true;
-}
-
-string BufferedBlockMgr::Block::TmpFilePath() const {
- if (write_handle_ == NULL) return "";
- return write_handle_->TmpFilePath();
-}
-
-string BufferedBlockMgr::Block::DebugString() const {
- stringstream ss;
- ss << "Block: " << this << endl
- << " Buffer Desc: " << buffer_desc_ << endl
- << " Data Len: " << valid_data_len_ << endl
- << " Num Rows: " << num_rows_ << endl;
- if (is_pinned_) ss << " Buffer Len: " << buffer_len() << endl;
- ss << " Deleted: " << is_deleted_ << endl
- << " Pinned: " << is_pinned_ << endl
- << " Write Issued: " << in_write_ << endl
- << " Client Local: " << client_local_ << endl;
- if (write_handle_ != NULL) {
- ss << " Write handle: " << write_handle_->DebugString() << endl;
- }
- if (client_ != NULL) ss << " Client: " << client_->DebugString();
- return ss.str();
-}
-
-BufferedBlockMgr::BufferedBlockMgr(RuntimeState* state, TmpFileMgr* tmp_file_mgr,
- int64_t block_size, int64_t scratch_limit)
- : max_block_size_(block_size),
- // Keep two writes in flight per scratch disk so the disks can stay busy.
- block_write_threshold_(tmp_file_mgr->NumActiveTmpDevices() * 2),
- disable_spill_(state->query_ctx().disable_spilling || block_write_threshold_ == 0
- || scratch_limit == 0),
- query_id_(state->query_id()),
- initialized_(false),
- unfullfilled_reserved_buffers_(0),
- total_pinned_buffers_(0),
- non_local_outstanding_writes_(0),
- tmp_file_group_(NULL),
- is_cancelled_(false),
- writes_issued_(0),
- debug_write_delay_ms_(0) {}
-
-Status BufferedBlockMgr::Create(RuntimeState* state, MemTracker* parent,
- RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr, int64_t mem_limit,
- int64_t block_size, shared_ptr<BufferedBlockMgr>* block_mgr) {
- DCHECK(parent != NULL);
- int64_t scratch_limit = state->query_options().scratch_limit;
- block_mgr->reset();
- {
- lock_guard<SpinLock> lock(static_block_mgrs_lock_);
- BlockMgrsMap::iterator it = query_to_block_mgrs_.find(state->query_id());
- if (it != query_to_block_mgrs_.end()) *block_mgr = it->second.lock();
- if (*block_mgr == NULL) {
- // weak_ptr::lock returns NULL if the weak_ptr is expired. This means
- // all shared_ptr references have gone to 0 and it is in the process of
- // being deleted. This can happen if the last shared reference is released
- // but before the weak ptr is removed from the map.
- block_mgr->reset(
- new BufferedBlockMgr(state, tmp_file_mgr, block_size, scratch_limit));
- query_to_block_mgrs_[state->query_id()] = *block_mgr;
- }
- }
- (*block_mgr)
- ->Init(state->io_mgr(), tmp_file_mgr, profile, parent, mem_limit, scratch_limit);
- return Status::OK();
-}
-
-int64_t BufferedBlockMgr::available_buffers(Client* client) const {
- int64_t unused_reserved = client->num_reserved_buffers_ +
- client->num_tmp_reserved_buffers_ - client->num_pinned_buffers_;
- return max<int64_t>(0, remaining_unreserved_buffers()) +
- max<int64_t>(0, unused_reserved);
-}
-
-int64_t BufferedBlockMgr::remaining_unreserved_buffers() const {
- int64_t num_buffers = free_io_buffers_.size() +
- unpinned_blocks_.size() + non_local_outstanding_writes_;
- num_buffers += mem_tracker_->SpareCapacity() / max_block_size();
- num_buffers -= unfullfilled_reserved_buffers_;
- return num_buffers;
-}
-
-Status BufferedBlockMgr::RegisterClient(const string& debug_info,
- int num_reserved_buffers, bool tolerates_oversubscription, MemTracker* tracker,
- RuntimeState* state, Client** client) {
- DCHECK_GE(num_reserved_buffers, 0);
- Client* aClient = new Client(debug_info, this, num_reserved_buffers,
- tolerates_oversubscription, tracker, state);
- lock_guard<mutex> lock(lock_);
- *client = obj_pool_.Add(aClient);
- unfullfilled_reserved_buffers_ += num_reserved_buffers;
- return Status::OK();
-}
-
-void BufferedBlockMgr::ClearReservations(Client* client) {
- lock_guard<mutex> lock(lock_);
- if (client->num_pinned_buffers_ < client->num_reserved_buffers_) {
- unfullfilled_reserved_buffers_ -=
- client->num_reserved_buffers_ - client->num_pinned_buffers_;
- }
- client->num_reserved_buffers_ = 0;
-
- unfullfilled_reserved_buffers_ -= client->num_tmp_reserved_buffers_;
- client->num_tmp_reserved_buffers_ = 0;
-}
-
-bool BufferedBlockMgr::TryAcquireTmpReservation(Client* client, int num_buffers) {
- lock_guard<mutex> lock(lock_);
- DCHECK_EQ(client->num_tmp_reserved_buffers_, 0);
- if (client->num_pinned_buffers_ < client->num_reserved_buffers_) {
- // If client has unused reserved buffers, we use those first.
- num_buffers -= client->num_reserved_buffers_ - client->num_pinned_buffers_;
- }
- if (num_buffers < 0) return true;
- if (available_buffers(client) < num_buffers) return false;
-
- client->num_tmp_reserved_buffers_ = num_buffers;
- unfullfilled_reserved_buffers_ += num_buffers;
- return true;
-}
-
-bool BufferedBlockMgr::ConsumeMemory(Client* client, int64_t size) {
- int64_t buffers_needed = BitUtil::Ceil(size, max_block_size());
- if (UNLIKELY(!BitUtil::IsNonNegative32Bit(buffers_needed))) {
- VLOG_QUERY << "Trying to consume " << size << " which is out of range.";
- return false;
- }
- DCHECK_GT(buffers_needed, 0) << "Trying to consume 0 memory";
-
- unique_lock<mutex> lock(lock_);
- if (size < max_block_size() && mem_tracker_->TryConsume(size)) {
- // For small allocations (less than a block size), just let the allocation through.
- client->tracker_->ConsumeLocal(size, client->query_tracker_);
- return true;
- }
-
- if (max<int64_t>(0, remaining_unreserved_buffers()) +
- client->num_tmp_reserved_buffers_ < buffers_needed) {
- return false;
- }
-
- if (mem_tracker_->TryConsume(size)) {
- // There was still unallocated memory, don't need to recycle allocated blocks.
- client->tracker_->ConsumeLocal(size, client->query_tracker_);
- return true;
- }
-
- // Bump up client->num_tmp_reserved_buffers_ to satisfy this request. We don't want
- // another client to grab the buffer.
- int additional_tmp_reservations = 0;
- if (client->num_tmp_reserved_buffers_ < buffers_needed) {
- additional_tmp_reservations = buffers_needed - client->num_tmp_reserved_buffers_;
- client->num_tmp_reserved_buffers_ += additional_tmp_reservations;
- unfullfilled_reserved_buffers_ += additional_tmp_reservations;
- }
-
- // Loop until we have freed enough memory.
- // We free all the memory at the end. We don't want another component to steal the
- // memory.
- int buffers_acquired = 0;
- do {
- BufferDescriptor* buffer_desc = NULL;
- Status s = FindBuffer(lock, &buffer_desc); // This waits on the lock.
- if (buffer_desc == NULL) break;
- DCHECK(s.ok());
- all_io_buffers_.erase(buffer_desc->all_buffers_it);
- if (buffer_desc->block != NULL) buffer_desc->block->buffer_desc_ = NULL;
- delete[] buffer_desc->buffer;
- ++buffers_acquired;
- } while (buffers_acquired != buffers_needed);
-
- Status status = Status::OK();
- if (buffers_acquired == buffers_needed) status = WriteUnpinnedBlocks();
- // If we either couldn't acquire enough buffers or WriteUnpinnedBlocks() failed, undo
- // the reservation.
- if (buffers_acquired != buffers_needed || !status.ok()) {
- if (!status.ok() && !status.IsCancelled()) {
- VLOG_QUERY << "Query: " << query_id_ << " write unpinned buffers failed.";
- client->state_->LogError(status.msg());
- }
- client->num_tmp_reserved_buffers_ -= additional_tmp_reservations;
- unfullfilled_reserved_buffers_ -= additional_tmp_reservations;
- mem_tracker_->Release(buffers_acquired * max_block_size());
- return false;
- }
-
- client->num_tmp_reserved_buffers_ -= buffers_acquired;
- unfullfilled_reserved_buffers_ -= buffers_acquired;
-
- DCHECK_GE(buffers_acquired * max_block_size(), size);
- mem_tracker_->Release(buffers_acquired * max_block_size());
- if (!mem_tracker_->TryConsume(size)) return false;
- client->tracker_->ConsumeLocal(size, client->query_tracker_);
- DCHECK(Validate()) << endl << DebugInternal();
- return true;
-}
-
-void BufferedBlockMgr::ReleaseMemory(Client* client, int64_t size) {
- mem_tracker_->Release(size);
- client->tracker_->ReleaseLocal(size, client->query_tracker_);
-}
-
-void BufferedBlockMgr::Cancel() {
- {
- lock_guard<mutex> lock(lock_);
- if (is_cancelled_) return;
- is_cancelled_ = true;
- }
-}
-
-bool BufferedBlockMgr::IsCancelled() {
- lock_guard<mutex> lock(lock_);
- return is_cancelled_;
-}
-
-Status BufferedBlockMgr::MemLimitTooLowError(Client* client, int node_id) {
- VLOG_QUERY << "Query: " << query_id_ << ". Node=" << node_id
- << " ran out of memory: " << endl
- << DebugInternal() << endl << client->DebugString();
- int64_t min_memory = client->num_reserved_buffers_ * max_block_size();
- string msg = Substitute(
- "The memory limit is set too low to initialize spilling operator (id=$0). The "
- "minimum required memory to spill this operator is $1.",
- node_id, PrettyPrinter::Print(min_memory, TUnit::BYTES));
- return client->tracker_->MemLimitExceeded(client->state_, msg);
-}
-
-Status BufferedBlockMgr::GetNewBlock(Client* client, Block* unpin_block, Block** block,
- int64_t len) {
- DCHECK_LE(len, max_block_size_) << "Cannot request block bigger than max_len";
- DCHECK_NE(len, 0) << "Cannot request block of zero size";
- *block = NULL;
- Block* new_block = NULL;
- Status status;
-
- {
- lock_guard<mutex> lock(lock_);
- if (is_cancelled_) return Status::CANCELLED;
- new_block = GetUnusedBlock(client);
- DCHECK(new_block->Validate()) << endl << new_block->DebugString();
- DCHECK_EQ(new_block->client_, client);
- DCHECK_NE(new_block, unpin_block);
-
- if (len > 0 && len < max_block_size_) {
- DCHECK(unpin_block == NULL);
- if (client->tracker_->TryConsume(len)) {
- uint8_t* buffer = new uint8_t[len];
- // Descriptors for non-I/O sized buffers are deleted when the block is deleted.
- new_block->buffer_desc_ = new BufferDescriptor(buffer, len);
- new_block->buffer_desc_->block = new_block;
- new_block->is_pinned_ = true;
- client->PinBuffer(new_block->buffer_desc_);
- ++total_pinned_buffers_;
- *block = new_block;
- return Status::OK();
- } else {
- status = Status::OK();
- goto no_buffer_avail;
- }
- }
- }
-
- bool in_mem;
- status = FindBufferForBlock(new_block, &in_mem);
- if (!status.ok()) goto no_buffer_avail;
- DCHECK(!in_mem) << "A new block cannot start in mem.";
- DCHECK(!new_block->is_pinned() || new_block->buffer_desc_ != NULL)
- << new_block->DebugString();
-
- if (!new_block->is_pinned()) {
- if (unpin_block == NULL) {
- // We couldn't get a new block and no unpin block was provided. Can't return
- // a block.
- status = Status::OK();
- goto no_buffer_avail;
- } else {
- // We need to transfer the buffer from unpin_block to new_block.
- status = TransferBuffer(new_block, unpin_block, true);
- if (!status.ok()) goto no_buffer_avail;
- }
- } else if (unpin_block != NULL) {
- // Got a new block without needing to transfer. Just unpin this block.
- status = unpin_block->Unpin();
- if (!status.ok()) goto no_buffer_avail;
- }
-
- DCHECK(new_block->is_pinned());
- *block = new_block;
- return Status::OK();
-
-no_buffer_avail:
- DCHECK(new_block != NULL);
- DeleteBlock(new_block);
- return status;
-}
-
-Status BufferedBlockMgr::TransferBuffer(Block* dst, Block* src, bool unpin) {
- Status status = Status::OK();
- DCHECK(dst != NULL);
- DCHECK(src != NULL);
- unique_lock<mutex> lock(lock_);
-
- DCHECK(src->is_pinned_);
- DCHECK(!dst->is_pinned_);
- DCHECK(dst->buffer_desc_ == NULL);
- DCHECK_EQ(src->buffer_desc_->len, max_block_size_);
-
- // Ensure that there aren't any writes in flight for 'src'.
- WaitForWrite(lock, src);
- src->is_pinned_ = false;
-
- if (unpin) {
- // First write out the src block so we can grab its buffer.
- src->client_local_ = true;
- status = WriteUnpinnedBlock(src);
- if (!status.ok()) {
- // The transfer failed, return the buffer to src.
- src->is_pinned_ = true;
- return status;
- }
- // Wait for the write to complete.
- WaitForWrite(lock, src);
- if (is_cancelled_) {
- // We can't be sure the write succeeded, so return the buffer to src.
- src->is_pinned_ = true;
- return Status::CANCELLED;
- }
- DCHECK(!src->in_write_);
- }
- // Assign the buffer to the new block.
- dst->buffer_desc_ = src->buffer_desc_;
- dst->buffer_desc_->block = dst;
- src->buffer_desc_ = NULL;
- dst->is_pinned_ = true;
- if (!unpin) DeleteBlockLocked(lock, src);
- return Status::OK();
-}
-
-BufferedBlockMgr::~BufferedBlockMgr() {
- shared_ptr<BufferedBlockMgr> other_mgr_ptr;
- {
- lock_guard<SpinLock> lock(static_block_mgrs_lock_);
- BlockMgrsMap::iterator it = query_to_block_mgrs_.find(query_id_);
- // IMPALA-2286: Another fragment may have called Create() for this query_id_ and
- // saw that this BufferedBlockMgr is being destructed. That fragement will
- // overwrite the map entry for query_id_, pointing it to a different
- // BufferedBlockMgr object. We should let that object's destructor remove the
- // entry. On the other hand, if the second BufferedBlockMgr is destructed before
- // this thread acquires the lock, then we'll remove the entry (because we can't
- // distinguish between the two expired pointers), and when the other
- // ~BufferedBlockMgr() call occurs, it won't find an entry for this query_id_.
- if (it != query_to_block_mgrs_.end()) {
- other_mgr_ptr = it->second.lock();
- if (other_mgr_ptr.get() == NULL) {
- // The BufferBlockMgr object referenced by this entry is being deconstructed.
- query_to_block_mgrs_.erase(it);
- } else {
- // The map references another (still valid) BufferedBlockMgr.
- DCHECK_NE(other_mgr_ptr.get(), this);
- }
- }
- }
- // IMPALA-4274: releasing the reference count can recursively call ~BufferedBlockMgr().
- // Do not do that with 'static_block_mgrs_lock_' held.
- other_mgr_ptr.reset();
-
- // Delete tmp files and cancel any in-flight writes.
- tmp_file_group_->Close();
-
- // If there are any outstanding writes and we are here it means that when the
- // WriteComplete() callback gets executed it is going to access invalid memory.
- // See IMPALA-1890.
- DCHECK_EQ(non_local_outstanding_writes_, 0) << endl << DebugInternal();
-
- // Validate that clients deleted all of their blocks. Since all writes have
- // completed at this point, any deleted blocks should be in unused_blocks_.
- for (auto it = all_blocks_.begin(); it != all_blocks_.end(); ++it) {
- Block* block = *it;
- DCHECK(block->Validate()) << block->DebugString();
- DCHECK(unused_blocks_.Contains(block)) << block->DebugString();
- }
-
- // Free memory resources.
- for (BufferDescriptor* buffer: all_io_buffers_) {
- mem_tracker_->Release(buffer->len);
- delete[] buffer->buffer;
- }
- DCHECK_EQ(mem_tracker_->consumption(), 0);
- mem_tracker_->UnregisterFromParent();
- mem_tracker_.reset();
-}
-
-int64_t BufferedBlockMgr::bytes_allocated() const {
- return mem_tracker_->consumption();
-}
-
-int BufferedBlockMgr::num_pinned_buffers(Client* client) const {
- return client->num_pinned_buffers_;
-}
-
-int BufferedBlockMgr::num_reserved_buffers_remaining(Client* client) const {
- return max<int>(client->num_reserved_buffers_ - client->num_pinned_buffers_, 0);
-}
-
-MemTracker* BufferedBlockMgr::get_tracker(Client* client) const {
- return client->tracker_;
-}
-
-int64_t BufferedBlockMgr::GetNumWritesOutstanding() {
- // Acquire lock to avoid returning mid-way through WriteComplete() when the
- // state may be inconsistent.
- lock_guard<mutex> lock(lock_);
- return profile()->GetCounter("BlockWritesOutstanding")->value();
-}
-
-Status BufferedBlockMgr::DeleteOrUnpinBlock(Block* block, bool unpin) {
- if (block == NULL) {
- return IsCancelled() ? Status::CANCELLED : Status::OK();
- }
- if (unpin) {
- return block->Unpin();
- } else {
- block->Delete();
- return IsCancelled() ? Status::CANCELLED : Status::OK();
- }
-}
-
-Status BufferedBlockMgr::PinBlock(Block* block, bool* pinned, Block* release_block,
- bool unpin) {
- DCHECK(block != NULL);
- DCHECK(!block->is_deleted_);
- Status status;
- *pinned = false;
- if (block->is_pinned_) {
- *pinned = true;
- return DeleteOrUnpinBlock(release_block, unpin);
- }
-
- bool in_mem = false;
- status = FindBufferForBlock(block, &in_mem);
- if (!status.ok()) goto error;
- *pinned = block->is_pinned_;
-
- if (in_mem) {
- // The block's buffer is still in memory with the original data.
- status = CancelWrite(block);
- if (!status.ok()) goto error;
- return DeleteOrUnpinBlock(release_block, unpin);
- }
-
- if (!block->is_pinned_) {
- if (release_block == NULL) return Status::OK();
-
- if (block->buffer_desc_ != NULL) {
- // The block's buffer is still in memory but we couldn't get an additional buffer
- // because it would eat into another client's reservation. However, we can use
- // release_block's reservation, so reclaim the buffer.
- {
- lock_guard<mutex> lock(lock_);
- if (free_io_buffers_.Contains(block->buffer_desc_)) {
- DCHECK(!block->is_pinned_ && !block->in_write_ &&
- !unpinned_blocks_.Contains(block)) << endl << block->DebugString();
- free_io_buffers_.Remove(block->buffer_desc_);
- } else if (unpinned_blocks_.Contains(block)) {
- unpinned_blocks_.Remove(block);
- } else {
- DCHECK(block->in_write_);
- }
- block->is_pinned_ = true;
- *pinned = true;
- block->client_->PinBuffer(block->buffer_desc_);
- ++total_pinned_buffers_;
- status = WriteUnpinnedBlocks();
- if (!status.ok()) goto error;
- }
- status = CancelWrite(block);
- if (!status.ok()) goto error;
- return DeleteOrUnpinBlock(release_block, unpin);
- }
- // FindBufferForBlock() wasn't able to find a buffer so transfer the one from
- // 'release_block'.
- status = TransferBuffer(block, release_block, unpin);
- if (!status.ok()) goto error;
- DCHECK(!release_block->is_pinned_);
- release_block = NULL; // Handled by transfer.
- DCHECK(block->is_pinned_);
- *pinned = true;
- }
-
- DCHECK(block->write_handle_ != NULL) << block->DebugString() << endl << release_block;
-
- // The block is on disk - read it back into memory.
- if (block->valid_data_len() > 0) {
- status = tmp_file_group_->Read(block->write_handle_.get(), block->valid_data());
- if (!status.ok()) goto error;
- }
- tmp_file_group_->DestroyWriteHandle(move(block->write_handle_));
- return DeleteOrUnpinBlock(release_block, unpin);
-
-error:
- DCHECK(!status.ok());
- // Make sure to delete the block if we hit an error before calling DeleteOrUnpin().
- if (release_block != NULL && !unpin) DeleteBlock(release_block);
- return status;
-}
-
-Status BufferedBlockMgr::CancelWrite(Block* block) {
- {
- unique_lock<mutex> lock(lock_);
- DCHECK(block->buffer_desc_ != NULL);
- // If there is an in-flight write, wait for it to finish. This is sub-optimal
- // compared to just cancelling the write, but reduces the number of possible
- // code paths in this legacy code.
- WaitForWrite(lock, block);
- if (is_cancelled_) return Status::CANCELLED;
- }
- if (block->write_handle_ != NULL) {
- // Make sure the write is not in-flight.
- block->write_handle_->Cancel();
- block->write_handle_->WaitForWrite();
- // Restore the in-memory data without reading from disk (e.g. decrypt it).
- RETURN_IF_ERROR(
- tmp_file_group_->RestoreData(move(block->write_handle_), block->valid_data()));
- }
- return Status::OK();
-}
-
-Status BufferedBlockMgr::UnpinBlock(Block* block) {
- DCHECK(!block->is_deleted_) << "Unpin for deleted block.";
-
- lock_guard<mutex> unpinned_lock(lock_);
- if (is_cancelled_) return Status::CANCELLED;
- DCHECK(block->Validate()) << endl << block->DebugString();
- if (!block->is_pinned_) return Status::OK();
- DCHECK_EQ(block->buffer_desc_->len, max_block_size_) << "Can only unpin io blocks.";
- DCHECK(Validate()) << endl << DebugInternal();
- // Add 'block' to the list of unpinned blocks and set is_pinned_ to false.
- // Cache its position in the list for later removal.
- block->is_pinned_ = false;
- DCHECK(!unpinned_blocks_.Contains(block)) << " Unpin for block in unpinned list";
- if (!block->in_write_) unpinned_blocks_.Enqueue(block);
- block->client_->UnpinBuffer(block->buffer_desc_);
- if (block->client_->num_pinned_buffers_ < block->client_->num_reserved_buffers_) {
- ++unfullfilled_reserved_buffers_;
- }
- --total_pinned_buffers_;
- RETURN_IF_ERROR(WriteUnpinnedBlocks());
- DCHECK(Validate()) << endl << DebugInternal();
- DCHECK(block->Validate()) << endl << block->DebugString();
- return Status::OK();
-}
-
-Status BufferedBlockMgr::WriteUnpinnedBlocks() {
- if (disable_spill_) return Status::OK();
-
- // Assumes block manager lock is already taken.
- while (non_local_outstanding_writes_ + free_io_buffers_.size() < block_write_threshold_
- && !unpinned_blocks_.empty()) {
- // Pop a block from the back of the list (LIFO).
- Block* write_block = unpinned_blocks_.PopBack();
- write_block->client_local_ = false;
- RETURN_IF_ERROR(WriteUnpinnedBlock(write_block));
- ++non_local_outstanding_writes_;
- }
- DCHECK(Validate()) << endl << DebugInternal();
- return Status::OK();
-}
-
-Status BufferedBlockMgr::WriteUnpinnedBlock(Block* block) {
- // Assumes block manager lock is already taken.
- DCHECK(!block->is_pinned_) << block->DebugString();
- DCHECK(!block->in_write_) << block->DebugString();
- DCHECK(block->write_handle_ == NULL) << block->DebugString();
- DCHECK_EQ(block->buffer_desc_->len, max_block_size_);
-
- // The block is on disk - read it back into memory.
- RETURN_IF_ERROR(tmp_file_group_->Write(block->valid_data(),
- [this, block](const Status& write_status) { WriteComplete(block, write_status); },
- &block->write_handle_));
-
- block->in_write_ = true;
- DCHECK(block->Validate()) << endl << block->DebugString();
- outstanding_writes_counter_->Add(1);
- ++writes_issued_;
- if (writes_issued_ == 1) {
- if (ImpaladMetrics::NUM_QUERIES_SPILLED != NULL) {
- ImpaladMetrics::NUM_QUERIES_SPILLED->Increment(1);
- }
- }
- return Status::OK();
-}
-
-void BufferedBlockMgr::WaitForWrite(unique_lock<mutex>& lock, Block* block) {
- DCHECK(!block->is_deleted_);
- while (block->in_write_ && !is_cancelled_) {
- block->write_complete_cv_.wait(lock);
- }
-}
-
-void BufferedBlockMgr::WriteComplete(Block* block, const Status& write_status) {
-#ifndef NDEBUG
- if (debug_write_delay_ms_ > 0) {
- usleep(static_cast<int64_t>(debug_write_delay_ms_) * 1000);
- }
-#endif
- Status status = Status::OK();
- lock_guard<mutex> lock(lock_);
- DCHECK(Validate()) << endl << DebugInternal();
- DCHECK(is_cancelled_ || block->in_write_) << "WriteComplete() for block not in write."
- << endl
- << block->DebugString();
- DCHECK(block->buffer_desc_ != NULL);
-
- outstanding_writes_counter_->Add(-1);
- if (!block->client_local_) {
- DCHECK_GT(non_local_outstanding_writes_, 0) << block->DebugString();
- --non_local_outstanding_writes_;
- }
- block->in_write_ = false;
-
- // ReturnUnusedBlock() will clear the block, so save required state in local vars.
- // state is not valid if the block was deleted because the state may be torn down
- // after the state's fragment has deleted all of its blocks.
- RuntimeState* state = block->is_deleted_ ? NULL : block->client_->state_;
-
- // If the block was re-pinned when it was in the IOMgr queue, don't free it.
- if (block->is_pinned_) {
- // The number of outstanding writes has decreased but the number of free buffers
- // hasn't.
- DCHECK(!block->is_deleted_);
- DCHECK(!block->client_local_)
- << "Client should be waiting. No one should have pinned this block.";
- if (write_status.ok() && !is_cancelled_ && !state->is_cancelled()) {
- status = WriteUnpinnedBlocks();
- }
- } else if (block->client_local_) {
- DCHECK(!block->is_deleted_)
- << "Client should be waiting. No one should have deleted this block.";
- } else {
- DCHECK_EQ(block->buffer_desc_->len, max_block_size_)
- << "Only io sized buffers should spill";
- free_io_buffers_.Enqueue(block->buffer_desc_);
- }
-
- if (!write_status.ok() || !status.ok() || is_cancelled_) {
- VLOG_FILE << "Query: " << query_id_ << ". Write did not complete successfully: "
- "write_status="
- << write_status.GetDetail() << ", status=" << status.GetDetail()
- << ". is_cancelled_=" << is_cancelled_;
- // If the instance is already cancelled, don't confuse things with these errors.
- if (!write_status.ok() && !write_status.IsCancelled()) {
- // Report but do not attempt to recover from write error.
- VLOG_QUERY << "Query: " << query_id_ << " write complete callback with error.";
-
- if (state != NULL) state->LogError(write_status.msg());
- }
- if (!status.ok() && !status.IsCancelled()) {
- VLOG_QUERY << "Query: " << query_id_ << " error while writing unpinned blocks.";
- if (state != NULL) state->LogError(status.msg());
- }
- // Set cancelled. Threads waiting for a write will be woken up in the normal way when
- // one of the writes they are waiting for completes.
- is_cancelled_ = true;
- }
-
- // Notify any threads that may have been expecting to get block's buffer based on
- // the value of 'non_local_outstanding_writes_'. Wake them all up. If we added
- // a buffer to 'free_io_buffers_', one thread will get a buffer. All the others
- // will re-evaluate whether they should continue waiting and if another write needs
- // to be initiated.
- if (!block->client_local_) buffer_available_cv_.notify_all();
- if (block->is_deleted_) {
- // Finish the DeleteBlock() work.
- tmp_file_group_->DestroyWriteHandle(move(block->write_handle_));
- block->buffer_desc_->block = NULL;
- block->buffer_desc_ = NULL;
- ReturnUnusedBlock(block);
- block = NULL;
- } else {
- // Wake up the thread waiting on this block (if any).
- block->write_complete_cv_.notify_one();
- }
-
- DCHECK(Validate()) << endl << DebugInternal();
-}
-
-void BufferedBlockMgr::DeleteBlock(Block* block) {
- unique_lock<mutex> lock(lock_);
- DeleteBlockLocked(lock, block);
-}
-
-void BufferedBlockMgr::DeleteBlockLocked(const unique_lock<mutex>& lock, Block* block) {
- DCHECK(lock.mutex() == &lock_ && lock.owns_lock());
- DCHECK(block->Validate()) << endl << DebugInternal();
- DCHECK(!block->is_deleted_);
- block->is_deleted_ = true;
-
- if (block->is_pinned_) {
- if (block->is_max_size()) --total_pinned_buffers_;
- block->is_pinned_ = false;
- block->client_->UnpinBuffer(block->buffer_desc_);
- if (block->client_->num_pinned_buffers_ < block->client_->num_reserved_buffers_) {
- ++unfullfilled_reserved_buffers_;
- }
- } else if (unpinned_blocks_.Contains(block)) {
- // Remove block from unpinned list.
- unpinned_blocks_.Remove(block);
- }
-
- if (block->in_write_) {
- DCHECK(block->buffer_desc_ != NULL && block->buffer_desc_->len == max_block_size_)
- << "Should never be writing a small buffer";
- // If a write is still pending, cancel it and return. Cleanup will be done in
- // WriteComplete(). Cancelling the write ensures that it won't try to log to the
- // RuntimeState (which may be torn down before the block manager).
- DCHECK(block->Validate()) << endl << block->DebugString();
- return;
- }
-
- if (block->buffer_desc_ != NULL) {
- if (block->buffer_desc_->len != max_block_size_) {
- // Just delete the block for now.
- delete[] block->buffer_desc_->buffer;
- block->client_->tracker_->Release(block->buffer_desc_->len);
- delete block->buffer_desc_;
- block->buffer_desc_ = NULL;
- } else {
- if (!free_io_buffers_.Contains(block->buffer_desc_)) {
- free_io_buffers_.Enqueue(block->buffer_desc_);
- // Wake up one of the waiting threads, which will grab the buffer.
- buffer_available_cv_.notify_one();
- }
- block->buffer_desc_->block = NULL;
- block->buffer_desc_ = NULL;
- }
- }
-
- // Discard any on-disk data. The write is finished so this won't call back into
- // BufferedBlockMgr.
- if (block->write_handle_ != NULL) {
- tmp_file_group_->DestroyWriteHandle(move(block->write_handle_));
- }
- ReturnUnusedBlock(block);
- DCHECK(block->Validate()) << endl << block->DebugString();
- DCHECK(Validate()) << endl << DebugInternal();
-}
-
-void BufferedBlockMgr::ReturnUnusedBlock(Block* block) {
- DCHECK(block->is_deleted_) << block->DebugString();
- DCHECK(!block->is_pinned_) << block->DebugString();;
- DCHECK(block->buffer_desc_ == NULL);
- block->Init();
- unused_blocks_.Enqueue(block);
-}
-
-Status BufferedBlockMgr::FindBufferForBlock(Block* block, bool* in_mem) {
- DCHECK(block != NULL);
- Client* client = block->client_;
- DCHECK(client != NULL);
- DCHECK(!block->is_pinned_ && !block->is_deleted_)
- << "Pinned or deleted block " << endl << block->DebugString();
- *in_mem = false;
-
- unique_lock<mutex> l(lock_);
- if (is_cancelled_) return Status::CANCELLED;
-
- // First check if there is enough reserved memory to satisfy this request.
- bool is_reserved_request = false;
- if (client->num_pinned_buffers_ < client->num_reserved_buffers_) {
- is_reserved_request = true;
- } else if (client->num_tmp_reserved_buffers_ > 0) {
- is_reserved_request = true;
- --client->num_tmp_reserved_buffers_;
- }
-
- DCHECK(Validate()) << endl << DebugInternal();
- if (is_reserved_request) --unfullfilled_reserved_buffers_;
-
- if (!is_reserved_request && remaining_unreserved_buffers() < 1) {
- // The client already has its quota and there are no unreserved blocks left.
- // Note that even if this passes, it is still possible for the path below to
- // see OOM because another query consumed memory from the process tracker. This
- // only happens if the buffer has not already been allocated by the block mgr.
- // This check should ensure that the memory cannot be consumed by another client
- // of the block mgr.
- return Status::OK();
- }
-
- if (block->buffer_desc_ != NULL) {
- // The block is in memory. It may be in 3 states:
- // 1. In the unpinned list. The buffer will not be in the free list.
- // 2. in_write_ == true. The buffer will not be in the free list.
- // 3. The buffer is free, but hasn't yet been reassigned to a different block.
- DCHECK_EQ(block->buffer_desc_->len, max_block_size())
- << "Non-I/O blocks are always pinned";
- DCHECK(unpinned_blocks_.Contains(block) ||
- block->in_write_ ||
- free_io_buffers_.Contains(block->buffer_desc_));
- if (unpinned_blocks_.Contains(block)) {
- unpinned_blocks_.Remove(block);
- DCHECK(!free_io_buffers_.Contains(block->buffer_desc_));
- } else if (block->in_write_) {
- DCHECK(block->in_write_ && !free_io_buffers_.Contains(block->buffer_desc_));
- } else {
- free_io_buffers_.Remove(block->buffer_desc_);
- }
- buffered_pin_counter_->Add(1);
- *in_mem = true;
- } else {
- BufferDescriptor* buffer_desc = NULL;
- RETURN_IF_ERROR(FindBuffer(l, &buffer_desc));
-
- if (buffer_desc == NULL) {
- // There are no free buffers or blocks we can evict. We need to fail this request.
- // If this is an optional request, return OK. If it is required, return OOM.
- if (!is_reserved_request || client->tolerates_oversubscription_) return Status::OK();
-
- if (VLOG_QUERY_IS_ON) {
- stringstream ss;
- ss << "Query id=" << query_id_ << " was unable to get minimum required buffers."
- << endl << DebugInternal() << endl << client->DebugString();
- VLOG_QUERY << ss.str();
- }
- return client->tracker_->MemLimitExceeded(client->state_,
- "Query did not have enough memory to get the minimum required buffers in the "
- "block manager.");
- }
-
- DCHECK(buffer_desc != NULL);
- DCHECK_EQ(buffer_desc->len, max_block_size()) << "Non-I/O buffer";
- if (buffer_desc->block != NULL) {
- // This buffer was assigned to a block but now we are reusing it. Reset the
- // previous block->buffer link.
- DCHECK(buffer_desc->block->Validate()) << endl << buffer_desc->block->DebugString();
- buffer_desc->block->buffer_desc_ = NULL;
- }
- buffer_desc->block = block;
- block->buffer_desc_ = buffer_desc;
- }
- DCHECK(block->buffer_desc_ != NULL);
- DCHECK(block->buffer_desc_->len < max_block_size() || !block->is_pinned_)
- << "Trying to pin already pinned block. "
- << block->buffer_desc_->len << " " << block->is_pinned_;
- block->is_pinned_ = true;
- client->PinBuffer(block->buffer_desc_);
- ++total_pinned_buffers_;
-
- DCHECK(block->Validate()) << endl << block->DebugString();
- // The number of free buffers has decreased. Write unpinned blocks if the number
- // of free buffers is less than the threshold.
- RETURN_IF_ERROR(WriteUnpinnedBlocks());
- DCHECK(Validate()) << endl << DebugInternal();
- return Status::OK();
-}
-
-// We need to find a new buffer. We prefer getting this buffer in this order:
-// 1. Allocate a new block if the number of free blocks is less than the write threshold
-// or if we are running without spilling, until we run out of memory.
-// 2. Pick a buffer from the free list.
-// 3. Wait and evict an unpinned buffer.
-Status BufferedBlockMgr::FindBuffer(unique_lock<mutex>& lock,
- BufferDescriptor** buffer_desc) {
- DCHECK(lock.mutex() == &lock_ && lock.owns_lock());
- *buffer_desc = NULL;
-
- // First, try to allocate a new buffer.
- DCHECK(block_write_threshold_ > 0 || disable_spill_);
- if ((free_io_buffers_.size() < block_write_threshold_ || disable_spill_) &&
- mem_tracker_->TryConsume(max_block_size_)) {
- uint8_t* new_buffer = new uint8_t[max_block_size_];
- *buffer_desc = obj_pool_.Add(new BufferDescriptor(new_buffer, max_block_size_));
- (*buffer_desc)->all_buffers_it = all_io_buffers_.insert(
- all_io_buffers_.end(), *buffer_desc);
- return Status::OK();
- }
-
- // Second, try to pick a buffer from the free list.
- if (free_io_buffers_.empty()) {
- // There are no free buffers. If spills are disabled or there no unpinned blocks we
- // can write, return. We can't get a buffer.
- if (disable_spill_) {
- if (block_write_threshold_ == 0) {
- return Status("Spilling has been disabled due to no usable scratch space. "
- "Please specify a usable scratch space location via the --scratch_dirs "
- "impalad flag.");
- } else {
- return Status("Spilling has been disabled for plans that do not have stats and "
- "are not hinted to prevent potentially bad plans from using too many cluster "
- "resources. Please run COMPUTE STATS on these tables, hint the plan or "
- "disable this behavior via the DISABLE_UNSAFE_SPILLS query option.");
- }
- }
-
- // Third, this block needs to use a buffer that was unpinned from another block.
- // Get a free buffer from the front of the queue and assign it to the block.
- do {
- if (unpinned_blocks_.empty() && non_local_outstanding_writes_ == 0) {
- return Status::OK();
- }
- SCOPED_TIMER(buffer_wait_timer_);
- // Try to evict unpinned blocks before waiting.
- RETURN_IF_ERROR(WriteUnpinnedBlocks());
- DCHECK_GT(non_local_outstanding_writes_, 0) << endl << DebugInternal();
- buffer_available_cv_.wait(lock);
- if (is_cancelled_) return Status::CANCELLED;
- } while (free_io_buffers_.empty());
- }
- *buffer_desc = free_io_buffers_.Dequeue();
- return Status::OK();
-}
-
-BufferedBlockMgr::Block* BufferedBlockMgr::GetUnusedBlock(Client* client) {
- DCHECK(client != NULL);
- Block* new_block = NULL;
- if (unused_blocks_.empty()) {
- new_block = obj_pool_.Add(new Block(this));
- all_blocks_.push_back(new_block);
- new_block->Init();
- created_block_counter_->Add(1);
- } else {
- new_block = unused_blocks_.Dequeue();
- recycled_blocks_counter_->Add(1);
- }
- DCHECK(new_block != NULL);
- new_block->client_ = client;
- return new_block;
-}
-
-bool BufferedBlockMgr::Validate() const {
- int num_free_io_buffers = 0;
-
- if (total_pinned_buffers_ < 0) {
- LOG(ERROR) << "total_pinned_buffers_ < 0: " << total_pinned_buffers_;
- return false;
- }
-
- for (BufferDescriptor* buffer: all_io_buffers_) {
- bool is_free = free_io_buffers_.Contains(buffer);
- num_free_io_buffers += is_free;
-
- if (*buffer->all_buffers_it != buffer) {
- LOG(ERROR) << "All buffers list is corrupt. Buffer iterator is not valid.";
- return false;
- }
-
- if (buffer->block == NULL && !is_free) {
- LOG(ERROR) << "Buffer with no block not in free list." << endl << DebugInternal();
- return false;
- }
-
- if (buffer->len != max_block_size_) {
- LOG(ERROR) << "Non-io sized buffers should not end up on free list.";
- return false;
- }
-
- if (buffer->block != NULL) {
- if (buffer->block->buffer_desc_ != buffer) {
- LOG(ERROR) << "buffer<->block pointers inconsistent. Buffer: " << buffer
- << endl << buffer->block->DebugString();
- return false;
- }
-
- if (!buffer->block->Validate()) {
- LOG(ERROR) << "buffer->block inconsistent."
- << endl << buffer->block->DebugString();
- return false;
- }
-
- if (is_free && (buffer->block->is_pinned_ || buffer->block->in_write_ ||
- unpinned_blocks_.Contains(buffer->block))) {
- LOG(ERROR) << "Block with buffer in free list and"
- << " is_pinned_ = " << buffer->block->is_pinned_
- << " in_write_ = " << buffer->block->in_write_
- << " Unpinned_blocks_.Contains = "
- << unpinned_blocks_.Contains(buffer->block)
- << endl << buffer->block->DebugString();
- return false;
- }
- }
- }
-
- if (free_io_buffers_.size() != num_free_io_buffers) {
- LOG(ERROR) << "free_buffer_list_ inconsistency."
- << " num_free_io_buffers = " << num_free_io_buffers
- << " free_io_buffers_.size() = " << free_io_buffers_.size()
- << endl << DebugInternal();
- return false;
- }
-
- Block* block = unpinned_blocks_.head();
- while (block != NULL) {
- if (!block->Validate()) {
- LOG(ERROR) << "Block inconsistent in unpinned list."
- << endl << block->DebugString();
- return false;
- }
-
- if (block->in_write_ || free_io_buffers_.Contains(block->buffer_desc_)) {
- LOG(ERROR) << "Block in unpinned list with"
- << " in_write_ = " << block->in_write_
- << " free_io_buffers_.Contains = "
- << free_io_buffers_.Contains(block->buffer_desc_)
- << endl << block->DebugString();
- return false;
- }
- block = block->Next();
- }
-
- // Check if we're writing blocks when the number of free buffers is less than
- // the write threshold. We don't write blocks after cancellation.
- if (!is_cancelled_ && !unpinned_blocks_.empty() && !disable_spill_ &&
- (free_io_buffers_.size() + non_local_outstanding_writes_ <
- block_write_threshold_)) {
- // TODO: this isn't correct when WriteUnpinnedBlocks() fails during the call to
- // WriteUnpinnedBlock() so just log the condition but don't return false. Figure
- // out a way to re-enable this change?
- LOG(ERROR) << "Missed writing unpinned blocks";
- }
- return true;
-}
-
-string BufferedBlockMgr::DebugString(Client* client) {
- stringstream ss;
- unique_lock<mutex> l(lock_);
- ss << DebugInternal();
- if (client != NULL) ss << endl << client->DebugString();
- return ss.str();
-}
-
-string BufferedBlockMgr::DebugInternal() const {
- stringstream ss;
- ss << "Buffered block mgr " << this << endl
- << " Num writes outstanding: " << outstanding_writes_counter_->value() << endl
- << " Num free io buffers: " << free_io_buffers_.size() << endl
- << " Num unpinned blocks: " << unpinned_blocks_.size() << endl
- << " Num available buffers: " << remaining_unreserved_buffers() << endl
- << " Total pinned buffers: " << total_pinned_buffers_ << endl
- << " Unfullfilled reserved buffers: " << unfullfilled_reserved_buffers_ << endl
- << " Remaining memory: " << mem_tracker_->SpareCapacity()
- << " (#blocks=" << (mem_tracker_->SpareCapacity() / max_block_size_) << ")" << endl
- << " Block write threshold: " << block_write_threshold_;
- if (tmp_file_group_ != NULL) ss << tmp_file_group_->DebugString();
- return ss.str();
-}
-
-void BufferedBlockMgr::Init(DiskIoMgr* io_mgr, TmpFileMgr* tmp_file_mgr,
- RuntimeProfile* parent_profile, MemTracker* parent_tracker, int64_t mem_limit,
- int64_t scratch_limit) {
- unique_lock<mutex> l(lock_);
- if (initialized_) return;
-
- profile_.reset(new RuntimeProfile(&obj_pool_, "BlockMgr"));
- parent_profile->AddChild(profile_.get());
-
- tmp_file_group_.reset(new TmpFileMgr::FileGroup(
- tmp_file_mgr, io_mgr, profile_.get(), query_id_, scratch_limit));
-
- mem_limit_counter_ = ADD_COUNTER(profile_.get(), "MemoryLimit", TUnit::BYTES);
- mem_limit_counter_->Set(mem_limit);
- block_size_counter_ = ADD_COUNTER(profile_.get(), "MaxBlockSize", TUnit::BYTES);
- block_size_counter_->Set(max_block_size_);
- created_block_counter_ = ADD_COUNTER(profile_.get(), "BlocksCreated", TUnit::UNIT);
- recycled_blocks_counter_ = ADD_COUNTER(profile_.get(), "BlocksRecycled", TUnit::UNIT);
- outstanding_writes_counter_ =
- ADD_COUNTER(profile_.get(), "BlockWritesOutstanding", TUnit::UNIT);
- buffered_pin_counter_ = ADD_COUNTER(profile_.get(), "BufferedPins", TUnit::UNIT);
- buffer_wait_timer_ = ADD_TIMER(profile_.get(), "TotalBufferWaitTime");
-
- // Create a new mem_tracker and allocate buffers.
- mem_tracker_.reset(
- new MemTracker(profile(), mem_limit, "Block Manager", parent_tracker));
-
- initialized_ = true;
-}
-
-} // namespace impala
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-block-mgr.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-block-mgr.h b/be/src/runtime/buffered-block-mgr.h
deleted file mode 100644
index ab05329..0000000
--- a/be/src/runtime/buffered-block-mgr.h
+++ /dev/null
@@ -1,606 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef IMPALA_RUNTIME_BUFFERED_BLOCK_MGR
-#define IMPALA_RUNTIME_BUFFERED_BLOCK_MGR
-
-#include "runtime/disk-io-mgr.h"
-#include "runtime/tmp-file-mgr.h"
-#include "util/mem-range.h"
-
-#include <boost/unordered_map.hpp>
-
-namespace impala {
-
-class RuntimeState;
-
-/// The BufferedBlockMgr is used to allocate and manage blocks of data using a fixed memory
-/// budget. Available memory is split into a pool of fixed-size memory buffers. When a
-/// client allocates or requests a block, the block is assigned a buffer from this pool and
-/// is 'pinned' in memory. Clients can also unpin a block, allowing the manager to reassign
-/// its buffer to a different block.
-//
-/// The BufferedBlockMgr typically allocates blocks in IO buffer size to get maximal IO
-/// efficiency when spilling. Clients can also request smaller buffers that cannot spill
-/// (note that it would be possible to spill small buffers, but we currently do not allow
-/// it). This is useful to present the same block API and mem tracking for clients (one can
-/// use the block mgr API to mem track non-spillable (smaller) buffers). Clients that do
-/// partitioning (e.g. PHJ and PAGG) will start with these smaller buffer sizes to reduce
-/// the minimum buffering requirements and grow to max sized buffers as the input grows.
-/// For simplicity, these small buffers are not recycled (there's also not really a need
-/// since they are allocated all at once on query startup). These buffers are not counted
-/// against the reservation.
-//
-/// The BufferedBlockMgr reserves one buffer per disk ('block_write_threshold_') for
-/// itself. When the number of free buffers falls below 'block_write_threshold', unpinned
-/// blocks are flushed in Last-In-First-Out order. (It is assumed that unpinned blocks are
-/// re-read in FIFO order). The TmpFileMgr is used to obtain file handles to write to
-/// within the tmp directories configured for Impala.
-//
-/// It is expected to have one BufferedBlockMgr per query. All allocations that can grow
-/// proportional to the input size and that might need to spill to disk should allocate
-/// from the same BufferedBlockMgr.
-//
-/// A client must pin a block in memory to read/write its contents and unpin it when it is
-/// no longer in active use. The BufferedBlockMgr guarantees that:
-/// a) The memory buffer assigned to a block is not removed or released while it is pinned.
-/// b) The contents of an unpinned block will be available on a subsequent call to pin.
-//
-/// The Client supports the following operations:
-/// GetNewBlock(): Returns a new pinned block.
-/// Close(): Frees all memory and disk space. Called when a query is closed or cancelled.
-/// Close() is idempotent.
-//
-/// A Block supports the following operations:
-/// Pin(): Pins a block to a buffer in memory, and reads its contents from disk if
-/// necessary. If there are no free buffers, waits for a buffer to become available.
-/// Invoked before the contents of a block are read or written. The block
-/// will be maintained in memory until Unpin() is called.
-/// Unpin(): Invoked to indicate the block is not in active use. The block is added to a
-/// list of unpinned blocks. Unpinned blocks are only written when the number of free
-/// blocks falls below the 'block_write_threshold'.
-/// Delete(): Invoked to deallocate a block. The buffer associated with the block is
-/// immediately released and its on-disk location (if any) reused. All blocks must be
-/// deleted before the block manager is torn down.
-///
-/// The block manager is thread-safe with the following caveat: A single block cannot be
-/// used simultaneously by multiple clients in any capacity.
-/// However, the block manager client is not thread-safe. That is, the block manager
-/// allows multiple single-threaded block manager clients.
-///
-/// TODO: replace with BufferPool.
-class BufferedBlockMgr {
- private:
- struct BufferDescriptor;
-
- public:
- /// A client of the BufferedBlockMgr. There is a single BufferedBlockMgr per plan
- /// fragment and all operators that need blocks from it should use a separate client.
- /// Each client has the option to reserve a number of blocks that it can claim later.
- /// The remaining memory that is not reserved by any clients is free for all and
- /// available to all clients.
- /// This is an opaque handle.
- struct Client;
-
- /// A fixed-size block of data that may be be persisted to disk. The state of the block
- /// is maintained by the block manager and is described by 3 bools:
- /// is_pinned_ = True if the block is pinned. The block has a non-null buffer_desc_,
- /// buffer_desc_ cannot be in the free buffer list and the block cannot be in
- /// unused_blocks_ or unpinned_blocks_. Newly allocated blocks are pinned.
- /// in_write_ = True if a write has been issued but not completed for this block.
- /// The block cannot be in the unpinned_blocks_ and must have a non-null buffer_desc_
- /// that's not in the free buffer list. It may be pinned or unpinned.
- /// is_deleted_ = True if Delete() has been called on a block. After this, no API call
- /// is valid on the block.
- //
- /// Pin() and Unpin() can be invoked on a block any number of times before Delete().
- /// When a pinned block is unpinned for the first time, it is added to the
- /// unpinned_blocks_ list and its buffer is removed from the free list.
- /// If it is pinned or deleted at any time while it is on the unpinned list, it is
- /// simply removed from that list. When it is dequeued from that list and enqueued
- /// for writing, in_write_ is set to true. The block may be pinned, unpinned or deleted
- /// while in_write_ is true. After the write has completed, the block's buffer will be
- /// returned to the free buffer list if it is no longer pinned, and the block itself
- /// will be put on the unused blocks list if Delete() was called.
- //
- /// A block MUST have a non-null buffer_desc_ if
- /// a) is_pinned_ is true (i.e. the client is using it), or
- /// b) in_write_ is true, (i.e. IO mgr is using it), or
- /// c) It is on the unpinned list (buffer has not been persisted.)
- //
- /// In addition to the block manager API, Block exposes Allocate(), ReturnAllocation()
- /// and BytesRemaining() to allocate and free memory within a block, and buffer() and
- /// valid_data_len() to read/write the contents of a block. These are not thread-safe.
- class Block : public InternalQueue<Block>::Node {
- public:
- /// Pins a block in memory--assigns a free buffer to a block and reads it from disk if
- /// necessary. If there are no free blocks and no unpinned blocks, '*pinned' is set to
- /// false and the block is not pinned. If 'release_block' is non-NULL, if there is
- /// memory pressure, this block will be pinned using the buffer from 'release_block'.
- /// If 'unpin' is true, 'release_block' will be unpinned (regardless of whether or not
- /// the buffer was used for this block). If 'unpin' is false, 'release_block' is
- /// deleted. 'release_block' must be pinned. If an error occurs and 'unpin' was false,
- /// 'release_block' is always deleted. If 'unpin' was true and an error occurs,
- /// 'release_block' may be left pinned or unpinned.
- Status Pin(bool* pinned, Block* release_block = NULL, bool unpin = true);
-
- /// Unpins a block by adding it to the list of unpinned blocks maintained by the block
- /// manager. An unpinned block must be flushed before its buffer is released or
- /// assigned to a different block. Is non-blocking.
- Status Unpin();
-
- /// Delete a block. Its buffer is released and on-disk location can be over-written.
- /// Non-blocking.
- void Delete();
-
- void AddRow() { ++num_rows_; }
- int num_rows() const { return num_rows_; }
-
- /// Allocates the specified number of bytes from this block.
- template <typename T> T* Allocate(int size) {
- DCHECK_GE(BytesRemaining(), size);
- uint8_t* current_location = buffer_desc_->buffer + valid_data_len_;
- valid_data_len_ += size;
- return reinterpret_cast<T*>(current_location);
- }
-
- /// Return the number of remaining bytes that can be allocated in this block.
- int BytesRemaining() const {
- DCHECK(buffer_desc_ != NULL);
- return buffer_desc_->len - valid_data_len_;
- }
-
- /// Return size bytes from the most recent allocation.
- void ReturnAllocation(int size) {
- DCHECK_GE(valid_data_len_, size);
- valid_data_len_ -= size;
- }
-
- /// Pointer to start of the block data in memory. Only guaranteed to be valid if the
- /// block is pinned.
- uint8_t* buffer() const {
- DCHECK(buffer_desc_ != NULL);
- return buffer_desc_->buffer;
- }
-
- /// Returns a reference to the valid data in the block's buffer. Only guaranteed to
- /// be valid if the block is pinned.
- MemRange valid_data() const {
- DCHECK(buffer_desc_ != NULL);
- return MemRange(buffer_desc_->buffer, valid_data_len_);
- }
-
- /// Return the number of bytes allocated in this block.
- int64_t valid_data_len() const { return valid_data_len_; }
-
- /// Returns the length of the underlying buffer. Only callable if the block is
- /// pinned.
- int64_t buffer_len() const {
- DCHECK(is_pinned());
- return buffer_desc_->len;
- }
-
- /// Returns true if this block is the max block size. Only callable if the block
- /// is pinned.
- bool is_max_size() const {
- DCHECK(is_pinned());
- return buffer_desc_->len == block_mgr_->max_block_size();
- }
-
- bool is_pinned() const { return is_pinned_; }
-
- /// Path of temporary file backing the block. Intended for use in testing.
- /// Returns empty string if no backing file allocated.
- std::string TmpFilePath() const;
-
- /// Debug helper method to print the state of a block.
- std::string DebugString() const;
-
- private:
- friend class BufferedBlockMgr;
-
- Block(BufferedBlockMgr* block_mgr);
-
- /// Initialize the state of a block and set the number of bytes allocated to 0.
- void Init();
-
- /// Debug helper method to validate the state of a block. block_mgr_ lock must already
- /// be taken.
- bool Validate() const;
-
- /// Pointer to the buffer associated with the block. NULL if the block is not in
- /// memory and cannot be changed while the block is pinned or being written.
- BufferDescriptor* buffer_desc_;
-
- /// Parent block manager object. Responsible for maintaining the state of the block.
- BufferedBlockMgr* block_mgr_;
-
- /// The client that owns this block.
- Client* client_;
-
- /// Non-NULL when the block data is written to scratch or is in the process of being
- /// written.
- std::unique_ptr<TmpFileMgr::WriteHandle> write_handle_;
-
- /// Length of valid (i.e. allocated) data within the block.
- int64_t valid_data_len_;
-
- /// Number of rows in this block.
- int num_rows_;
-
- /// Block state variables. The block's buffer can be freed only if is_pinned_ and
- /// in_write_ are both false.
-
- /// is_pinned_ is true while the block is pinned by a client.
- bool is_pinned_;
-
- /// in_write_ is set to true when the block is enqueued for writing via DiskIoMgr,
- /// and set to false when the write is complete.
- bool in_write_;
-
- /// True if the block is deleted by the client.
- bool is_deleted_;
-
- /// Condition variable to wait for the write to this block to finish. If 'in_write_'
- /// is true, notify_one() will eventually be called on this condition variable. Only
- /// on thread should wait on this cv at a time.
- boost::condition_variable write_complete_cv_;
-
- /// If true, this block is being written out so the underlying buffer can be
- /// transferred to another block from the same client. We don't want this buffer
- /// getting picked up by another client.
- bool client_local_;
- }; // class Block
-
- /// Create a block manager with the specified mem_limit. If a block mgr with the
- /// same query id has already been created, that block mgr is returned.
- /// - mem_limit: maximum memory that will be used by the block mgr.
- /// - buffer_size: maximum size of each buffer.
- static Status Create(RuntimeState* state, MemTracker* parent,
- RuntimeProfile* profile, TmpFileMgr* tmp_file_mgr, int64_t mem_limit,
- int64_t buffer_size, std::shared_ptr<BufferedBlockMgr>* block_mgr);
-
- ~BufferedBlockMgr();
-
- /// Registers a client with 'num_reserved_buffers'. The returned client is owned
- /// by the BufferedBlockMgr and has the same lifetime as it.
- /// We allow oversubscribing the reserved buffers. It is likely that the
- /// 'num_reserved_buffers' will be very pessimistic for small queries and we don't want
- /// to
- /// fail all of them with mem limit exceeded.
- /// The min reserved buffers is often independent of data size and we still want
- /// to run small queries with very small limits.
- /// Buffers used by this client are reflected in tracker.
- /// 'tolerates_oversubscription' determines how oversubscription is handled. If true,
- /// failure to allocate a reserved buffer is not an error. If false, failure to
- /// allocate a reserved buffer is a MEM_LIMIT_EXCEEDED error.
- /// 'debug_info' is a string that will be printed in debug messages and errors to
- /// identify the client.
- Status RegisterClient(const std::string& debug_info, int num_reserved_buffers,
- bool tolerates_oversubscription, MemTracker* tracker, RuntimeState* state,
- Client** client);
-
- /// Clears all reservations for this client.
- void ClearReservations(Client* client);
-
- /// Tries to acquire a one-time reservation of num_buffers. The semantics are:
- /// - If this call fails, the next 'num_buffers' calls to Pin()/GetNewBlock() might
- /// not have enough memory.
- /// - If this call succeeds, the next 'num_buffers' call to Pin()/GetNewBlock() will
- /// be guaranteed to get the block. Once these blocks have been pinned, the
- /// reservation from this call has no more effect.
- /// Blocks coming from the tmp reservation also count towards the regular reservation.
- /// This is useful to Pin() a number of blocks and guarantee all or nothing behavior.
- bool TryAcquireTmpReservation(Client* client, int num_buffers);
-
- /// Return a new pinned block. If there is no memory for this block, *block will be set
- /// to NULL.
- /// If len > 0, GetNewBlock() will return a block with a buffer of size len. len
- /// must be less than max_block_size and this block cannot be unpinned.
- /// This function will try to allocate new memory for the block up to the limit.
- /// Otherwise it will (conceptually) write out an unpinned block and use that memory.
- /// The caller can pass a non-NULL 'unpin_block' to transfer memory from 'unpin_block'
- /// to the new block. If 'unpin_block' is non-NULL, the new block can never fail to
- /// get a buffer. The semantics of this are:
- /// - If 'unpin_block' is non-NULL, it must be pinned.
- /// - If the call succeeds, 'unpin_block' is unpinned.
- /// - If there is no memory pressure, block will get a newly allocated buffer.
- /// - If there is memory pressure, block will get the buffer from 'unpin_block'.
- Status GetNewBlock(Client* client, Block* unpin_block, Block** block, int64_t len = -1);
-
- /// Test helper to cancel the block mgr. All subsequent calls that return a Status fail
- /// with Status::CANCELLED. Idempotent.
- void Cancel();
-
- /// Returns true if the block manager was cancelled.
- bool IsCancelled();
-
- /// Dumps block mgr state. Grabs lock. If client is not NULL, also dumps its state.
- std::string DebugString(Client* client = NULL);
-
- /// Consumes 'size' bytes from the buffered block mgr. This is used by callers that want
- /// the memory to come from the block mgr pool (and therefore trigger spilling) but need
- /// the allocation to be more flexible than blocks. Buffer space reserved with
- /// TryAcquireTmpReservation may be used to fulfill the request if available. If the
- /// request is unsuccessful, that temporary buffer space is not consumed.
- /// Returns false if there was not enough memory.
- ///
- /// This is used only for the Buckets structure in the hash table, which cannot be
- /// segmented into blocks.
- bool ConsumeMemory(Client* client, int64_t size);
-
- /// All successful allocates bytes from ConsumeMemory() must have a corresponding
- /// ReleaseMemory() call.
- void ReleaseMemory(Client* client, int64_t size);
-
- /// Returns a MEM_LIMIT_EXCEEDED error which includes the minimum memory required by
- /// this 'client' that acts on behalf of the node with id 'node_id'. 'node_id' is used
- /// only for error reporting.
- Status MemLimitTooLowError(Client* client, int node_id);
-
- int num_pinned_buffers(Client* client) const;
- int num_reserved_buffers_remaining(Client* client) const;
- MemTracker* mem_tracker() const { return mem_tracker_.get(); }
- MemTracker* get_tracker(Client* client) const;
- int64_t max_block_size() const { return max_block_size_; }
- int64_t bytes_allocated() const;
- RuntimeProfile* profile() { return profile_.get(); }
- int writes_issued() const { return writes_issued_; }
-
- void set_debug_write_delay_ms(int val) { debug_write_delay_ms_ = val; }
-
- private:
- friend class BufferedBlockMgrTest;
- friend struct Client;
-
- /// Descriptor for a single memory buffer in the pool.
- struct BufferDescriptor : public InternalQueue<BufferDescriptor>::Node {
- /// Start of the buffer.
- uint8_t* buffer;
-
- /// Length of the buffer.
- int64_t len;
-
- /// Block that this buffer is assigned to. May be NULL.
- Block* block;
-
- /// Iterator into all_io_buffers_ for this buffer.
- std::list<BufferDescriptor*>::iterator all_buffers_it;
-
- BufferDescriptor(uint8_t* buf, int64_t len) : buffer(buf), len(len), block(NULL) {}
- };
-
- BufferedBlockMgr(RuntimeState* state, TmpFileMgr* tmp_file_mgr, int64_t block_size,
- int64_t scratch_limit);
-
- /// Initializes the block mgr. Idempotent and thread-safe.
- void Init(DiskIoMgr* io_mgr, TmpFileMgr* tmp_file_mgr, RuntimeProfile* profile,
- MemTracker* parent_tracker, int64_t mem_limit, int64_t scratch_limit);
-
- /// PinBlock(), UnpinBlock(), DeleteBlock() perform the actual work of Block::Pin(),
- /// Unpin() and Delete(). DeleteBlock() must be called without the lock_ taken and
- /// DeleteBlockLocked() must be called with the lock_ taken.
- Status PinBlock(Block* block, bool* pinned, Block* src, bool unpin);
- Status UnpinBlock(Block* block);
- void DeleteBlock(Block* block);
- void DeleteBlockLocked(const boost::unique_lock<boost::mutex>& lock, Block* block);
-
- /// If there is an in-flight write, cancel the write and restore the contents of the
- /// block's buffer. If no write has been started for 'block', does nothing. 'block'
- /// must have an associated buffer. Returns an error status if an error is encountered
- /// while cancelling the write or CANCELLED if the block mgr is cancelled.
- Status CancelWrite(Block* block);
-
- /// If the 'block' is NULL, checks if cancelled and returns. Otherwise, depending on
- /// 'unpin' calls either DeleteBlock() or UnpinBlock(), which both first check for
- /// cancellation. It should be called without the lock_ acquired.
- Status DeleteOrUnpinBlock(Block* block, bool unpin);
-
- /// Transfers the buffer from 'src' to 'dst'. 'src' must be pinned. If a write is
- /// already in flight for 'src', this may block until that write completes.
- /// If unpin == false, 'src' is simply deleted.
- /// If unpin == true, 'src' is unpinned and it may block until the write of 'src' is
- /// completed.
- /// The caller should not hold 'lock_'.
- Status TransferBuffer(Block* dst, Block* src, bool unpin);
-
- /// The number of buffers available for client. That is, if all other clients were
- /// stopped, the number of buffers this client could get.
- int64_t available_buffers(Client* client) const;
-
- /// Returns the total number of unreserved buffers. This is the sum of unpinned,
- /// free and buffers we can still allocate minus the total number of reserved buffers
- /// that are not pinned.
- /// Note this can be negative if the buffers are oversubscribed.
- /// Must be called with lock_ taken.
- int64_t remaining_unreserved_buffers() const;
-
- /// Finds a buffer for a block and pins it. If the block's buffer has not been evicted,
- /// it removes the block from the unpinned list and sets *in_mem = true.
- /// If the block is not in memory, it will call FindBuffer() that may block.
- /// If we can't get a buffer (e.g. no more memory, nothing in the unpinned and free
- /// lists) this function returns with the block unpinned.
- /// Uses the lock_, the caller should not have already acquired the lock_.
- Status FindBufferForBlock(Block* block, bool* in_mem);
-
- /// Returns a new buffer that can be used. *buffer is set to NULL if there was no
- /// memory.
- /// Otherwise, this function gets a new buffer by:
- /// 1. Allocating a new buffer if possible
- /// 2. Using a buffer from the free list (which is populated by moving blocks from
- /// the unpinned list by writing them out).
- /// Must be called with the lock_ already taken. This function can block.
- Status FindBuffer(boost::unique_lock<boost::mutex>& lock, BufferDescriptor** buffer);
-
- /// Writes unpinned blocks via DiskIoMgr until one of the following is true:
- /// 1. The number of outstanding writes >= (block_write_threshold_ - num free buffers)
- /// 2. There are no more unpinned blocks
- /// Must be called with the lock_ already taken. Is not blocking.
- Status WriteUnpinnedBlocks();
-
- /// Issues the write for this block to the DiskIoMgr.
- Status WriteUnpinnedBlock(Block* block);
-
- /// Wait until either there is no in-flight write for 'block' or the block mgr is
- /// cancelled. 'lock_' must be held with 'lock'.
- void WaitForWrite(boost::unique_lock<boost::mutex>& lock, Block* block);
-
- /// Callback used by DiskIoMgr to indicate a block write has completed. write_status
- /// is the status of the write. is_cancelled_ is set to true if write_status is not
- /// Status::OK or a re-issue of the write fails. Returns the block's buffer to the
- /// free buffers list if it is no longer pinned. Returns the block itself to the free
- /// blocks list if it has been deleted.
- void WriteComplete(Block* block, const Status& write_status);
-
- /// Returns a deleted block to the list of free blocks. Assumes the block's buffer has
- /// already been returned to the free buffers list. Non-blocking.
- /// Thread-safe and does not need the lock_ acquired.
- void ReturnUnusedBlock(Block* block);
-
- /// Checks unused_blocks_ for an unused block object, else allocates a new one.
- /// Non-blocking and needs no lock_.
- Block* GetUnusedBlock(Client* client);
-
- // Test helper to get the number of block writes currently outstanding.
- int64_t GetNumWritesOutstanding();
-
- /// Used to debug the state of the block manager. Lock must already be taken.
- bool Validate() const;
- std::string DebugInternal() const;
-
- /// Size of the largest/default block in bytes.
- const int64_t max_block_size_;
-
- /// Unpinned blocks are written when the number of free buffers is below this threshold.
- /// Equal to two times the number of disks.
- const int block_write_threshold_;
-
- /// If true, spilling is disabled. The client calls will fail if there is not enough
- /// memory.
- const bool disable_spill_;
-
- const TUniqueId query_id_;
-
- ObjectPool obj_pool_;
-
- /// Track buffers allocated by the block manager.
- boost::scoped_ptr<MemTracker> mem_tracker_;
-
- /// This lock protects the block and buffer lists below, except for unused_blocks_.
- /// It also protects the various counters and changes to block state. Additionally, it
- /// is used for the blocking condvars: buffer_available_cv_ and
- /// block->write_complete_cv_.
- boost::mutex lock_;
-
- /// If true, Init() has been called.
- bool initialized_;
-
- /// The total number of reserved buffers across all clients that are not pinned.
- int unfullfilled_reserved_buffers_;
-
- /// The total number of pinned buffers across all clients.
- int total_pinned_buffers_;
-
- /// Number of outstanding writes (Writes issued but not completed).
- /// This does not include client-local writes.
- int non_local_outstanding_writes_;
-
- /// Signal availability of free buffers. Also signalled when a write completes for a
- /// pinned block, in case another thread was expecting to obtain its buffer. If
- /// 'non_local_outstanding_writes_' > 0, notify_all() will eventually be called on
- /// this condition variable. To avoid free buffers accumulating while threads wait
- /// on the cv, a woken thread must grab an available buffer (unless is_cancelled_ is
- /// true at that time).
- boost::condition_variable buffer_available_cv_;
-
- /// All used or unused blocks allocated by the BufferedBlockMgr.
- vector<Block*> all_blocks_;
-
- /// List of blocks is_pinned_ = false AND are not on DiskIoMgr's write queue.
- /// Blocks are added to and removed from the back of the list. (i.e. in LIFO order).
- /// Blocks in this list must have is_pinned_ = false, in_write_ = false,
- /// is_deleted_ = false.
- InternalQueue<Block> unpinned_blocks_;
-
- /// List of blocks that have been deleted and are no longer in use.
- /// Can be reused in GetNewBlock(). Blocks in this list must be in the Init'ed state,
- /// i.e. buffer_desc_ = NULL, is_pinned_ = false, in_write_ = false,
- /// is_deleted_ = false, valid_data_len = 0.
- InternalQueue<Block> unused_blocks_;
-
- /// List of buffers that can be assigned to a block in Pin() or GetNewBlock().
- /// These buffers either have no block associated with them or are associated with an
- /// an unpinned block that has been persisted. That is, either block = NULL or
- /// (!block->is_pinned_ && !block->in_write_ && !unpinned_blocks_.Contains(block)).
- /// All of these buffers are io sized.
- InternalQueue<BufferDescriptor> free_io_buffers_;
-
- /// All allocated io-sized buffers.
- std::list<BufferDescriptor*> all_io_buffers_;
-
- /// Group of temporary physical files, (one per tmp device) to which
- /// blocks may be written. Blocks are round-robined across these files.
- boost::scoped_ptr<TmpFileMgr::FileGroup> tmp_file_group_;
-
- /// If true, a disk write failed and all API calls return.
- /// Status::CANCELLED. Set to true if there was an error writing a block, or if
- /// WriteComplete() needed to reissue the write and that failed.
- bool is_cancelled_;
-
- /// Counters and timers to track behavior.
- boost::scoped_ptr<RuntimeProfile> profile_;
-
- /// These have a fixed value for the lifetime of the manager and show memory usage.
- RuntimeProfile::Counter* mem_limit_counter_;
- RuntimeProfile::Counter* block_size_counter_;
-
- /// Total number of blocks created.
- RuntimeProfile::Counter* created_block_counter_;
-
- /// Number of deleted blocks reused.
- RuntimeProfile::Counter* recycled_blocks_counter_;
-
- /// Number of Pin() calls that did not require a disk read.
- RuntimeProfile::Counter* buffered_pin_counter_;
-
- /// Time spent waiting for a free buffer.
- RuntimeProfile::Counter* buffer_wait_timer_;
-
- /// Number of writes outstanding (issued but not completed).
- RuntimeProfile::Counter* outstanding_writes_counter_;
-
- /// Number of writes issued.
- int writes_issued_;
-
- /// Protects query_to_block_mgrs_.
- static SpinLock static_block_mgrs_lock_;
-
- /// All per-query BufferedBlockMgr objects that are in use. For memory management, this
- /// map contains only weak ptrs. BufferedBlockMgrs that are handed out are shared ptrs.
- /// When all the shared ptrs are no longer referenced, the BufferedBlockMgr
- /// d'tor will be called at which point the weak ptr will be removed from the map.
- typedef boost::unordered_map<TUniqueId, std::weak_ptr<BufferedBlockMgr>> BlockMgrsMap;
- static BlockMgrsMap query_to_block_mgrs_;
-
- /// Debug option to delay write completion.
- int debug_write_delay_ms_;
-
-}; // class BufferedBlockMgr
-
-} // namespace impala.
-
-#endif
[06/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-tuple-stream-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream-test.cc b/be/src/runtime/buffered-tuple-stream-test.cc
deleted file mode 100644
index 0904833..0000000
--- a/be/src/runtime/buffered-tuple-stream-test.cc
+++ /dev/null
@@ -1,1264 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <boost/scoped_ptr.hpp>
-#include <boost/bind.hpp>
-#include <boost/filesystem.hpp>
-
-#include <set>
-#include <string>
-#include <limits> // for std::numeric_limits<int>::max()
-
-#include "testutil/gtest-util.h"
-#include "codegen/llvm-codegen.h"
-#include "gutil/gscoped_ptr.h"
-#include "runtime/buffered-tuple-stream.inline.h"
-#include "runtime/collection-value.h"
-#include "runtime/collection-value-builder.h"
-#include "runtime/raw-value.h"
-#include "runtime/row-batch.h"
-#include "runtime/string-value.inline.h"
-#include "runtime/test-env.h"
-#include "runtime/tmp-file-mgr.h"
-#include "service/fe-support.h"
-#include "testutil/desc-tbl-builder.h"
-#include "util/test-info.h"
-
-#include "gen-cpp/Types_types.h"
-#include "gen-cpp/ImpalaInternalService_types.h"
-
-#include "common/names.h"
-
-using kudu::FreeDeleter;
-
-static const int BATCH_SIZE = 250;
-static const int IO_BLOCK_SIZE = 8 * 1024 * 1024;
-static const uint32_t PRIME = 479001599;
-
-namespace impala {
-
-static const StringValue STRINGS[] = {
- StringValue("ABC"),
- StringValue("HELLO"),
- StringValue("123456789"),
- StringValue("FOOBAR"),
- StringValue("ONE"),
- StringValue("THREE"),
- StringValue("abcdefghijklmno"),
- StringValue("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
- StringValue("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
-};
-
-static const int NUM_STRINGS = sizeof(STRINGS) / sizeof(StringValue);
-
-class SimpleTupleStreamTest : public testing::Test {
- protected:
- virtual void SetUp() {
- test_env_.reset(new TestEnv());
- ASSERT_OK(test_env_->Init());
-
- CreateDescriptors();
-
- mem_pool_.reset(new MemPool(&tracker_));
- }
-
- virtual void CreateDescriptors() {
- vector<bool> nullable_tuples(1, false);
- vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
- DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
- int_builder.DeclareTuple() << TYPE_INT;
- int_desc_ = pool_.Add(new RowDescriptor(
- *int_builder.Build(), tuple_ids, nullable_tuples));
-
- DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
- string_builder.DeclareTuple() << TYPE_STRING;
- string_desc_ = pool_.Add(new RowDescriptor(
- *string_builder.Build(), tuple_ids, nullable_tuples));
- }
-
- virtual void TearDown() {
- runtime_state_ = NULL;
- client_ = NULL;
- pool_.Clear();
- mem_pool_->FreeAll();
- test_env_.reset();
- }
-
- /// Setup a block manager with the provided settings and client with no reservation,
- /// tracked by tracker_.
- void InitBlockMgr(int64_t limit, int block_size) {
- ASSERT_OK(test_env_->CreateQueryStateWithBlockMgr(
- 0, limit, block_size, nullptr, &runtime_state_));
- MemTracker* client_tracker =
- pool_.Add(new MemTracker(-1, "client", runtime_state_->instance_mem_tracker()));
- ASSERT_OK(runtime_state_->block_mgr()->RegisterClient(
- "", 0, false, client_tracker, runtime_state_, &client_));
- }
-
- /// Generate the ith element of a sequence of int values.
- int GenIntValue(int i) {
- // Multiply by large prime to get varied bit patterns.
- return i * PRIME;
- }
-
- /// Generate the ith element of a sequence of bool values.
- bool GenBoolValue(int i) {
- // Use a middle bit of the int value.
- return ((GenIntValue(i) >> 8) & 0x1) != 0;
- }
-
- /// Count the total number of slots per row based on the given row descriptor.
- int CountSlotsPerRow(const RowDescriptor& row_desc) {
- int slots_per_row = 0;
- for (int i = 0; i < row_desc.tuple_descriptors().size(); ++i) {
- TupleDescriptor* tuple_desc = row_desc.tuple_descriptors()[i];
- slots_per_row += tuple_desc->slots().size();
- }
- return slots_per_row;
- }
-
- /// Allocate a row batch with 'num_rows' of rows with layout described by 'row_desc'.
- /// 'offset' is used to account for rows occupied by any previous row batches. This is
- /// needed to match the values generated in VerifyResults(). If 'gen_null' is true,
- /// some tuples will be set to NULL.
- virtual RowBatch* CreateBatch(
- const RowDescriptor* row_desc, int offset, int num_rows, bool gen_null) {
- RowBatch* batch = pool_.Add(new RowBatch(row_desc, num_rows, &tracker_));
- int num_tuples = row_desc->tuple_descriptors().size();
-
- int idx = offset * CountSlotsPerRow(*row_desc);
- for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
- TupleRow* row = batch->GetRow(row_idx);
- for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
- TupleDescriptor* tuple_desc = row_desc->tuple_descriptors()[tuple_idx];
- Tuple* tuple = Tuple::Create(tuple_desc->byte_size(), batch->tuple_data_pool());
- bool is_null = gen_null && !GenBoolValue(idx);
- for (int slot_idx = 0; slot_idx < tuple_desc->slots().size(); ++slot_idx, ++idx) {
- SlotDescriptor* slot_desc = tuple_desc->slots()[slot_idx];
- void* slot = tuple->GetSlot(slot_desc->tuple_offset());
- switch (slot_desc->type().type) {
- case TYPE_INT:
- *reinterpret_cast<int*>(slot) = GenIntValue(idx);
- break;
- case TYPE_STRING:
- *reinterpret_cast<StringValue*>(slot) = STRINGS[idx % NUM_STRINGS];
- break;
- default:
- // The memory has been zero'ed out already by Tuple::Create().
- break;
- }
- }
- if (is_null) {
- row->SetTuple(tuple_idx, NULL);
- } else {
- row->SetTuple(tuple_idx, tuple);
- }
- }
- batch->CommitLastRow();
- }
- return batch;
- }
-
- virtual RowBatch* CreateIntBatch(int offset, int num_rows, bool gen_null) {
- return CreateBatch(int_desc_, offset, num_rows, gen_null);
- }
-
- virtual RowBatch* CreateStringBatch(int offset, int num_rows, bool gen_null) {
- return CreateBatch(string_desc_, offset, num_rows, gen_null);
- }
-
- void AppendValue(uint8_t* ptr, vector<int>* results) {
- if (ptr == NULL) {
- // For the tests indicate null-ability using the max int value
- results->push_back(std::numeric_limits<int>::max());
- } else {
- results->push_back(*reinterpret_cast<int*>(ptr));
- }
- }
-
- void AppendValue(uint8_t* ptr, vector<StringValue>* results) {
- if (ptr == NULL) {
- results->push_back(StringValue());
- } else {
- StringValue sv = *reinterpret_cast<StringValue*>(ptr);
- uint8_t* copy = mem_pool_->Allocate(sv.len);
- memcpy(copy, sv.ptr, sv.len);
- sv.ptr = reinterpret_cast<char*>(copy);
- results->push_back(sv);
- }
- }
-
- template <typename T>
- void AppendRowTuples(TupleRow* row, RowDescriptor* row_desc, vector<T>* results) {
- DCHECK(row != NULL);
- const int num_tuples = row_desc->tuple_descriptors().size();
-
- for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
- TupleDescriptor* tuple_desc = row_desc->tuple_descriptors()[tuple_idx];
- Tuple* tuple = row->GetTuple(tuple_idx);
- const int num_slots = tuple_desc->slots().size();
- for (int slot_idx = 0; slot_idx < num_slots; ++slot_idx) {
- SlotDescriptor* slot_desc = tuple_desc->slots()[slot_idx];
- if (tuple == NULL) {
- AppendValue(NULL, results);
- } else {
- void* slot = tuple->GetSlot(slot_desc->tuple_offset());
- AppendValue(reinterpret_cast<uint8_t*>(slot), results);
- }
- }
- }
- }
-
- template <typename T>
- void ReadValues(BufferedTupleStream* stream, RowDescriptor* desc, vector<T>* results,
- int num_batches = -1) {
- bool eos = false;
- RowBatch batch(desc, BATCH_SIZE, &tracker_);
- int batches_read = 0;
- do {
- batch.Reset();
- EXPECT_OK(stream->GetNext(&batch, &eos));
- ++batches_read;
- for (int i = 0; i < batch.num_rows(); ++i) {
- AppendRowTuples(batch.GetRow(i), desc, results);
- }
- } while (!eos && (num_batches < 0 || batches_read <= num_batches));
- }
-
- void GetExpectedValue(int idx, bool is_null, int* val) {
- if (is_null) {
- *val = std::numeric_limits<int>::max();
- } else {
- *val = GenIntValue(idx);
- }
- }
-
- void GetExpectedValue(int idx, bool is_null, StringValue* val) {
- if (is_null) {
- *val = StringValue();
- } else {
- *val = STRINGS[idx % NUM_STRINGS];
- }
- }
-
- template <typename T>
- void VerifyResults(const RowDescriptor& row_desc, const vector<T>& results,
- int num_rows, bool gen_null) {
- int idx = 0;
- for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
- const int num_tuples = row_desc.tuple_descriptors().size();
- for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
- const TupleDescriptor* tuple_desc = row_desc.tuple_descriptors()[tuple_idx];
- const int num_slots = tuple_desc->slots().size();
- bool is_null = gen_null && !GenBoolValue(idx);
- for (int slot_idx = 0; slot_idx < num_slots; ++slot_idx, ++idx) {
- T expected_val;
- GetExpectedValue(idx, is_null, &expected_val);
- ASSERT_EQ(results[idx], expected_val)
- << "results[" << idx << "] " << results[idx] << " != "
- << expected_val << " row_idx=" << row_idx
- << " tuple_idx=" << tuple_idx << " slot_idx=" << slot_idx
- << " gen_null=" << gen_null;
- }
- }
- }
- DCHECK_EQ(results.size(), idx);
- }
-
- // Test adding num_batches of ints to the stream and reading them back.
- // If unpin_stream is true, operate the stream in unpinned mode.
- // Assumes that enough buffers are available to read and write the stream.
- template <typename T>
- void TestValues(int num_batches, RowDescriptor* desc, bool gen_null,
- bool unpin_stream, int num_rows = BATCH_SIZE, bool use_small_buffers = true) {
- BufferedTupleStream stream(runtime_state_, desc, runtime_state_->block_mgr(), client_,
- use_small_buffers, false);
- ASSERT_OK(stream.Init(-1, NULL, true));
- bool got_write_buffer;
- ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
- ASSERT_TRUE(got_write_buffer);
-
- if (unpin_stream) {
- ASSERT_OK(stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
- }
- // Add rows to the stream
- int offset = 0;
- for (int i = 0; i < num_batches; ++i) {
- RowBatch* batch = NULL;
-
- Status status;
- ASSERT_TRUE(sizeof(T) == sizeof(int) || sizeof(T) == sizeof(StringValue));
- batch = CreateBatch(desc, offset, num_rows, gen_null);
- for (int j = 0; j < batch->num_rows(); ++j) {
- bool b = stream.AddRow(batch->GetRow(j), &status);
- ASSERT_OK(status);
- if (!b) {
- ASSERT_TRUE(stream.using_small_buffers());
- bool got_buffer;
- ASSERT_OK(stream.SwitchToIoBuffers(&got_buffer));
- ASSERT_TRUE(got_buffer);
- b = stream.AddRow(batch->GetRow(j), &status);
- ASSERT_OK(status);
- }
- ASSERT_TRUE(b);
- }
- offset += batch->num_rows();
- // Reset the batch to make sure the stream handles the memory correctly.
- batch->Reset();
- }
-
- bool got_read_buffer;
- ASSERT_OK(stream.PrepareForRead(false, &got_read_buffer));
- ASSERT_TRUE(got_read_buffer);
-
- // Read all the rows back
- vector<T> results;
- ReadValues(&stream, desc, &results);
-
- // Verify result
- VerifyResults<T>(*desc, results, num_rows * num_batches, gen_null);
-
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
- }
-
- void TestIntValuesInterleaved(int num_batches, int num_batches_before_read,
- bool unpin_stream) {
- for (int small_buffers = 0; small_buffers < 2; ++small_buffers) {
- BufferedTupleStream stream(runtime_state_, int_desc_, runtime_state_->block_mgr(),
- client_, small_buffers == 0, // initial small buffers
- true); // read_write
- ASSERT_OK(stream.Init(-1, NULL, true));
- bool got_write_buffer;
- ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
- ASSERT_TRUE(got_write_buffer);
- bool got_read_buffer;
- ASSERT_OK(stream.PrepareForRead(true, &got_read_buffer));
- ASSERT_TRUE(got_read_buffer);
- if (unpin_stream) {
- ASSERT_OK(stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
- }
-
- vector<int> results;
-
- for (int i = 0; i < num_batches; ++i) {
- RowBatch* batch = CreateIntBatch(i * BATCH_SIZE, BATCH_SIZE, false);
- for (int j = 0; j < batch->num_rows(); ++j) {
- Status status;
- bool b = stream.AddRow(batch->GetRow(j), &status);
- ASSERT_TRUE(b);
- ASSERT_OK(status);
- }
- // Reset the batch to make sure the stream handles the memory correctly.
- batch->Reset();
- if (i % num_batches_before_read == 0) {
- ReadValues(&stream, int_desc_, &results,
- (rand() % num_batches_before_read) + 1);
- }
- }
- ReadValues(&stream, int_desc_, &results);
-
- VerifyResults<int>(*int_desc_, results, BATCH_SIZE * num_batches, false);
-
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
- }
- }
-
- void TestUnpinPin(bool varlen_data);
-
- void TestTransferMemory(bool pinned_stream, bool read_write);
-
- scoped_ptr<TestEnv> test_env_;
- RuntimeState* runtime_state_;
- BufferedBlockMgr::Client* client_;
-
- MemTracker tracker_;
- ObjectPool pool_;
- RowDescriptor* int_desc_;
- RowDescriptor* string_desc_;
- scoped_ptr<MemPool> mem_pool_;
-};
-
-
-// Tests with a non-NULLable tuple per row.
-class SimpleNullStreamTest : public SimpleTupleStreamTest {
- protected:
- virtual void CreateDescriptors() {
- vector<bool> nullable_tuples(1, true);
- vector<TTupleId> tuple_ids(1, static_cast<TTupleId>(0));
-
- DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
- int_builder.DeclareTuple() << TYPE_INT;
- int_desc_ = pool_.Add(new RowDescriptor(
- *int_builder.Build(), tuple_ids, nullable_tuples));
-
- DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
- string_builder.DeclareTuple() << TYPE_STRING;
- string_desc_ = pool_.Add(new RowDescriptor(
- *string_builder.Build(), tuple_ids, nullable_tuples));
- }
-}; // SimpleNullStreamTest
-
-// Tests with multiple non-NULLable tuples per row.
-class MultiTupleStreamTest : public SimpleTupleStreamTest {
- protected:
- virtual void CreateDescriptors() {
- vector<bool> nullable_tuples;
- nullable_tuples.push_back(false);
- nullable_tuples.push_back(false);
- nullable_tuples.push_back(false);
-
- vector<TTupleId> tuple_ids;
- tuple_ids.push_back(static_cast<TTupleId>(0));
- tuple_ids.push_back(static_cast<TTupleId>(1));
- tuple_ids.push_back(static_cast<TTupleId>(2));
-
- DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
- int_builder.DeclareTuple() << TYPE_INT;
- int_builder.DeclareTuple() << TYPE_INT;
- int_builder.DeclareTuple() << TYPE_INT;
- int_desc_ = pool_.Add(new RowDescriptor(
- *int_builder.Build(), tuple_ids, nullable_tuples));
-
- DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
- string_builder.DeclareTuple() << TYPE_STRING;
- string_builder.DeclareTuple() << TYPE_STRING;
- string_builder.DeclareTuple() << TYPE_STRING;
- string_desc_ = pool_.Add(new RowDescriptor(
- *string_builder.Build(), tuple_ids, nullable_tuples));
- }
-};
-
-// Tests with multiple NULLable tuples per row.
-class MultiNullableTupleStreamTest : public SimpleTupleStreamTest {
- protected:
- virtual void CreateDescriptors() {
- vector<bool> nullable_tuples;
- nullable_tuples.push_back(false);
- nullable_tuples.push_back(true);
- nullable_tuples.push_back(true);
-
- vector<TTupleId> tuple_ids;
- tuple_ids.push_back(static_cast<TTupleId>(0));
- tuple_ids.push_back(static_cast<TTupleId>(1));
- tuple_ids.push_back(static_cast<TTupleId>(2));
-
- DescriptorTblBuilder int_builder(test_env_->exec_env()->frontend(), &pool_);
- int_builder.DeclareTuple() << TYPE_INT;
- int_builder.DeclareTuple() << TYPE_INT;
- int_builder.DeclareTuple() << TYPE_INT;
- int_desc_ = pool_.Add(new RowDescriptor(
- *int_builder.Build(), tuple_ids, nullable_tuples));
-
- DescriptorTblBuilder string_builder(test_env_->exec_env()->frontend(), &pool_);
- string_builder.DeclareTuple() << TYPE_STRING;
- string_builder.DeclareTuple() << TYPE_STRING;
- string_builder.DeclareTuple() << TYPE_STRING;
- string_desc_ = pool_.Add(new RowDescriptor(
- *string_builder.Build(), tuple_ids, nullable_tuples));
- }
-};
-
-/// Tests with collection types.
-class ArrayTupleStreamTest : public SimpleTupleStreamTest {
- protected:
- RowDescriptor* array_desc_;
-
- virtual void CreateDescriptors() {
- // tuples: (array<string>, array<array<int>>) (array<int>)
- vector<bool> nullable_tuples(2, true);
- vector<TTupleId> tuple_ids;
- tuple_ids.push_back(static_cast<TTupleId>(0));
- tuple_ids.push_back(static_cast<TTupleId>(1));
- ColumnType string_array_type;
- string_array_type.type = TYPE_ARRAY;
- string_array_type.children.push_back(TYPE_STRING);
-
- ColumnType int_array_type;
- int_array_type.type = TYPE_ARRAY;
- int_array_type.children.push_back(TYPE_STRING);
-
- ColumnType nested_array_type;
- nested_array_type.type = TYPE_ARRAY;
- nested_array_type.children.push_back(int_array_type);
-
- DescriptorTblBuilder builder(test_env_->exec_env()->frontend(), &pool_);
- builder.DeclareTuple() << string_array_type << nested_array_type;
- builder.DeclareTuple() << int_array_type;
- array_desc_ = pool_.Add(new RowDescriptor(
- *builder.Build(), tuple_ids, nullable_tuples));
- }
-};
-
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleTupleStreamTest, Basic) {
- InitBlockMgr(-1, IO_BLOCK_SIZE);
- TestValues<int>(1, int_desc_, false, true);
- TestValues<int>(10, int_desc_, false, true);
- TestValues<int>(100, int_desc_, false, true);
- TestValues<int>(1, int_desc_, false, false);
- TestValues<int>(10, int_desc_, false, false);
- TestValues<int>(100, int_desc_, false, false);
-
- TestValues<StringValue>(1, string_desc_, false, true);
- TestValues<StringValue>(10, string_desc_, false, true);
- TestValues<StringValue>(100, string_desc_, false, true);
- TestValues<StringValue>(1, string_desc_, false, false);
- TestValues<StringValue>(10, string_desc_, false, false);
- TestValues<StringValue>(100, string_desc_, false, false);
-
- TestIntValuesInterleaved(1, 1, true);
- TestIntValuesInterleaved(10, 5, true);
- TestIntValuesInterleaved(100, 15, true);
- TestIntValuesInterleaved(1, 1, false);
- TestIntValuesInterleaved(10, 5, false);
- TestIntValuesInterleaved(100, 15, false);
-}
-
-// Test with only 1 buffer.
-TEST_F(SimpleTupleStreamTest, OneBufferSpill) {
- // Each buffer can only hold 100 ints, so this spills quite often.
- int buffer_size = 100 * sizeof(int);
- InitBlockMgr(buffer_size, buffer_size);
- TestValues<int>(1, int_desc_, false, true);
- TestValues<int>(10, int_desc_, false, true);
-
- TestValues<StringValue>(1, string_desc_, false, true);
- TestValues<StringValue>(10, string_desc_, false, true);
-}
-
-// Test with a few buffers.
-TEST_F(SimpleTupleStreamTest, ManyBufferSpill) {
- int buffer_size = 100 * sizeof(int);
- InitBlockMgr(10 * buffer_size, buffer_size);
-
- TestValues<int>(1, int_desc_, false, true);
- TestValues<int>(10, int_desc_, false, true);
- TestValues<int>(100, int_desc_, false, true);
- TestValues<StringValue>(1, string_desc_, false, true);
- TestValues<StringValue>(10, string_desc_, false, true);
- TestValues<StringValue>(100, string_desc_, false, true);
-
- TestIntValuesInterleaved(1, 1, true);
- TestIntValuesInterleaved(10, 5, true);
- TestIntValuesInterleaved(100, 15, true);
-}
-
-void SimpleTupleStreamTest::TestUnpinPin(bool varlen_data) {
- int buffer_size = 100 * sizeof(int);
- InitBlockMgr(3 * buffer_size, buffer_size);
- RowDescriptor* row_desc = varlen_data ? string_desc_ : int_desc_;
-
- BufferedTupleStream stream(
- runtime_state_, row_desc, runtime_state_->block_mgr(), client_, true, false);
- ASSERT_OK(stream.Init(-1, NULL, true));
- bool got_write_buffer;
- ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
- ASSERT_TRUE(got_write_buffer);
-
- int offset = 0;
- bool full = false;
- while (!full) {
- RowBatch* batch = varlen_data ? CreateStringBatch(offset, BATCH_SIZE, false)
- : CreateIntBatch(offset, BATCH_SIZE, false);
- int j = 0;
- for (; j < batch->num_rows(); ++j) {
- Status status;
- full = !stream.AddRow(batch->GetRow(j), &status);
- ASSERT_OK(status);
- if (full) break;
- }
- offset += j;
- }
-
- ASSERT_OK(stream.UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
-
- bool pinned = false;
- ASSERT_OK(stream.PinStream(false, &pinned));
- ASSERT_TRUE(pinned);
-
-
- // Read and verify result a few times. We should be able to reread the stream if
- // we don't use delete on read mode.
- int read_iters = 3;
- for (int i = 0; i < read_iters; ++i) {
- bool delete_on_read = i == read_iters - 1;
- bool got_read_buffer;
- ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_buffer));
- ASSERT_TRUE(got_read_buffer);
-
- if (varlen_data) {
- vector<StringValue> results;
- ReadValues(&stream, row_desc, &results);
- VerifyResults<StringValue>(*string_desc_, results, offset, false);
- } else {
- vector<int> results;
- ReadValues(&stream, row_desc, &results);
- VerifyResults<int>(*int_desc_, results, offset, false);
- }
- }
-
- // After delete_on_read, all blocks aside from the last should be deleted.
- // Note: this should really be 0, but the BufferedTupleStream returns eos before
- // deleting the last block, rather than after, so the last block isn't deleted
- // until the stream is closed.
- ASSERT_EQ(stream.bytes_in_mem(false), buffer_size);
-
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-
- ASSERT_EQ(stream.bytes_in_mem(false), 0);
-}
-
-TEST_F(SimpleTupleStreamTest, UnpinPin) {
- TestUnpinPin(false);
-}
-
-TEST_F(SimpleTupleStreamTest, UnpinPinVarlen) {
- TestUnpinPin(false);
-}
-
-TEST_F(SimpleTupleStreamTest, SmallBuffers) {
- int buffer_size = IO_BLOCK_SIZE;
- InitBlockMgr(2 * buffer_size, buffer_size);
-
- BufferedTupleStream stream(
- runtime_state_, int_desc_, runtime_state_->block_mgr(), client_, true, false);
- ASSERT_OK(stream.Init(-1, NULL, false));
- bool got_write_buffer;
- ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
- ASSERT_TRUE(got_write_buffer);
-
- // Initial buffer should be small.
- EXPECT_LT(stream.bytes_in_mem(false), buffer_size);
-
- RowBatch* batch = CreateIntBatch(0, 1024, false);
-
- Status status;
- for (int i = 0; i < batch->num_rows(); ++i) {
- bool ret = stream.AddRow(batch->GetRow(i), &status);
- EXPECT_TRUE(ret);
- ASSERT_OK(status);
- }
- EXPECT_LT(stream.bytes_in_mem(false), buffer_size);
- EXPECT_LT(stream.byte_size(), buffer_size);
- ASSERT_TRUE(stream.using_small_buffers());
-
- // 40 MB of ints
- batch = CreateIntBatch(0, 10 * 1024 * 1024, false);
- for (int i = 0; i < batch->num_rows(); ++i) {
- bool ret = stream.AddRow(batch->GetRow(i), &status);
- ASSERT_OK(status);
- if (!ret) {
- ASSERT_TRUE(stream.using_small_buffers());
- bool got_buffer;
- ASSERT_OK(stream.SwitchToIoBuffers(&got_buffer));
- ASSERT_TRUE(got_buffer);
- ret = stream.AddRow(batch->GetRow(i), &status);
- ASSERT_OK(status);
- }
- ASSERT_TRUE(ret);
- }
- EXPECT_EQ(stream.bytes_in_mem(false), buffer_size);
-
- // TODO: Test for IMPALA-2330. In case SwitchToIoBuffers() fails to get buffer then
- // using_small_buffers() should still return true.
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-void SimpleTupleStreamTest::TestTransferMemory(bool pin_stream, bool read_write) {
- // Use smaller buffers so that the explicit FLUSH_RESOURCES flag is required to
- // make the batch at capacity.
- int buffer_size = 4 * 1024;
- InitBlockMgr(100 * buffer_size, buffer_size);
-
- BufferedTupleStream stream(
- runtime_state_, int_desc_, runtime_state_->block_mgr(), client_, false, read_write);
- ASSERT_OK(stream.Init(-1, NULL, pin_stream));
- bool got_write_buffer;
- ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
- ASSERT_TRUE(got_write_buffer);
- RowBatch* batch = CreateIntBatch(0, 1024, false);
-
- // Construct a stream with 4 blocks.
- const int total_num_blocks = 4;
- while (stream.byte_size() < total_num_blocks * buffer_size) {
- Status status;
- for (int i = 0; i < batch->num_rows(); ++i) {
- bool ret = stream.AddRow(batch->GetRow(i), &status);
- EXPECT_TRUE(ret);
- ASSERT_OK(status);
- }
- }
-
- bool got_read_buffer;
- ASSERT_OK(stream.PrepareForRead(true, &got_read_buffer));
- ASSERT_TRUE(got_read_buffer);
-
- batch->Reset();
- stream.Close(batch, RowBatch::FlushMode::FLUSH_RESOURCES);
- if (pin_stream) {
- DCHECK_EQ(total_num_blocks, batch->num_blocks());
- } else if (read_write) {
- // Read and write block should be attached.
- DCHECK_EQ(2, batch->num_blocks());
- } else {
- // Read block should be attached.
- DCHECK_EQ(1, batch->num_blocks());
- }
- DCHECK(batch->AtCapacity()); // Flush resources flag should have been set.
- batch->Reset();
- DCHECK_EQ(0, batch->num_blocks());
-}
-
-/// Test attaching memory to a row batch from a pinned stream.
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromPinnedStreamReadWrite) {
- TestTransferMemory(true, true);
-}
-
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromPinnedStreamNoReadWrite) {
- TestTransferMemory(true, false);
-}
-
-/// Test attaching memory to a row batch from an unpinned stream.
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromUnpinnedStreamReadWrite) {
- TestTransferMemory(false, true);
-}
-
-TEST_F(SimpleTupleStreamTest, TransferMemoryFromUnpinnedStreamNoReadWrite) {
- TestTransferMemory(false, false);
-}
-
-// Test that tuple stream functions if it references strings outside stream. The
-// aggregation node relies on this since it updates tuples in-place.
-TEST_F(SimpleTupleStreamTest, StringsOutsideStream) {
- int buffer_size = 8 * 1024 * 1024;
- InitBlockMgr(2 * buffer_size, buffer_size);
- Status status = Status::OK();
-
- int num_batches = 100;
- int rows_added = 0;
- DCHECK_EQ(string_desc_->tuple_descriptors().size(), 1);
- TupleDescriptor& tuple_desc = *string_desc_->tuple_descriptors()[0];
-
- set<SlotId> external_slots;
- for (int i = 0; i < tuple_desc.string_slots().size(); ++i) {
- external_slots.insert(tuple_desc.string_slots()[i]->id());
- }
-
- BufferedTupleStream stream(runtime_state_, string_desc_, runtime_state_->block_mgr(),
- client_, true, false, external_slots);
- for (int i = 0; i < num_batches; ++i) {
- RowBatch* batch = CreateStringBatch(rows_added, BATCH_SIZE, false);
- for (int j = 0; j < batch->num_rows(); ++j) {
- uint8_t* varlen_data;
- int fixed_size = tuple_desc.byte_size();
- uint8_t* tuple = stream.AllocateRow(fixed_size, 0, &varlen_data, &status);
- ASSERT_TRUE(tuple != NULL);
- ASSERT_TRUE(status.ok());
- // Copy fixed portion in, but leave it pointing to row batch's varlen data.
- memcpy(tuple, batch->GetRow(j)->GetTuple(0), fixed_size);
- }
- rows_added += batch->num_rows();
- }
-
- DCHECK_EQ(rows_added, stream.num_rows());
-
- for (int delete_on_read = 0; delete_on_read <= 1; ++delete_on_read) {
- // Keep stream in memory and test we can read ok.
- vector<StringValue> results;
- bool got_read_buffer;
- ASSERT_OK(stream.PrepareForRead(delete_on_read, &got_read_buffer));
- ASSERT_TRUE(got_read_buffer);
- ReadValues(&stream, string_desc_, &results);
- VerifyResults<StringValue>(*string_desc_, results, rows_added, false);
- }
-
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Construct a big row by stiching together many tuples so the total row size
-// will be close to the IO block size. With null indicators, stream will fail to
-// be initialized; Without null indicators, things should work fine.
-TEST_F(SimpleTupleStreamTest, BigRow) {
- InitBlockMgr(2 * IO_BLOCK_SIZE, IO_BLOCK_SIZE);
- vector<TupleId> tuple_ids;
- vector<bool> nullable_tuples;
- vector<bool> non_nullable_tuples;
-
- DescriptorTblBuilder big_row_builder(test_env_->exec_env()->frontend(), &pool_);
- // Each tuple contains 8 slots of TYPE_INT and a single byte for null indicator.
- const int num_tuples = IO_BLOCK_SIZE / (8 * sizeof(int) + 1);
- for (int tuple_idx = 0; tuple_idx < num_tuples; ++tuple_idx) {
- big_row_builder.DeclareTuple() << TYPE_INT << TYPE_INT << TYPE_INT << TYPE_INT
- << TYPE_INT << TYPE_INT << TYPE_INT << TYPE_INT;
- tuple_ids.push_back(static_cast<TTupleId>(tuple_idx));
- nullable_tuples.push_back(true);
- non_nullable_tuples.push_back(false);
- }
- DescriptorTbl *desc = big_row_builder.Build();
-
- // Construct a big row with all non-nullable tuples.
- RowDescriptor* row_desc = pool_.Add(new RowDescriptor(
- *desc, tuple_ids, non_nullable_tuples));
- ASSERT_FALSE(row_desc->IsAnyTupleNullable());
- // Test writing this row into the stream and then reading it back.
- TestValues<int>(1, row_desc, false, false, 1, false);
- TestValues<int>(1, row_desc, false, true, 1, false);
-
- // Construct a big row with nullable tuples. This requires space for null indicators
- // in the stream which, as a result, will fail to initialize.
- RowDescriptor* nullable_row_desc = pool_.Add(new RowDescriptor(
- *desc, tuple_ids, nullable_tuples));
- ASSERT_TRUE(nullable_row_desc->IsAnyTupleNullable());
- BufferedTupleStream nullable_stream(runtime_state_, nullable_row_desc,
- runtime_state_->block_mgr(), client_, false, false);
- Status status = nullable_stream.Init(-1, NULL, true);
- ASSERT_FALSE(status.ok());
- nullable_stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Test for IMPALA-3923: overflow of 32-bit int in GetRows().
-TEST_F(SimpleTupleStreamTest, TestGetRowsOverflow) {
- InitBlockMgr(-1, 8 * 1024 * 1024);
- BufferedTupleStream stream(
- runtime_state_, int_desc_, runtime_state_->block_mgr(), client_, false, false);
- ASSERT_OK(stream.Init(-1, NULL, true));
-
- Status status;
- // Add more rows than can be fit in a RowBatch (limited by its 32-bit row count).
- // Actually adding the rows would take a very long time, so just set num_rows_.
- // This puts the stream in an inconsistent state, but exercises the right code path.
- stream.num_rows_ = 1L << 33;
- bool got_rows;
- scoped_ptr<RowBatch> overflow_batch;
- ASSERT_FALSE(stream.GetRows(&overflow_batch, &got_rows).ok());
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Basic API test. No data should be going to disk.
-TEST_F(SimpleNullStreamTest, Basic) {
- InitBlockMgr(-1, IO_BLOCK_SIZE);
- TestValues<int>(1, int_desc_, false, true);
- TestValues<int>(10, int_desc_, false, true);
- TestValues<int>(100, int_desc_, false, true);
- TestValues<int>(1, int_desc_, true, true);
- TestValues<int>(10, int_desc_, true, true);
- TestValues<int>(100, int_desc_, true, true);
- TestValues<int>(1, int_desc_, false, false);
- TestValues<int>(10, int_desc_, false, false);
- TestValues<int>(100, int_desc_, false, false);
- TestValues<int>(1, int_desc_, true, false);
- TestValues<int>(10, int_desc_, true, false);
- TestValues<int>(100, int_desc_, true, false);
-
- TestValues<StringValue>(1, string_desc_, false, true);
- TestValues<StringValue>(10, string_desc_, false, true);
- TestValues<StringValue>(100, string_desc_, false, true);
- TestValues<StringValue>(1, string_desc_, true, true);
- TestValues<StringValue>(10, string_desc_, true, true);
- TestValues<StringValue>(100, string_desc_, true, true);
- TestValues<StringValue>(1, string_desc_, false, false);
- TestValues<StringValue>(10, string_desc_, false, false);
- TestValues<StringValue>(100, string_desc_, false, false);
- TestValues<StringValue>(1, string_desc_, true, false);
- TestValues<StringValue>(10, string_desc_, true, false);
- TestValues<StringValue>(100, string_desc_, true, false);
-
- TestIntValuesInterleaved(1, 1, true);
- TestIntValuesInterleaved(10, 5, true);
- TestIntValuesInterleaved(100, 15, true);
- TestIntValuesInterleaved(1, 1, false);
- TestIntValuesInterleaved(10, 5, false);
- TestIntValuesInterleaved(100, 15, false);
-}
-
-// Test tuple stream with only 1 buffer and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleOneBufferSpill) {
- // Each buffer can only hold 100 ints, so this spills quite often.
- int buffer_size = 100 * sizeof(int);
- InitBlockMgr(buffer_size, buffer_size);
- TestValues<int>(1, int_desc_, false, true);
- TestValues<int>(10, int_desc_, false, true);
-
- TestValues<StringValue>(1, string_desc_, false, true);
- TestValues<StringValue>(10, string_desc_, false, true);
-}
-
-// Test with a few buffers and rows with multiple tuples.
-TEST_F(MultiTupleStreamTest, MultiTupleManyBufferSpill) {
- int buffer_size = 100 * sizeof(int);
- InitBlockMgr(10 * buffer_size, buffer_size);
-
- TestValues<int>(1, int_desc_, false, true);
- TestValues<int>(10, int_desc_, false, true);
- TestValues<int>(100, int_desc_, false, true);
-
- TestValues<StringValue>(1, string_desc_, false, true);
- TestValues<StringValue>(10, string_desc_, false, true);
- TestValues<StringValue>(100, string_desc_, false, true);
-
- TestIntValuesInterleaved(1, 1, true);
- TestIntValuesInterleaved(10, 5, true);
- TestIntValuesInterleaved(100, 15, true);
-}
-
-// Test that we can allocate a row in the stream and copy in multiple tuples then
-// read it back from the stream.
-TEST_F(MultiTupleStreamTest, MultiTupleAllocateRow) {
- // Use small buffers so it will be flushed to disk.
- int buffer_size = 4 * 1024;
- InitBlockMgr(2 * buffer_size, buffer_size);
- Status status = Status::OK();
-
- int num_batches = 1;
- int rows_added = 0;
- BufferedTupleStream stream(
- runtime_state_, string_desc_, runtime_state_->block_mgr(), client_, false, false);
- ASSERT_OK(stream.Init(-1, NULL, false));
- bool got_write_buffer;
- ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
- ASSERT_TRUE(got_write_buffer);
-
- for (int i = 0; i < num_batches; ++i) {
- RowBatch* batch = CreateStringBatch(rows_added, 1, false);
- for (int j = 0; j < batch->num_rows(); ++j) {
- TupleRow* row = batch->GetRow(j);
- int64_t fixed_size = 0;
- int64_t varlen_size = 0;
- for (int k = 0; k < string_desc_->tuple_descriptors().size(); k++) {
- TupleDescriptor* tuple_desc = string_desc_->tuple_descriptors()[k];
- fixed_size += tuple_desc->byte_size();
- varlen_size += row->GetTuple(k)->VarlenByteSize(*tuple_desc);
- }
- uint8_t* varlen_data;
- uint8_t* fixed_data = stream.AllocateRow(fixed_size, varlen_size, &varlen_data,
- &status);
- ASSERT_TRUE(fixed_data != NULL);
- ASSERT_TRUE(status.ok());
- uint8_t* varlen_write_ptr = varlen_data;
- for (int k = 0; k < string_desc_->tuple_descriptors().size(); k++) {
- TupleDescriptor* tuple_desc = string_desc_->tuple_descriptors()[k];
- Tuple* src = row->GetTuple(k);
- Tuple* dst = reinterpret_cast<Tuple*>(fixed_data);
- fixed_data += tuple_desc->byte_size();
- memcpy(dst, src, tuple_desc->byte_size());
- for (int l = 0; l < tuple_desc->slots().size(); l++) {
- SlotDescriptor* slot = tuple_desc->slots()[l];
- StringValue* src_string = src->GetStringSlot(slot->tuple_offset());
- StringValue* dst_string = dst->GetStringSlot(slot->tuple_offset());
- dst_string->ptr = reinterpret_cast<char*>(varlen_write_ptr);
- memcpy(dst_string->ptr, src_string->ptr, src_string->len);
- varlen_write_ptr += src_string->len;
- }
- }
- ASSERT_EQ(varlen_data + varlen_size, varlen_write_ptr);
- }
- rows_added += batch->num_rows();
- }
-
- for (int i = 0; i < 3; ++i) {
- bool delete_on_read = i == 2;
- vector<StringValue> results;
- bool got_read_buffer;
- stream.PrepareForRead(delete_on_read, &got_read_buffer);
- ASSERT_TRUE(got_read_buffer);
- ReadValues(&stream, string_desc_, &results);
- VerifyResults<StringValue>(*string_desc_, results, rows_added, false);
- }
-
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// Test with rows with multiple nullable tuples.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleOneBufferSpill) {
- // Each buffer can only hold 100 ints, so this spills quite often.
- int buffer_size = 100 * sizeof(int);
- InitBlockMgr(buffer_size, buffer_size);
- TestValues<int>(1, int_desc_, false, true);
- TestValues<int>(10, int_desc_, false, true);
- TestValues<int>(1, int_desc_, true, true);
- TestValues<int>(10, int_desc_, true, true);
-
- TestValues<StringValue>(1, string_desc_, false, true);
- TestValues<StringValue>(10, string_desc_, false, true);
- TestValues<StringValue>(1, string_desc_, true, true);
- TestValues<StringValue>(10, string_desc_, true, true);
-}
-
-// Test with a few buffers.
-TEST_F(MultiNullableTupleStreamTest, MultiNullableTupleManyBufferSpill) {
- int buffer_size = 100 * sizeof(int);
- InitBlockMgr(10 * buffer_size, buffer_size);
-
- TestValues<int>(1, int_desc_, false, true);
- TestValues<int>(10, int_desc_, false, true);
- TestValues<int>(100, int_desc_, false, true);
- TestValues<int>(1, int_desc_, true, true);
- TestValues<int>(10, int_desc_, true, true);
- TestValues<int>(100, int_desc_, true, true);
-
- TestValues<StringValue>(1, string_desc_, false, true);
- TestValues<StringValue>(10, string_desc_, false, true);
- TestValues<StringValue>(100, string_desc_, false, true);
- TestValues<StringValue>(1, string_desc_, true, true);
- TestValues<StringValue>(10, string_desc_, true, true);
- TestValues<StringValue>(100, string_desc_, true, true);
-
- TestIntValuesInterleaved(1, 1, true);
- TestIntValuesInterleaved(10, 5, true);
- TestIntValuesInterleaved(100, 15, true);
-}
-
-/// Test that ComputeRowSize handles nulls
-TEST_F(MultiNullableTupleStreamTest, TestComputeRowSize) {
- InitBlockMgr(-1, 8 * 1024 * 1024);
- const vector<TupleDescriptor*>& tuple_descs = string_desc_->tuple_descriptors();
- // String in second tuple is stored externally.
- set<SlotId> external_slots;
- const SlotDescriptor* external_string_slot = tuple_descs[1]->slots()[0];
- external_slots.insert(external_string_slot->id());
-
- BufferedTupleStream stream(runtime_state_, string_desc_, runtime_state_->block_mgr(),
- client_, false, false, external_slots);
- gscoped_ptr<TupleRow, FreeDeleter> row(reinterpret_cast<TupleRow*>(
- malloc(tuple_descs.size() * sizeof(Tuple*))));
- gscoped_ptr<Tuple, FreeDeleter> tuple0(reinterpret_cast<Tuple*>(
- malloc(tuple_descs[0]->byte_size())));
- gscoped_ptr<Tuple, FreeDeleter> tuple1(reinterpret_cast<Tuple*>(
- malloc(tuple_descs[1]->byte_size())));
- gscoped_ptr<Tuple, FreeDeleter> tuple2(reinterpret_cast<Tuple*>(
- malloc(tuple_descs[2]->byte_size())));
- memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
- memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
- memset(tuple2.get(), 0, tuple_descs[2]->byte_size());
-
- // All nullable tuples are NULL.
- row->SetTuple(0, tuple0.get());
- row->SetTuple(1, NULL);
- row->SetTuple(2, NULL);
- EXPECT_EQ(tuple_descs[0]->byte_size(), stream.ComputeRowSize(row.get()));
-
- // Tuples are initialized to empty and have no var-len data.
- row->SetTuple(1, tuple1.get());
- row->SetTuple(2, tuple2.get());
- EXPECT_EQ(string_desc_->GetRowSize(), stream.ComputeRowSize(row.get()));
-
- // Tuple 0 has some data.
- const SlotDescriptor* string_slot = tuple_descs[0]->slots()[0];
- StringValue* sv = tuple0->GetStringSlot(string_slot->tuple_offset());
- *sv = STRINGS[0];
- int64_t expected_len = string_desc_->GetRowSize() + sv->len;
- EXPECT_EQ(expected_len, stream.ComputeRowSize(row.get()));
-
- // Check that external slots aren't included in count.
- sv = tuple1->GetStringSlot(external_string_slot->tuple_offset());
- sv->ptr = reinterpret_cast<char*>(1234);
- sv->len = 1234;
- EXPECT_EQ(expected_len, stream.ComputeRowSize(row.get()));
-
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-/// Test that deep copy works with arrays by copying into a BufferedTupleStream, freeing
-/// the original rows, then reading back the rows and verifying the contents.
-TEST_F(ArrayTupleStreamTest, TestArrayDeepCopy) {
- Status status;
- InitBlockMgr(-1, IO_BLOCK_SIZE);
- const int NUM_ROWS = 4000;
- BufferedTupleStream stream(
- runtime_state_, array_desc_, runtime_state_->block_mgr(), client_, false, false);
- const vector<TupleDescriptor*>& tuple_descs = array_desc_->tuple_descriptors();
- // Write out a predictable pattern of data by iterating over arrays of constants.
- int strings_index = 0; // we take the mod of this as index into STRINGS.
- int array_lens[] = { 0, 1, 5, 10, 1000, 2, 49, 20 };
- int num_array_lens = sizeof(array_lens) / sizeof(array_lens[0]);
- int array_len_index = 0;
- ASSERT_OK(stream.Init(-1, NULL, false));
- bool got_write_buffer;
- ASSERT_OK(stream.PrepareForWrite(&got_write_buffer));
- ASSERT_TRUE(got_write_buffer);
-
- for (int i = 0; i < NUM_ROWS; ++i) {
- int expected_row_size = tuple_descs[0]->byte_size() + tuple_descs[1]->byte_size();
- gscoped_ptr<TupleRow, FreeDeleter> row(reinterpret_cast<TupleRow*>(
- malloc(tuple_descs.size() * sizeof(Tuple*))));
- gscoped_ptr<Tuple, FreeDeleter> tuple0(reinterpret_cast<Tuple*>(
- malloc(tuple_descs[0]->byte_size())));
- gscoped_ptr<Tuple, FreeDeleter> tuple1(reinterpret_cast<Tuple*>(
- malloc(tuple_descs[1]->byte_size())));
- memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
- memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
- row->SetTuple(0, tuple0.get());
- row->SetTuple(1, tuple1.get());
-
- // Only array<string> is non-null.
- tuple0->SetNull(tuple_descs[0]->slots()[1]->null_indicator_offset());
- tuple1->SetNull(tuple_descs[1]->slots()[0]->null_indicator_offset());
- const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
- const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
-
- int array_len = array_lens[array_len_index++ % num_array_lens];
- CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
- cv->ptr = NULL;
- cv->num_tuples = 0;
- CollectionValueBuilder builder(cv, *item_desc, mem_pool_.get(), runtime_state_,
- array_len);
- Tuple* array_data;
- int num_rows;
- builder.GetFreeMemory(&array_data, &num_rows);
- expected_row_size += item_desc->byte_size() * array_len;
-
- // Fill the array with pointers to our constant strings.
- for (int j = 0; j < array_len; ++j) {
- const StringValue* string = &STRINGS[strings_index++ % NUM_STRINGS];
- array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
- RawValue::Write(string, array_data, item_desc->slots()[0], mem_pool_.get());
- array_data += item_desc->byte_size();
- expected_row_size += string->len;
- }
- builder.CommitTuples(array_len);
-
- // Check that internal row size computation gives correct result.
- EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
- bool b = stream.AddRow(row.get(), &status);
- ASSERT_TRUE(b);
- ASSERT_OK(status);
- mem_pool_->FreeAll(); // Free data as soon as possible to smoke out issues.
- }
-
- // Read back and verify data.
- bool got_read_buffer;
- stream.PrepareForRead(false, &got_read_buffer);
- ASSERT_TRUE(got_read_buffer);
- strings_index = 0;
- array_len_index = 0;
- bool eos = false;
- int rows_read = 0;
- RowBatch batch(array_desc_, BATCH_SIZE, &tracker_);
- do {
- batch.Reset();
- ASSERT_OK(stream.GetNext(&batch, &eos));
- for (int i = 0; i < batch.num_rows(); ++i) {
- TupleRow* row = batch.GetRow(i);
- Tuple* tuple0 = row->GetTuple(0);
- Tuple* tuple1 = row->GetTuple(1);
- ASSERT_TRUE(tuple0 != NULL);
- ASSERT_TRUE(tuple1 != NULL);
- const SlotDescriptor* array_slot_desc = tuple_descs[0]->slots()[0];
- ASSERT_FALSE(tuple0->IsNull(array_slot_desc->null_indicator_offset()));
- ASSERT_TRUE(tuple0->IsNull(tuple_descs[0]->slots()[1]->null_indicator_offset()));
- ASSERT_TRUE(tuple1->IsNull(tuple_descs[1]->slots()[0]->null_indicator_offset()));
-
- const TupleDescriptor* item_desc = array_slot_desc->collection_item_descriptor();
- int expected_array_len = array_lens[array_len_index++ % num_array_lens];
- CollectionValue* cv = tuple0->GetCollectionSlot(array_slot_desc->tuple_offset());
- ASSERT_EQ(expected_array_len, cv->num_tuples);
- for (int j = 0; j < cv->num_tuples; ++j) {
- Tuple* item = reinterpret_cast<Tuple*>(cv->ptr + j * item_desc->byte_size());
- const SlotDescriptor* string_desc = item_desc->slots()[0];
- ASSERT_FALSE(item->IsNull(string_desc->null_indicator_offset()));
- const StringValue* expected = &STRINGS[strings_index++ % NUM_STRINGS];
- const StringValue* actual = item->GetStringSlot(string_desc->tuple_offset());
- ASSERT_EQ(*expected, *actual);
- }
- }
- rows_read += batch.num_rows();
- } while (!eos);
- ASSERT_EQ(NUM_ROWS, rows_read);
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-/// Test that ComputeRowSize handles nulls
-TEST_F(ArrayTupleStreamTest, TestComputeRowSize) {
- InitBlockMgr(-1, 8 * 1024 * 1024);
- const vector<TupleDescriptor*>& tuple_descs = array_desc_->tuple_descriptors();
- set<SlotId> external_slots;
- // Second array slot in first tuple is stored externally.
- const SlotDescriptor* external_array_slot = tuple_descs[0]->slots()[1];
- external_slots.insert(external_array_slot->id());
-
- BufferedTupleStream stream(runtime_state_, array_desc_, runtime_state_->block_mgr(),
- client_, false, false, external_slots);
- gscoped_ptr<TupleRow, FreeDeleter> row(reinterpret_cast<TupleRow*>(
- malloc(tuple_descs.size() * sizeof(Tuple*))));
- gscoped_ptr<Tuple, FreeDeleter> tuple0(reinterpret_cast<Tuple*>(
- malloc(tuple_descs[0]->byte_size())));
- gscoped_ptr<Tuple, FreeDeleter> tuple1(reinterpret_cast<Tuple*>(
- malloc(tuple_descs[1]->byte_size())));
- memset(tuple0.get(), 0, tuple_descs[0]->byte_size());
- memset(tuple1.get(), 0, tuple_descs[1]->byte_size());
-
- // All tuples are NULL.
- row->SetTuple(0, NULL);
- row->SetTuple(1, NULL);
- EXPECT_EQ(0, stream.ComputeRowSize(row.get()));
-
- // Tuples are initialized to empty and have no var-len data.
- row->SetTuple(0, tuple0.get());
- row->SetTuple(1, tuple1.get());
- EXPECT_EQ(array_desc_->GetRowSize(), stream.ComputeRowSize(row.get()));
-
- // Tuple 0 has an array.
- int expected_row_size = array_desc_->GetRowSize();
- const SlotDescriptor* array_slot = tuple_descs[0]->slots()[0];
- const TupleDescriptor* item_desc = array_slot->collection_item_descriptor();
- int array_len = 128;
- CollectionValue* cv = tuple0->GetCollectionSlot(array_slot->tuple_offset());
- CollectionValueBuilder builder(cv, *item_desc, mem_pool_.get(), runtime_state_,
- array_len);
- Tuple* array_data;
- int num_rows;
- builder.GetFreeMemory(&array_data, &num_rows);
- expected_row_size += item_desc->byte_size() * array_len;
-
- // Fill the array with pointers to our constant strings.
- for (int i = 0; i < array_len; ++i) {
- const StringValue* str = &STRINGS[i % NUM_STRINGS];
- array_data->SetNotNull(item_desc->slots()[0]->null_indicator_offset());
- RawValue::Write(str, array_data, item_desc->slots()[0], mem_pool_.get());
- array_data += item_desc->byte_size();
- expected_row_size += str->len;
- }
- builder.CommitTuples(array_len);
- EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-
- // Check that the external slot isn't included in size.
- cv = tuple0->GetCollectionSlot(external_array_slot->tuple_offset());
- // ptr of external slot shouldn't be dereferenced when computing size.
- cv->ptr = reinterpret_cast<uint8_t*>(1234);
- cv->num_tuples = 1234;
- EXPECT_EQ(expected_row_size, stream.ComputeRowSize(row.get()));
-
- // Check that the array is excluded if tuple 0's array has its null indicator set.
- tuple0->SetNull(array_slot->null_indicator_offset());
- EXPECT_EQ(array_desc_->GetRowSize(), stream.ComputeRowSize(row.get()));
-
- stream.Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
-}
-
-// TODO: more tests.
-// - The stream can operate in many modes
-
-}
-
-int main(int argc, char** argv) {
- ::testing::InitGoogleTest(&argc, argv);
- impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
- impala::InitFeSupport();
- impala::LlvmCodeGen::InitializeLlvm();
- return RUN_ALL_TESTS();
-}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-tuple-stream.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-tuple-stream.cc b/be/src/runtime/buffered-tuple-stream.cc
deleted file mode 100644
index cce6390..0000000
--- a/be/src/runtime/buffered-tuple-stream.cc
+++ /dev/null
@@ -1,903 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "runtime/buffered-tuple-stream.inline.h"
-
-#include <boost/bind.hpp>
-#include <gutil/strings/substitute.h>
-
-#include "runtime/collection-value.h"
-#include "runtime/descriptors.h"
-#include "runtime/string-value.h"
-#include "runtime/tuple-row.h"
-#include "util/bit-util.h"
-#include "util/debug-util.h"
-#include "util/runtime-profile-counters.h"
-
-#include "common/names.h"
-
-using namespace impala;
-using namespace strings;
-
-// The first NUM_SMALL_BLOCKS of the tuple stream are made of blocks less than the
-// IO size. These blocks never spill.
-// TODO: Consider adding a 4MB in-memory buffer that would split the gap between the
-// 512KB in-memory buffer and the 8MB (IO-sized) spillable buffer.
-static const int64_t INITIAL_BLOCK_SIZES[] = { 64 * 1024, 512 * 1024 };
-static const int NUM_SMALL_BLOCKS = sizeof(INITIAL_BLOCK_SIZES) / sizeof(int64_t);
-
-string BufferedTupleStream::RowIdx::DebugString() const {
- stringstream ss;
- ss << "RowIdx block=" << block() << " offset=" << offset() << " idx=" << idx();
- return ss.str();
-}
-
-BufferedTupleStream::BufferedTupleStream(RuntimeState* state,
- const RowDescriptor* row_desc, BufferedBlockMgr* block_mgr,
- BufferedBlockMgr::Client* client, bool use_initial_small_buffers, bool read_write,
- const set<SlotId>& ext_varlen_slots)
- : state_(state),
- desc_(row_desc),
- block_mgr_(block_mgr),
- block_mgr_client_(client),
- total_byte_size_(0),
- read_tuple_idx_(-1),
- read_ptr_(NULL),
- read_end_ptr_(NULL),
- write_tuple_idx_(-1),
- write_ptr_(NULL),
- write_end_ptr_(NULL),
- rows_returned_(0),
- read_block_idx_(-1),
- write_block_(NULL),
- num_pinned_(0),
- num_small_blocks_(0),
- num_rows_(0),
- pin_timer_(NULL),
- unpin_timer_(NULL),
- get_new_block_timer_(NULL),
- read_write_(read_write),
- has_nullable_tuple_(row_desc->IsAnyTupleNullable()),
- use_small_buffers_(use_initial_small_buffers),
- delete_on_read_(false),
- closed_(false),
- pinned_(true) {
- read_block_null_indicators_size_ = -1;
- write_block_null_indicators_size_ = -1;
- max_null_indicators_size_ = -1;
- read_block_ = blocks_.end();
- fixed_tuple_row_size_ = 0;
- for (int i = 0; i < desc_->tuple_descriptors().size(); ++i) {
- const TupleDescriptor* tuple_desc = desc_->tuple_descriptors()[i];
- const int tuple_byte_size = tuple_desc->byte_size();
- fixed_tuple_sizes_.push_back(tuple_byte_size);
- fixed_tuple_row_size_ += tuple_byte_size;
-
- vector<SlotDescriptor*> tuple_string_slots;
- vector<SlotDescriptor*> tuple_coll_slots;
- for (int j = 0; j < tuple_desc->slots().size(); ++j) {
- SlotDescriptor* slot = tuple_desc->slots()[j];
- if (!slot->type().IsVarLenType()) continue;
- if (ext_varlen_slots.find(slot->id()) == ext_varlen_slots.end()) {
- if (slot->type().IsVarLenStringType()) {
- tuple_string_slots.push_back(slot);
- } else {
- DCHECK(slot->type().IsCollectionType());
- tuple_coll_slots.push_back(slot);
- }
- }
- }
- if (!tuple_string_slots.empty()) {
- inlined_string_slots_.push_back(make_pair(i, tuple_string_slots));
- }
-
- if (!tuple_coll_slots.empty()) {
- inlined_coll_slots_.push_back(make_pair(i, tuple_coll_slots));
- }
- }
-}
-
-BufferedTupleStream::~BufferedTupleStream() {
- DCHECK(closed_);
-}
-
-// Returns the number of pinned blocks in the list. Only called in DCHECKs to validate
-// num_pinned_.
-int NumPinned(const list<BufferedBlockMgr::Block*>& blocks) {
- int num_pinned = 0;
- for (BufferedBlockMgr::Block* block : blocks) {
- if (block->is_pinned() && block->is_max_size()) ++num_pinned;
- }
- return num_pinned;
-}
-
-string BufferedTupleStream::DebugString() const {
- stringstream ss;
- ss << "BufferedTupleStream num_rows=" << num_rows_ << " rows_returned="
- << rows_returned_ << " pinned=" << (pinned_ ? "true" : "false")
- << " delete_on_read=" << (delete_on_read_ ? "true" : "false")
- << " closed=" << (closed_ ? "true" : "false")
- << " num_pinned=" << num_pinned_
- << " write_block=" << write_block_ << " read_block_=";
- if (read_block_ == blocks_.end()) {
- ss << "<end>";
- } else {
- ss << *read_block_;
- }
- ss << " blocks=[\n";
- for (BufferedBlockMgr::Block* block : blocks_) {
- ss << "{" << block->DebugString() << "}";
- if (block != blocks_.back()) ss << ",\n";
- }
- ss << "]";
- return ss.str();
-}
-
-Status BufferedTupleStream::Init(int node_id, RuntimeProfile* profile, bool pinned) {
- if (profile != NULL) {
- pin_timer_ = ADD_TIMER(profile, "PinTime");
- unpin_timer_ = ADD_TIMER(profile, "UnpinTime");
- get_new_block_timer_ = ADD_TIMER(profile, "GetNewBlockTime");
- }
-
- max_null_indicators_size_ = ComputeNumNullIndicatorBytes(block_mgr_->max_block_size());
- if (UNLIKELY(max_null_indicators_size_ < 0)) {
- // The block cannot even fit in a row of tuples so just assume there is one row.
- int null_indicators_size =
- BitUtil::RoundUpNumi64(desc_->tuple_descriptors().size()) * 8;
- return Status(TErrorCode::BTS_BLOCK_OVERFLOW,
- PrettyPrinter::Print(fixed_tuple_row_size_, TUnit::BYTES),
- PrettyPrinter::Print(null_indicators_size, TUnit::BYTES));
- }
-
- if (block_mgr_->max_block_size() < INITIAL_BLOCK_SIZES[0]) {
- use_small_buffers_ = false;
- }
- if (!pinned) RETURN_IF_ERROR(UnpinStream(UNPIN_ALL_EXCEPT_CURRENT));
- return Status::OK();
-}
-
-Status BufferedTupleStream::PrepareForWrite(bool* got_buffer) {
- DCHECK(write_block_ == NULL);
- return NewWriteBlockForRow(fixed_tuple_row_size_, got_buffer);
-}
-
-Status BufferedTupleStream::SwitchToIoBuffers(bool* got_buffer) {
- if (!use_small_buffers_) {
- *got_buffer = (write_block_ != NULL);
- return Status::OK();
- }
- use_small_buffers_ = false;
- Status status =
- NewWriteBlock(block_mgr_->max_block_size(), max_null_indicators_size_, got_buffer);
- // IMPALA-2330: Set the flag using small buffers back to false in case it failed to
- // got a buffer.
- DCHECK(status.ok() || !*got_buffer) << status.ok() << " " << *got_buffer;
- use_small_buffers_ = !*got_buffer;
- return status;
-}
-
-void BufferedTupleStream::Close(RowBatch* batch, RowBatch::FlushMode flush) {
- for (BufferedBlockMgr::Block* block : blocks_) {
- if (batch != NULL && block->is_pinned()) {
- batch->AddBlock(block, flush);
- } else {
- block->Delete();
- }
- }
- blocks_.clear();
- num_pinned_ = 0;
- DCHECK_EQ(num_pinned_, NumPinned(blocks_));
- closed_ = true;
-}
-
-int64_t BufferedTupleStream::bytes_in_mem(bool ignore_current) const {
- int64_t result = 0;
- for (BufferedBlockMgr::Block* block : blocks_) {
- if (!block->is_pinned()) continue;
- if (!block->is_max_size()) continue;
- if (block == write_block_ && ignore_current) continue;
- result += block->buffer_len();
- }
- return result;
-}
-
-Status BufferedTupleStream::UnpinBlock(BufferedBlockMgr::Block* block) {
- SCOPED_TIMER(unpin_timer_);
- DCHECK(block->is_pinned());
- if (!block->is_max_size()) return Status::OK();
- RETURN_IF_ERROR(block->Unpin());
- --num_pinned_;
- DCHECK_EQ(num_pinned_, NumPinned(blocks_));
- return Status::OK();
-}
-
-Status BufferedTupleStream::NewWriteBlock(
- int64_t block_len, int64_t null_indicators_size, bool* got_block) noexcept {
- DCHECK(!closed_);
- DCHECK_GE(null_indicators_size, 0);
- *got_block = false;
-
- BufferedBlockMgr::Block* unpin_block = write_block_;
- if (write_block_ != NULL) {
- DCHECK(write_block_->is_pinned());
- if (pinned_ || write_block_ == *read_block_ || !write_block_->is_max_size()) {
- // In these cases, don't unpin the current write block.
- unpin_block = NULL;
- }
- }
-
- BufferedBlockMgr::Block* new_block = NULL;
- {
- SCOPED_TIMER(get_new_block_timer_);
- RETURN_IF_ERROR(block_mgr_->GetNewBlock(
- block_mgr_client_, unpin_block, &new_block, block_len));
- }
- *got_block = new_block != NULL;
-
- if (!*got_block) {
- DCHECK(unpin_block == NULL);
- return Status::OK();
- }
-
- if (unpin_block != NULL) {
- DCHECK(unpin_block == write_block_);
- DCHECK(!write_block_->is_pinned());
- --num_pinned_;
- DCHECK_EQ(num_pinned_, NumPinned(blocks_));
- }
-
- // Mark the entire block as containing valid data to avoid updating it as we go.
- new_block->Allocate<uint8_t>(block_len);
-
- // Compute and allocate the block header with the null indicators.
- DCHECK_EQ(null_indicators_size, ComputeNumNullIndicatorBytes(block_len));
- write_block_null_indicators_size_ = null_indicators_size;
- write_tuple_idx_ = 0;
- write_ptr_ = new_block->buffer() + write_block_null_indicators_size_;
- write_end_ptr_ = new_block->buffer() + block_len;
-
- blocks_.push_back(new_block);
- block_start_idx_.push_back(new_block->buffer());
- write_block_ = new_block;
- DCHECK(write_block_->is_pinned());
- DCHECK_EQ(write_block_->num_rows(), 0);
- if (write_block_->is_max_size()) {
- ++num_pinned_;
- DCHECK_EQ(num_pinned_, NumPinned(blocks_));
- } else {
- ++num_small_blocks_;
- }
- total_byte_size_ += block_len;
- return Status::OK();
-}
-
-Status BufferedTupleStream::NewWriteBlockForRow(
- int64_t row_size, bool* got_block) noexcept {
- int64_t block_len = 0;
- int64_t null_indicators_size = 0;
- if (use_small_buffers_) {
- *got_block = false;
- if (blocks_.size() < NUM_SMALL_BLOCKS) {
- block_len = INITIAL_BLOCK_SIZES[blocks_.size()];
- null_indicators_size = ComputeNumNullIndicatorBytes(block_len);
- // Use small buffer only if:
- // 1. the small buffer's size is smaller than the configured max block size.
- // 2. a single row of tuples and null indicators (if any) fit in the small buffer.
- //
- // If condition 2 above is not met, we will bail. An alternative would be
- // to try the next larger small buffer.
- *got_block = block_len < block_mgr_->max_block_size() &&
- null_indicators_size >= 0 && row_size + null_indicators_size <= block_len;
- }
- // Do not switch to IO-buffers automatically. Do not get a buffer.
- if (!*got_block) return Status::OK();
- } else {
- DCHECK_GE(max_null_indicators_size_, 0);
- block_len = block_mgr_->max_block_size();
- null_indicators_size = max_null_indicators_size_;
- // Check if the size of row and null indicators exceeds the IO block size.
- if (UNLIKELY(row_size + null_indicators_size > block_len)) {
- return Status(TErrorCode::BTS_BLOCK_OVERFLOW,
- PrettyPrinter::Print(row_size, TUnit::BYTES),
- PrettyPrinter::Print(null_indicators_size, TUnit::BYTES));
- }
- }
- return NewWriteBlock(block_len, null_indicators_size, got_block);
-}
-
-Status BufferedTupleStream::NextReadBlock() {
- DCHECK(!closed_);
- DCHECK(read_block_ != blocks_.end());
- DCHECK_EQ(num_pinned_, NumPinned(blocks_)) << pinned_;
-
- // If non-NULL, this will be the current block if we are going to free it while
- // grabbing the next block. This will stay NULL if we don't want to free the
- // current block.
- BufferedBlockMgr::Block* block_to_free =
- (!pinned_ || delete_on_read_) ? *read_block_ : NULL;
- if (delete_on_read_) {
- DCHECK(read_block_ == blocks_.begin());
- DCHECK(*read_block_ != write_block_);
- blocks_.pop_front();
- read_block_ = blocks_.begin();
- read_block_idx_ = 0;
- if (block_to_free != NULL && !block_to_free->is_max_size()) {
- block_to_free->Delete();
- block_to_free = NULL;
- DCHECK_EQ(num_pinned_, NumPinned(blocks_)) << DebugString();
- }
- } else {
- ++read_block_;
- ++read_block_idx_;
- if (block_to_free != NULL && !block_to_free->is_max_size()) block_to_free = NULL;
- }
-
- bool pinned = false;
- if (read_block_ == blocks_.end() || (*read_block_)->is_pinned()) {
- // End of the blocks or already pinned, just handle block_to_free
- if (block_to_free != NULL) {
- SCOPED_TIMER(unpin_timer_);
- if (delete_on_read_) {
- block_to_free->Delete();
- --num_pinned_;
- } else {
- RETURN_IF_ERROR(UnpinBlock(block_to_free));
- }
- }
- } else {
- // Call into the block mgr to atomically unpin/delete the old block and pin the
- // new block.
- SCOPED_TIMER(pin_timer_);
- RETURN_IF_ERROR((*read_block_)->Pin(&pinned, block_to_free, !delete_on_read_));
- if (!pinned) {
- DCHECK(block_to_free == NULL) << "Should have been able to pin."
- << endl << block_mgr_->DebugString(block_mgr_client_);;
- }
- if (block_to_free == NULL && pinned) ++num_pinned_;
- }
-
- if (read_block_ != blocks_.end() && (*read_block_)->is_pinned()) {
- read_block_null_indicators_size_ =
- ComputeNumNullIndicatorBytes((*read_block_)->buffer_len());
- DCHECK_GE(read_block_null_indicators_size_, 0);
- read_tuple_idx_ = 0;
- read_ptr_ = (*read_block_)->buffer() + read_block_null_indicators_size_;
- read_end_ptr_ = (*read_block_)->buffer() + (*read_block_)->buffer_len();
- }
- DCHECK_EQ(num_pinned_, NumPinned(blocks_)) << DebugString();
- return Status::OK();
-}
-
-Status BufferedTupleStream::PrepareForRead(bool delete_on_read, bool* got_buffer) {
- DCHECK(!closed_);
- if (blocks_.empty()) return Status::OK();
-
- if (!read_write_ && write_block_ != NULL) {
- DCHECK(write_block_->is_pinned());
- if (!pinned_ && write_block_ != blocks_.front()) {
- RETURN_IF_ERROR(UnpinBlock(write_block_));
- }
- write_block_ = NULL;
- }
-
- // Walk the blocks and pin the first IO-sized block.
- for (BufferedBlockMgr::Block* block : blocks_) {
- if (!block->is_pinned()) {
- SCOPED_TIMER(pin_timer_);
- bool current_pinned;
- RETURN_IF_ERROR(block->Pin(¤t_pinned));
- if (!current_pinned) {
- *got_buffer = false;
- return Status::OK();
- }
- ++num_pinned_;
- DCHECK_EQ(num_pinned_, NumPinned(blocks_));
- }
- if (block->is_max_size()) break;
- }
-
- read_block_ = blocks_.begin();
- DCHECK(read_block_ != blocks_.end());
- read_block_null_indicators_size_ =
- ComputeNumNullIndicatorBytes((*read_block_)->buffer_len());
- DCHECK_GE(read_block_null_indicators_size_, 0);
- read_tuple_idx_ = 0;
- read_ptr_ = (*read_block_)->buffer() + read_block_null_indicators_size_;
- read_end_ptr_ = (*read_block_)->buffer() + (*read_block_)->buffer_len();
- rows_returned_ = 0;
- read_block_idx_ = 0;
- delete_on_read_ = delete_on_read;
- *got_buffer = true;
- return Status::OK();
-}
-
-Status BufferedTupleStream::PinStream(bool already_reserved, bool* pinned) {
- DCHECK(!closed_);
- DCHECK(pinned != NULL);
- if (!already_reserved) {
- // If we can't get all the blocks, don't try at all.
- if (!block_mgr_->TryAcquireTmpReservation(block_mgr_client_, blocks_unpinned())) {
- *pinned = false;
- return Status::OK();
- }
- }
-
- for (BufferedBlockMgr::Block* block : blocks_) {
- if (block->is_pinned()) continue;
- {
- SCOPED_TIMER(pin_timer_);
- RETURN_IF_ERROR(block->Pin(pinned));
- }
- if (!*pinned) {
- VLOG_QUERY << "Should have been reserved." << endl
- << block_mgr_->DebugString(block_mgr_client_);
- return Status::OK();
- }
- ++num_pinned_;
- DCHECK_EQ(num_pinned_, NumPinned(blocks_));
- }
-
- if (!delete_on_read_) {
- // Populate block_start_idx_ on pin.
- DCHECK_EQ(block_start_idx_.size(), blocks_.size());
- block_start_idx_.clear();
- for (BufferedBlockMgr::Block* block : blocks_) {
- block_start_idx_.push_back(block->buffer());
- }
- }
- *pinned = true;
- pinned_ = true;
- return Status::OK();
-}
-
-Status BufferedTupleStream::UnpinStream(UnpinMode mode) {
- DCHECK(!closed_);
- DCHECK(mode == UNPIN_ALL || mode == UNPIN_ALL_EXCEPT_CURRENT);
- SCOPED_TIMER(unpin_timer_);
-
- for (BufferedBlockMgr::Block* block: blocks_) {
- if (!block->is_pinned()) continue;
- if (mode == UNPIN_ALL_EXCEPT_CURRENT
- && (block == write_block_ || (read_write_ && block == *read_block_))) {
- continue;
- }
- RETURN_IF_ERROR(UnpinBlock(block));
- }
- if (mode == UNPIN_ALL) {
- read_block_ = blocks_.end();
- write_block_ = NULL;
- }
- pinned_ = false;
- return Status::OK();
-}
-
-int BufferedTupleStream::ComputeNumNullIndicatorBytes(int block_size) const {
- if (has_nullable_tuple_) {
- // We assume that all rows will use their max size, so we may be underutilizing the
- // space, i.e. we may have some unused space in case of rows with NULL tuples.
- const uint32_t tuples_per_row = desc_->tuple_descriptors().size();
- const uint32_t min_row_size_in_bits = 8 * fixed_tuple_row_size_ + tuples_per_row;
- const uint32_t block_size_in_bits = 8 * block_size;
- const uint32_t max_num_rows = block_size_in_bits / min_row_size_in_bits;
- if (UNLIKELY(max_num_rows == 0)) return -1;
- return BitUtil::RoundUpNumi64(max_num_rows * tuples_per_row) * 8;
- } else {
- // If there are no nullable tuples then no need to waste space for null indicators.
- return 0;
- }
-}
-
-Status BufferedTupleStream::GetRows(scoped_ptr<RowBatch>* batch, bool* got_rows) {
- if (num_rows() > numeric_limits<int>::max()) {
- // RowBatch::num_rows_ is a 32-bit int, avoid an overflow.
- return Status(Substitute("Trying to read $0 rows into in-memory batch failed. Limit "
- "is $1", num_rows(), numeric_limits<int>::max()));
- }
- RETURN_IF_ERROR(PinStream(false, got_rows));
- if (!*got_rows) return Status::OK();
- bool got_read_buffer;
- RETURN_IF_ERROR(PrepareForRead(false, &got_read_buffer));
- DCHECK(got_read_buffer) << "Stream was pinned";
- batch->reset(
- new RowBatch(desc_, num_rows(), block_mgr_->get_tracker(block_mgr_client_)));
- bool eos = false;
- // Loop until GetNext fills the entire batch. Each call can stop at block
- // boundaries. We generally want it to stop, so that blocks can be freed
- // as we read. It is safe in this case because we pin the entire stream.
- while (!eos) {
- RETURN_IF_ERROR(GetNext(batch->get(), &eos));
- }
- return Status::OK();
-}
-
-Status BufferedTupleStream::GetNext(RowBatch* batch, bool* eos) {
- return GetNextInternal<false>(batch, eos, NULL);
-}
-
-Status BufferedTupleStream::GetNext(RowBatch* batch, bool* eos,
- vector<RowIdx>* indices) {
- return GetNextInternal<true>(batch, eos, indices);
-}
-
-template <bool FILL_INDICES>
-Status BufferedTupleStream::GetNextInternal(RowBatch* batch, bool* eos,
- vector<RowIdx>* indices) {
- if (has_nullable_tuple_) {
- return GetNextInternal<FILL_INDICES, true>(batch, eos, indices);
- } else {
- return GetNextInternal<FILL_INDICES, false>(batch, eos, indices);
- }
-}
-
-template <bool FILL_INDICES, bool HAS_NULLABLE_TUPLE>
-Status BufferedTupleStream::GetNextInternal(RowBatch* batch, bool* eos,
- vector<RowIdx>* indices) {
- DCHECK(!closed_);
- DCHECK(batch->row_desc()->LayoutEquals(*desc_));
- *eos = (rows_returned_ == num_rows_);
- if (*eos) return Status::OK();
- DCHECK_GE(read_block_null_indicators_size_, 0);
-
- const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
- DCHECK_LE(read_tuple_idx_ / tuples_per_row, (*read_block_)->num_rows());
- DCHECK_EQ(read_tuple_idx_ % tuples_per_row, 0);
- int rows_returned_curr_block = read_tuple_idx_ / tuples_per_row;
-
- if (UNLIKELY(rows_returned_curr_block == (*read_block_)->num_rows())) {
- // Get the next block in the stream. We need to do this at the beginning of
- // the GetNext() call to ensure the buffer management semantics. NextReadBlock()
- // will recycle the memory for the rows returned from the *previous* call to
- // GetNext().
- RETURN_IF_ERROR(NextReadBlock());
- DCHECK(read_block_ != blocks_.end()) << DebugString();
- DCHECK_GE(read_block_null_indicators_size_, 0);
- rows_returned_curr_block = 0;
- }
-
- DCHECK(read_block_ != blocks_.end());
- DCHECK((*read_block_)->is_pinned()) << DebugString();
- DCHECK_GE(read_tuple_idx_, 0);
-
- int rows_left_in_block = (*read_block_)->num_rows() - rows_returned_curr_block;
- int rows_to_fill = std::min(batch->capacity() - batch->num_rows(), rows_left_in_block);
- DCHECK_GE(rows_to_fill, 1);
- batch->AddRows(rows_to_fill);
- uint8_t* tuple_row_mem = reinterpret_cast<uint8_t*>(batch->GetRow(batch->num_rows()));
-
- // Produce tuple rows from the current block and the corresponding position on the
- // null tuple indicator.
- if (FILL_INDICES) {
- DCHECK(indices != NULL);
- DCHECK(!delete_on_read_);
- DCHECK_EQ(batch->num_rows(), 0);
- indices->clear();
- indices->reserve(rows_to_fill);
- }
-
- uint8_t* null_word = NULL;
- uint32_t null_pos = 0;
- // Start reading from position read_tuple_idx_ in the block.
- // IMPALA-2256: Special case if there are no materialized slots.
- bool increment_row = RowConsumesMemory();
- uint64_t last_read_row = increment_row * (read_tuple_idx_ / tuples_per_row);
- for (int i = 0; i < rows_to_fill; ++i) {
- if (FILL_INDICES) {
- indices->push_back(RowIdx());
- DCHECK_EQ(indices->size(), i + 1);
- (*indices)[i].set(read_block_idx_, read_ptr_ - (*read_block_)->buffer(),
- last_read_row);
- }
- // Copy the row into the output batch.
- TupleRow* output_row = reinterpret_cast<TupleRow*>(tuple_row_mem);
- if (HAS_NULLABLE_TUPLE) {
- for (int j = 0; j < tuples_per_row; ++j) {
- // Stitch together the tuples from the block and the NULL ones.
- null_word = (*read_block_)->buffer() + (read_tuple_idx_ >> 3);
- null_pos = read_tuple_idx_ & 7;
- ++read_tuple_idx_;
- const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0);
- // Copy tuple and advance read_ptr_. If it is a NULL tuple, it calls SetTuple
- // with Tuple* being 0x0. To do that we multiply the current read_ptr_ with
- // false (0x0).
- output_row->SetTuple(j, reinterpret_cast<Tuple*>(
- reinterpret_cast<uint64_t>(read_ptr_) * is_not_null));
- read_ptr_ += fixed_tuple_sizes_[j] * is_not_null;
- }
- } else {
- // When we know that there are no nullable tuples we can skip null checks.
- for (int j = 0; j < tuples_per_row; ++j) {
- output_row->SetTuple(j, reinterpret_cast<Tuple*>(read_ptr_));
- read_ptr_ += fixed_tuple_sizes_[j];
- }
- read_tuple_idx_ += tuples_per_row;
- }
- tuple_row_mem += sizeof(Tuple*) * tuples_per_row;
-
- // Update string slot ptrs, skipping external strings.
- for (int j = 0; j < inlined_string_slots_.size(); ++j) {
- Tuple* tuple = output_row->GetTuple(inlined_string_slots_[j].first);
- if (HAS_NULLABLE_TUPLE && tuple == NULL) continue;
- FixUpStringsForRead(inlined_string_slots_[j].second, tuple);
- }
-
- // Update collection slot ptrs, skipping external collections. We traverse the
- // collection structure in the same order as it was written to the stream, allowing
- // us to infer the data layout based on the length of collections and strings.
- for (int j = 0; j < inlined_coll_slots_.size(); ++j) {
- Tuple* tuple = output_row->GetTuple(inlined_coll_slots_[j].first);
- if (HAS_NULLABLE_TUPLE && tuple == NULL) continue;
- FixUpCollectionsForRead(inlined_coll_slots_[j].second, tuple);
- }
- last_read_row += increment_row;
- }
-
- batch->CommitRows(rows_to_fill);
- rows_returned_ += rows_to_fill;
- *eos = (rows_returned_ == num_rows_);
- if ((!pinned_ || delete_on_read_)
- && rows_returned_curr_block + rows_to_fill == (*read_block_)->num_rows()) {
- // No more data in this block. The batch must be immediately returned up the operator
- // tree and deep copied so that NextReadBlock() can reuse the read block's buffer.
- batch->MarkNeedsDeepCopy();
- }
- if (FILL_INDICES) DCHECK_EQ(indices->size(), rows_to_fill);
- DCHECK_LE(read_ptr_, read_end_ptr_);
- return Status::OK();
-}
-
-void BufferedTupleStream::FixUpStringsForRead(const vector<SlotDescriptor*>& string_slots,
- Tuple* tuple) {
- DCHECK(tuple != NULL);
- for (int i = 0; i < string_slots.size(); ++i) {
- const SlotDescriptor* slot_desc = string_slots[i];
- if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-
- StringValue* sv = tuple->GetStringSlot(slot_desc->tuple_offset());
- DCHECK_LE(sv->len, read_block_bytes_remaining());
- sv->ptr = reinterpret_cast<char*>(read_ptr_);
- read_ptr_ += sv->len;
- }
-}
-
-void BufferedTupleStream::FixUpCollectionsForRead(const vector<SlotDescriptor*>& collection_slots,
- Tuple* tuple) {
- DCHECK(tuple != NULL);
- for (int i = 0; i < collection_slots.size(); ++i) {
- const SlotDescriptor* slot_desc = collection_slots[i];
- if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
-
- CollectionValue* cv = tuple->GetCollectionSlot(slot_desc->tuple_offset());
- const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
- int coll_byte_size = cv->num_tuples * item_desc.byte_size();
- DCHECK_LE(coll_byte_size, read_block_bytes_remaining());
- cv->ptr = reinterpret_cast<uint8_t*>(read_ptr_);
- read_ptr_ += coll_byte_size;
-
- if (!item_desc.HasVarlenSlots()) continue;
- uint8_t* coll_data = cv->ptr;
- for (int j = 0; j < cv->num_tuples; ++j) {
- Tuple* item = reinterpret_cast<Tuple*>(coll_data);
- FixUpStringsForRead(item_desc.string_slots(), item);
- FixUpCollectionsForRead(item_desc.collection_slots(), item);
- coll_data += item_desc.byte_size();
- }
- }
-}
-
-int64_t BufferedTupleStream::ComputeRowSize(TupleRow* row) const noexcept {
- int64_t size = 0;
- if (has_nullable_tuple_) {
- for (int i = 0; i < fixed_tuple_sizes_.size(); ++i) {
- if (row->GetTuple(i) != NULL) size += fixed_tuple_sizes_[i];
- }
- } else {
- size = fixed_tuple_row_size_;
- }
- for (int i = 0; i < inlined_string_slots_.size(); ++i) {
- Tuple* tuple = row->GetTuple(inlined_string_slots_[i].first);
- if (tuple == NULL) continue;
- const vector<SlotDescriptor*>& slots = inlined_string_slots_[i].second;
- for (auto it = slots.begin(); it != slots.end(); ++it) {
- if (tuple->IsNull((*it)->null_indicator_offset())) continue;
- size += tuple->GetStringSlot((*it)->tuple_offset())->len;
- }
- }
-
- for (int i = 0; i < inlined_coll_slots_.size(); ++i) {
- Tuple* tuple = row->GetTuple(inlined_coll_slots_[i].first);
- if (tuple == NULL) continue;
- const vector<SlotDescriptor*>& slots = inlined_coll_slots_[i].second;
- for (auto it = slots.begin(); it != slots.end(); ++it) {
- if (tuple->IsNull((*it)->null_indicator_offset())) continue;
- CollectionValue* cv = tuple->GetCollectionSlot((*it)->tuple_offset());
- const TupleDescriptor& item_desc = *(*it)->collection_item_descriptor();
- size += cv->num_tuples * item_desc.byte_size();
-
- if (!item_desc.HasVarlenSlots()) continue;
- for (int j = 0; j < cv->num_tuples; ++j) {
- Tuple* item = reinterpret_cast<Tuple*>(&cv->ptr[j * item_desc.byte_size()]);
- size += item->VarlenByteSize(item_desc);
- }
- }
- }
- return size;
-}
-
-bool BufferedTupleStream::AddRowSlow(TupleRow* row, Status* status) noexcept {
- bool got_block;
- int64_t row_size = ComputeRowSize(row);
- *status = NewWriteBlockForRow(row_size, &got_block);
- if (!status->ok() || !got_block) return false;
- return DeepCopy(row);
-}
-
-bool BufferedTupleStream::DeepCopy(TupleRow* row) noexcept {
- if (has_nullable_tuple_) {
- return DeepCopyInternal<true>(row);
- } else {
- return DeepCopyInternal<false>(row);
- }
-}
-
-// TODO: this really needs codegen
-// TODO: in case of duplicate tuples, this can redundantly serialize data.
-template <bool HasNullableTuple>
-bool BufferedTupleStream::DeepCopyInternal(TupleRow* row) noexcept {
- if (UNLIKELY(write_block_ == NULL)) return false;
- DCHECK_GE(write_block_null_indicators_size_, 0);
- DCHECK(write_block_->is_pinned()) << DebugString() << std::endl
- << write_block_->DebugString();
-
- const uint64_t tuples_per_row = desc_->tuple_descriptors().size();
- uint32_t bytes_remaining = write_block_bytes_remaining();
- if (UNLIKELY((bytes_remaining < fixed_tuple_row_size_) ||
- (HasNullableTuple &&
- (write_tuple_idx_ + tuples_per_row > write_block_null_indicators_size_ * 8)))) {
- return false;
- }
-
- // Copy the not NULL fixed len tuples. For the NULL tuples just update the NULL tuple
- // indicator.
- if (HasNullableTuple) {
- DCHECK_GT(write_block_null_indicators_size_, 0);
- uint8_t* null_word = NULL;
- uint32_t null_pos = 0;
- for (int i = 0; i < tuples_per_row; ++i) {
- null_word = write_block_->buffer() + (write_tuple_idx_ >> 3); // / 8
- null_pos = write_tuple_idx_ & 7;
- ++write_tuple_idx_;
- const int tuple_size = fixed_tuple_sizes_[i];
- Tuple* t = row->GetTuple(i);
- const uint8_t mask = 1 << (7 - null_pos);
- if (t != NULL) {
- *null_word &= ~mask;
- memcpy(write_ptr_, t, tuple_size);
- write_ptr_ += tuple_size;
- } else {
- *null_word |= mask;
- }
- }
- DCHECK_LE(write_tuple_idx_ - 1, write_block_null_indicators_size_ * 8);
- } else {
- // If we know that there are no nullable tuples no need to set the nullability flags.
- DCHECK_EQ(write_block_null_indicators_size_, 0);
- for (int i = 0; i < tuples_per_row; ++i) {
- const int tuple_size = fixed_tuple_sizes_[i];
- Tuple* t = row->GetTuple(i);
- // TODO: Once IMPALA-1306 (Avoid passing empty tuples of non-materialized slots)
- // is delivered, the check below should become DCHECK(t != NULL).
- DCHECK(t != NULL || tuple_size == 0);
- memcpy(write_ptr_, t, tuple_size);
- write_ptr_ += tuple_size;
- }
- }
-
- // Copy inlined string slots. Note: we do not need to convert the string ptrs to offsets
- // on the write path, only on the read. The tuple data is immediately followed
- // by the string data so only the len information is necessary.
- for (int i = 0; i < inlined_string_slots_.size(); ++i) {
- const Tuple* tuple = row->GetTuple(inlined_string_slots_[i].first);
- if (HasNullableTuple && tuple == NULL) continue;
- if (UNLIKELY(!CopyStrings(tuple, inlined_string_slots_[i].second))) return false;
- }
-
- // Copy inlined collection slots. We copy collection data in a well-defined order so
- // we do not need to convert pointers to offsets on the write path.
- for (int i = 0; i < inlined_coll_slots_.size(); ++i) {
- const Tuple* tuple = row->GetTuple(inlined_coll_slots_[i].first);
- if (HasNullableTuple && tuple == NULL) continue;
- if (UNLIKELY(!CopyCollections(tuple, inlined_coll_slots_[i].second))) return false;
- }
-
- write_block_->AddRow();
- ++num_rows_;
- return true;
-}
-
-bool BufferedTupleStream::CopyStrings(const Tuple* tuple,
- const vector<SlotDescriptor*>& string_slots) {
- for (int i = 0; i < string_slots.size(); ++i) {
- const SlotDescriptor* slot_desc = string_slots[i];
- if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
- const StringValue* sv = tuple->GetStringSlot(slot_desc->tuple_offset());
- if (LIKELY(sv->len > 0)) {
- if (UNLIKELY(write_block_bytes_remaining() < sv->len)) return false;
-
- memcpy(write_ptr_, sv->ptr, sv->len);
- write_ptr_ += sv->len;
- }
- }
- return true;
-}
-
-bool BufferedTupleStream::CopyCollections(const Tuple* tuple,
- const vector<SlotDescriptor*>& collection_slots) {
- for (int i = 0; i < collection_slots.size(); ++i) {
- const SlotDescriptor* slot_desc = collection_slots[i];
- if (tuple->IsNull(slot_desc->null_indicator_offset())) continue;
- const CollectionValue* cv = tuple->GetCollectionSlot(slot_desc->tuple_offset());
- const TupleDescriptor& item_desc = *slot_desc->collection_item_descriptor();
- if (LIKELY(cv->num_tuples > 0)) {
- int coll_byte_size = cv->num_tuples * item_desc.byte_size();
- if (UNLIKELY(write_block_bytes_remaining() < coll_byte_size)) return false;
- uint8_t* coll_data = write_ptr_;
- memcpy(coll_data, cv->ptr, coll_byte_size);
- write_ptr_ += coll_byte_size;
-
- if (!item_desc.HasVarlenSlots()) continue;
- // Copy variable length data when present in collection items.
- for (int j = 0; j < cv->num_tuples; ++j) {
- const Tuple* item = reinterpret_cast<Tuple*>(coll_data);
- if (UNLIKELY(!CopyStrings(item, item_desc.string_slots()))) return false;
- if (UNLIKELY(!CopyCollections(item, item_desc.collection_slots()))) return false;
- coll_data += item_desc.byte_size();
- }
- }
- }
- return true;
-}
-
-void BufferedTupleStream::GetTupleRow(const RowIdx& idx, TupleRow* row) const {
- DCHECK(row != NULL);
- DCHECK(!closed_);
- DCHECK(is_pinned());
- DCHECK(!delete_on_read_);
- DCHECK_EQ(blocks_.size(), block_start_idx_.size());
- DCHECK_LT(idx.block(), blocks_.size());
-
- uint8_t* data = block_start_idx_[idx.block()] + idx.offset();
- if (has_nullable_tuple_) {
- // Stitch together the tuples from the block and the NULL ones.
- const int tuples_per_row = desc_->tuple_descriptors().size();
- uint32_t tuple_idx = idx.idx() * tuples_per_row;
- for (int i = 0; i < tuples_per_row; ++i) {
- const uint8_t* null_word = block_start_idx_[idx.block()] + (tuple_idx >> 3);
- const uint32_t null_pos = tuple_idx & 7;
- const bool is_not_null = ((*null_word & (1 << (7 - null_pos))) == 0);
- row->SetTuple(i, reinterpret_cast<Tuple*>(
- reinterpret_cast<uint64_t>(data) * is_not_null));
- data += desc_->tuple_descriptors()[i]->byte_size() * is_not_null;
- ++tuple_idx;
- }
- } else {
- for (int i = 0; i < desc_->tuple_descriptors().size(); ++i) {
- row->SetTuple(i, reinterpret_cast<Tuple*>(data));
- data += desc_->tuple_descriptors()[i]->byte_size();
- }
- }
-}
[10/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-aggregation-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.cc b/be/src/exec/partitioned-aggregation-node.cc
index 7067961..fc0a4a6 100644
--- a/be/src/exec/partitioned-aggregation-node.cc
+++ b/be/src/exec/partitioned-aggregation-node.cc
@@ -31,10 +31,12 @@
#include "exprs/scalar-expr-evaluator.h"
#include "exprs/slot-ref.h"
#include "gutil/strings/substitute.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
#include "runtime/descriptors.h"
+#include "runtime/exec-env.h"
#include "runtime/mem-pool.h"
#include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
#include "runtime/raw-value.h"
#include "runtime/row-batch.h"
#include "runtime/runtime-state.h"
@@ -111,7 +113,6 @@ PartitionedAggregationNode::PartitionedAggregationNode(
needs_finalize_(tnode.agg_node.need_finalize),
is_streaming_preagg_(tnode.agg_node.use_streaming_preaggregation),
needs_serialize_(false),
- block_mgr_client_(NULL),
output_partition_(NULL),
process_batch_no_grouping_fn_(NULL),
process_batch_fn_(NULL),
@@ -224,24 +225,6 @@ Status PartitionedAggregationNode::Prepare(RuntimeState* state) {
RETURN_IF_ERROR(HashTableCtx::Create(pool_, state, build_exprs_,
grouping_exprs_, true, vector<bool>(build_exprs_.size(), true),
state->fragment_hash_seed(), MAX_PARTITION_DEPTH, 1, expr_mem_pool(), &ht_ctx_));
- RETURN_IF_ERROR(state_->block_mgr()->RegisterClient(
- Substitute("PartitionedAggregationNode id=$0 ptr=$1", id_, this),
- MinRequiredBuffers(), true, mem_tracker(), state, &block_mgr_client_));
- }
-
- // TODO: Is there a need to create the stream here? If memory reservations work we may
- // be able to create this stream lazily and only whenever we need to spill.
- if (!is_streaming_preagg_ && needs_serialize_ && block_mgr_client_ != NULL) {
- serialize_stream_.reset(new BufferedTupleStream(state, &intermediate_row_desc_,
- state->block_mgr(), block_mgr_client_, false /* use_initial_small_buffers */,
- false /* read_write */));
- RETURN_IF_ERROR(serialize_stream_->Init(id(), runtime_profile(), false));
- bool got_buffer;
- RETURN_IF_ERROR(serialize_stream_->PrepareForWrite(&got_buffer));
- if (!got_buffer) {
- return state_->block_mgr()->MemLimitTooLowError(block_mgr_client_, id());
- }
- DCHECK(serialize_stream_->has_write_block());
}
AddCodegenDisabledMessage(state);
return Status::OK();
@@ -265,8 +248,16 @@ Status PartitionedAggregationNode::Open(RuntimeState* state) {
SCOPED_TIMER(runtime_profile_->total_time_counter());
// Open the child before consuming resources in this node.
RETURN_IF_ERROR(child(0)->Open(state));
-
RETURN_IF_ERROR(ExecNode::Open(state));
+
+ // Claim reservation after the child has been opened to reduce the peak reservation
+ // requirement.
+ if (!buffer_pool_client_.is_registered() && !grouping_exprs_.empty()) {
+ DCHECK_GE(resource_profile_.min_reservation,
+ resource_profile_.spillable_buffer_size * MinRequiredBuffers());
+ RETURN_IF_ERROR(ClaimBufferReservation(state));
+ }
+
if (ht_ctx_.get() != nullptr) RETURN_IF_ERROR(ht_ctx_->Open(state));
RETURN_IF_ERROR(AggFnEvaluator::Open(agg_fn_evals_, state));
if (grouping_exprs_.empty()) {
@@ -278,6 +269,25 @@ Status PartitionedAggregationNode::Open(RuntimeState* state) {
RETURN_IF_ERROR(state_->GetQueryStatus());
singleton_output_tuple_returned_ = false;
} else {
+ if (ht_allocator_ == nullptr) {
+ // Allocate 'serialize_stream_' and 'ht_allocator_' on the first Open() call.
+ ht_allocator_.reset(new Suballocator(state_->exec_env()->buffer_pool(),
+ &buffer_pool_client_, resource_profile_.spillable_buffer_size));
+
+ if (!is_streaming_preagg_ && needs_serialize_) {
+ serialize_stream_.reset(new BufferedTupleStreamV2(state, &intermediate_row_desc_,
+ &buffer_pool_client_, resource_profile_.spillable_buffer_size,
+ resource_profile_.spillable_buffer_size));
+ RETURN_IF_ERROR(serialize_stream_->Init(id(), false));
+ bool got_buffer;
+ // Reserve the memory for 'serialize_stream_' so we don't need to scrounge up
+ // another buffer during spilling.
+ RETURN_IF_ERROR(serialize_stream_->PrepareForWrite(&got_buffer));
+ DCHECK(got_buffer)
+ << "Accounted in min reservation" << buffer_pool_client_.DebugString();
+ DCHECK(serialize_stream_->has_write_iterator());
+ }
+ }
RETURN_IF_ERROR(CreateHashPartitions(0));
}
@@ -520,9 +530,12 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state,
bool ht_needs_expansion = false;
for (int i = 0; i < PARTITION_FANOUT; ++i) {
HashTable* hash_tbl = GetHashTable(i);
- DCHECK(hash_tbl != NULL);
- remaining_capacity[i] = hash_tbl->NumInsertsBeforeResize();
- ht_needs_expansion |= remaining_capacity[i] < child_batch_->num_rows();
+ if (hash_tbl == nullptr) {
+ remaining_capacity[i] = 0;
+ } else {
+ remaining_capacity[i] = hash_tbl->NumInsertsBeforeResize();
+ ht_needs_expansion |= remaining_capacity[i] < child_batch_->num_rows();
+ }
}
// Stop expanding hash tables if we're not reducing the input sufficiently. As our
@@ -533,9 +546,12 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state,
if (ht_needs_expansion && ShouldExpandPreaggHashTables()) {
for (int i = 0; i < PARTITION_FANOUT; ++i) {
HashTable* ht = GetHashTable(i);
- if (remaining_capacity[i] < child_batch_->num_rows()) {
+ if (ht != nullptr && remaining_capacity[i] < child_batch_->num_rows()) {
SCOPED_TIMER(ht_resize_timer_);
- if (ht->CheckAndResize(child_batch_->num_rows(), ht_ctx_.get())) {
+ bool resized;
+ RETURN_IF_ERROR(
+ ht->CheckAndResize(child_batch_->num_rows(), ht_ctx_.get(), &resized));
+ if (resized) {
remaining_capacity[i] = ht->NumInsertsBeforeResize();
}
}
@@ -548,7 +564,7 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state,
child_batch_.get(), out_batch, ht_ctx_.get(), remaining_capacity));
} else {
RETURN_IF_ERROR(ProcessBatchStreaming(needs_serialize_, prefetch_mode,
- child_batch_.get(), out_batch, ht_ctx_.get(), remaining_capacity ));
+ child_batch_.get(), out_batch, ht_ctx_.get(), remaining_capacity));
}
child_batch_->Reset(); // All rows from child_batch_ were processed.
@@ -557,7 +573,7 @@ Status PartitionedAggregationNode::GetRowsStreaming(RuntimeState* state,
if (child_eos_) {
child(0)->Close(state);
child_batch_.reset();
- MoveHashPartitions(child(0)->rows_returned());
+ RETURN_IF_ERROR(MoveHashPartitions(child(0)->rows_returned()));
}
num_rows_returned_ += out_batch->num_rows();
@@ -570,8 +586,10 @@ bool PartitionedAggregationNode::ShouldExpandPreaggHashTables() const {
int64_t ht_rows = 0;
for (int i = 0; i < PARTITION_FANOUT; ++i) {
HashTable* ht = hash_partitions_[i]->hash_tbl.get();
- ht_mem += ht->CurrentMemSize();
- ht_rows += ht->size();
+ if (ht != nullptr) {
+ ht_mem += ht->CurrentMemSize();
+ ht_rows += ht->size();
+ }
}
// Need some rows in tables to have valid statistics.
@@ -678,7 +696,6 @@ void PartitionedAggregationNode::Close(RuntimeState* state) {
if (serialize_stream_.get() != nullptr) {
serialize_stream_->Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
}
- if (block_mgr_client_ != nullptr) state->block_mgr()->ClearReservations(block_mgr_client_);
ScalarExpr::Close(grouping_exprs_);
ScalarExpr::Close(build_exprs_);
AggFn::Close(agg_fns_);
@@ -705,56 +722,55 @@ Status PartitionedAggregationNode::Partition::InitStreams() {
}
}
- aggregated_row_stream.reset(new BufferedTupleStream(parent->state_,
- &parent->intermediate_row_desc_, parent->state_->block_mgr(),
- parent->block_mgr_client_, true /* use_initial_small_buffers */,
- false /* read_write */, external_varlen_slots));
- RETURN_IF_ERROR(
- aggregated_row_stream->Init(parent->id(), parent->runtime_profile(), true));
+ aggregated_row_stream.reset(new BufferedTupleStreamV2(parent->state_,
+ &parent->intermediate_row_desc_, &parent->buffer_pool_client_,
+ parent->resource_profile_.spillable_buffer_size,
+ parent->resource_profile_.spillable_buffer_size, external_varlen_slots));
+ RETURN_IF_ERROR(aggregated_row_stream->Init(parent->id(), true));
bool got_buffer;
RETURN_IF_ERROR(aggregated_row_stream->PrepareForWrite(&got_buffer));
if (!got_buffer) {
- return parent->state_->block_mgr()->MemLimitTooLowError(
- parent->block_mgr_client_, parent->id());
+ stringstream ss;
+ parent->DebugString(2, &ss);
+ DCHECK(parent->is_streaming_preagg_)
+ << "Merge agg should have enough reservation " << parent->id_ << "\n"
+ << parent->buffer_pool_client_.DebugString() << "\n"
+ << ss.str();
+ DiscardAggregatedRowStream();
}
if (!parent->is_streaming_preagg_) {
- unaggregated_row_stream.reset(new BufferedTupleStream(parent->state_,
- parent->child(0)->row_desc(), parent->state_->block_mgr(),
- parent->block_mgr_client_, true /* use_initial_small_buffers */,
- false /* read_write */));
+ unaggregated_row_stream.reset(new BufferedTupleStreamV2(parent->state_,
+ parent->child(0)->row_desc(), &parent->buffer_pool_client_,
+ parent->resource_profile_.spillable_buffer_size,
+ parent->resource_profile_.spillable_buffer_size));
// This stream is only used to spill, no need to ever have this pinned.
- RETURN_IF_ERROR(unaggregated_row_stream->Init(parent->id(), parent->runtime_profile(),
- false));
- // TODO: allocate this buffer later only if we spill the partition.
- RETURN_IF_ERROR(unaggregated_row_stream->PrepareForWrite(&got_buffer));
- if (!got_buffer) {
- return parent->state_->block_mgr()->MemLimitTooLowError(
- parent->block_mgr_client_, parent->id());
- }
- DCHECK(unaggregated_row_stream->has_write_block());
+ RETURN_IF_ERROR(unaggregated_row_stream->Init(parent->id(), false));
+ // Save memory by waiting until we spill to allocate the write buffer for the
+ // unaggregated row stream.
+ DCHECK(!unaggregated_row_stream->has_write_iterator());
}
return Status::OK();
}
-bool PartitionedAggregationNode::Partition::InitHashTable() {
- DCHECK(hash_tbl.get() == NULL);
+Status PartitionedAggregationNode::Partition::InitHashTable(bool* got_memory) {
+ DCHECK(aggregated_row_stream != nullptr);
+ DCHECK(hash_tbl == nullptr);
// We use the upper PARTITION_FANOUT num bits to pick the partition so only the
// remaining bits can be used for the hash table.
// TODO: we could switch to 64 bit hashes and then we don't need a max size.
// It might be reasonable to limit individual hash table size for other reasons
// though. Always start with small buffers.
- hash_tbl.reset(HashTable::Create(parent->state_, parent->block_mgr_client_,
- false, 1, NULL, 1L << (32 - NUM_PARTITIONING_BITS),
- PAGG_DEFAULT_HASH_TABLE_SZ));
+ hash_tbl.reset(HashTable::Create(parent->ht_allocator_.get(), false, 1, nullptr,
+ 1L << (32 - NUM_PARTITIONING_BITS), PAGG_DEFAULT_HASH_TABLE_SZ));
// Please update the error message in CreateHashPartitions() if initial size of
// hash table changes.
- return hash_tbl->Init();
+ return hash_tbl->Init(got_memory);
}
Status PartitionedAggregationNode::Partition::SerializeStreamForSpilling() {
DCHECK(!parent->is_streaming_preagg_);
- if (parent->needs_serialize_ && aggregated_row_stream->num_rows() != 0) {
+ if (parent->needs_serialize_) {
// We need to do a lot more work in this case. This step effectively does a merge
// aggregation in this node. We need to serialize the intermediates, spill the
// intermediates and then feed them into the aggregate function's merge step.
@@ -767,70 +783,69 @@ Status PartitionedAggregationNode::Partition::SerializeStreamForSpilling() {
// for those UDAs.
DCHECK(parent->serialize_stream_.get() != NULL);
DCHECK(!parent->serialize_stream_->is_pinned());
- DCHECK(parent->serialize_stream_->has_write_block());
// Serialize and copy the spilled partition's stream into the new stream.
- Status status = Status::OK();
- bool failed_to_add = false;
- BufferedTupleStream* new_stream = parent->serialize_stream_.get();
+ Status status;
+ BufferedTupleStreamV2* new_stream = parent->serialize_stream_.get();
HashTable::Iterator it = hash_tbl->Begin(parent->ht_ctx_.get());
while (!it.AtEnd()) {
Tuple* tuple = it.GetTuple();
it.Next();
AggFnEvaluator::Serialize(agg_fn_evals, tuple);
if (UNLIKELY(!new_stream->AddRow(reinterpret_cast<TupleRow*>(&tuple), &status))) {
- failed_to_add = true;
- break;
+ DCHECK(!status.ok()) << "Stream was unpinned - AddRow() only fails on error";
+ // Even if we can't add to new_stream, finish up processing this agg stream to make
+ // clean up easier (someone has to finalize this stream and we don't want to remember
+ // where we are).
+ parent->CleanupHashTbl(agg_fn_evals, it);
+ hash_tbl->Close();
+ hash_tbl.reset();
+ aggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+ return status;
}
}
- // Even if we can't add to new_stream, finish up processing this agg stream to make
- // clean up easier (someone has to finalize this stream and we don't want to remember
- // where we are).
- if (failed_to_add) {
- parent->CleanupHashTbl(agg_fn_evals, it);
- hash_tbl->Close();
- hash_tbl.reset();
- aggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
- RETURN_IF_ERROR(status);
- return parent->state_->block_mgr()->MemLimitTooLowError(parent->block_mgr_client_,
- parent->id());
- }
- DCHECK(status.ok());
-
aggregated_row_stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
aggregated_row_stream.swap(parent->serialize_stream_);
// Recreate the serialize_stream (and reserve 1 buffer) now in preparation for
// when we need to spill again. We need to have this available before we need
// to spill to make sure it is available. This should be acquirable since we just
// freed at least one buffer from this partition's (old) aggregated_row_stream.
- parent->serialize_stream_.reset(
- new BufferedTupleStream(parent->state_, &parent->intermediate_row_desc_,
- parent->state_->block_mgr(), parent->block_mgr_client_,
- false /* use_initial_small_buffers */, false /* read_write */));
- status = parent->serialize_stream_->Init(parent->id(), parent->runtime_profile(),
- false);
+ parent->serialize_stream_.reset(new BufferedTupleStreamV2(parent->state_,
+ &parent->intermediate_row_desc_, &parent->buffer_pool_client_,
+ parent->resource_profile_.spillable_buffer_size,
+ parent->resource_profile_.spillable_buffer_size));
+ status = parent->serialize_stream_->Init(parent->id(), false);
if (status.ok()) {
bool got_buffer;
status = parent->serialize_stream_->PrepareForWrite(&got_buffer);
- if (status.ok() && !got_buffer) {
- status = parent->state_->block_mgr()->MemLimitTooLowError(
- parent->block_mgr_client_, parent->id());
- }
+ DCHECK(!status.ok() || got_buffer) << "Accounted in min reservation";
}
if (!status.ok()) {
hash_tbl->Close();
hash_tbl.reset();
return status;
}
- DCHECK(parent->serialize_stream_->has_write_block());
+ DCHECK(parent->serialize_stream_->has_write_iterator());
}
return Status::OK();
}
-Status PartitionedAggregationNode::Partition::Spill() {
+void PartitionedAggregationNode::Partition::DiscardAggregatedRowStream() {
+ DCHECK(parent->is_streaming_preagg_);
+ DCHECK(aggregated_row_stream != nullptr);
+ DCHECK_EQ(aggregated_row_stream->num_rows(), 0);
+ if (hash_tbl != nullptr) hash_tbl->Close();
+ hash_tbl.reset();
+ aggregated_row_stream->Close(nullptr, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
+ aggregated_row_stream.reset();
+}
+
+Status PartitionedAggregationNode::Partition::Spill(bool more_aggregate_rows) {
+ DCHECK(!parent->is_streaming_preagg_);
DCHECK(!is_closed);
DCHECK(!is_spilled());
+ RETURN_IF_ERROR(parent->state_->StartSpilling(parent->mem_tracker()));
RETURN_IF_ERROR(SerializeStreamForSpilling());
@@ -846,34 +861,18 @@ Status PartitionedAggregationNode::Partition::Spill() {
hash_tbl->Close();
hash_tbl.reset();
- // Try to switch both streams to IO-sized buffers to avoid allocating small buffers
- // for spilled partition.
- bool got_buffer = true;
- if (aggregated_row_stream->using_small_buffers()) {
- RETURN_IF_ERROR(aggregated_row_stream->SwitchToIoBuffers(&got_buffer));
- }
- // Unpin the stream as soon as possible to increase the chances that the
- // SwitchToIoBuffers() call below will succeed. If we're repartitioning, rows that
- // were already aggregated (rows from the input partition's aggregated stream) will
- // need to be added to this hash partition's aggregated stream, so we need to leave
- // the write block pinned.
- // TODO: when not repartitioning, don't leave the write block pinned.
- DCHECK(!got_buffer || aggregated_row_stream->has_write_block())
- << aggregated_row_stream->DebugString();
- RETURN_IF_ERROR(
- aggregated_row_stream->UnpinStream(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
-
- if (got_buffer && unaggregated_row_stream->using_small_buffers()) {
- RETURN_IF_ERROR(unaggregated_row_stream->SwitchToIoBuffers(&got_buffer));
- }
- if (!got_buffer) {
- // We'll try again to get the buffers when the stream fills up the small buffers.
- VLOG_QUERY << "Not enough memory to switch to IO-sized buffer for partition "
- << this << " of agg=" << parent->id_ << " agg small buffers="
- << aggregated_row_stream->using_small_buffers()
- << " unagg small buffers="
- << unaggregated_row_stream->using_small_buffers();
- VLOG_FILE << GetStackTrace();
+ // Unpin the stream to free memory, but leave a write buffer in place so we can
+ // continue appending rows to one of the streams in the partition.
+ DCHECK(aggregated_row_stream->has_write_iterator());
+ DCHECK(!unaggregated_row_stream->has_write_iterator());
+ if (more_aggregate_rows) {
+ aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
+ } else {
+ aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+ bool got_buffer;
+ RETURN_IF_ERROR(unaggregated_row_stream->PrepareForWrite(&got_buffer));
+ DCHECK(got_buffer)
+ << "Accounted in min reservation" << parent->buffer_pool_client_.DebugString();
}
COUNTER_ADD(parent->num_spilled_partitions_, 1);
@@ -933,33 +932,27 @@ Tuple* PartitionedAggregationNode::ConstructIntermediateTuple(
}
Tuple* PartitionedAggregationNode::ConstructIntermediateTuple(
- const vector<AggFnEvaluator*>& agg_fn_evals, BufferedTupleStream* stream,
+ const vector<AggFnEvaluator*>& agg_fn_evals, BufferedTupleStreamV2* stream,
Status* status) noexcept {
DCHECK(stream != NULL && status != NULL);
// Allocate space for the entire tuple in the stream.
const int fixed_size = intermediate_tuple_desc_->byte_size();
const int varlen_size = GroupingExprsVarlenSize();
- uint8_t* varlen_buffer;
- uint8_t* fixed_buffer = stream->AllocateRow(fixed_size, varlen_size, &varlen_buffer,
- status);
- if (UNLIKELY(fixed_buffer == NULL)) {
- if (!status->ok() || !stream->using_small_buffers()) return NULL;
- // IMPALA-2352: Make a best effort to switch to IO buffers and re-allocate.
- // If SwitchToIoBuffers() fails the caller of this function can try to free
- // some space, e.g. through spilling, and re-attempt to allocate space for
- // this row.
- bool got_buffer;
- *status = stream->SwitchToIoBuffers(&got_buffer);
- if (!status->ok() || !got_buffer) return NULL;
- fixed_buffer = stream->AllocateRow(fixed_size, varlen_size, &varlen_buffer, status);
- if (fixed_buffer == NULL) return NULL;
- }
-
- Tuple* intermediate_tuple = reinterpret_cast<Tuple*>(fixed_buffer);
- intermediate_tuple->Init(fixed_size);
- CopyGroupingValues(intermediate_tuple, varlen_buffer, varlen_size);
- InitAggSlots(agg_fn_evals, intermediate_tuple);
- return intermediate_tuple;
+ const int tuple_size = fixed_size + varlen_size;
+ uint8_t* tuple_data = stream->AddRowCustomBegin(tuple_size, status);
+ if (UNLIKELY(tuple_data == nullptr)) {
+ // If we failed to allocate and did not hit an error (indicated by a non-ok status),
+ // the caller of this function can try to free some space, e.g. through spilling, and
+ // re-attempt to allocate space for this row.
+ return nullptr;
+ }
+ Tuple* tuple = reinterpret_cast<Tuple*>(tuple_data);
+ tuple->Init(fixed_size);
+ uint8_t* varlen_buffer = tuple_data + fixed_size;
+ CopyGroupingValues(tuple, varlen_buffer, varlen_size);
+ InitAggSlots(agg_fn_evals, tuple);
+ stream->AddRowCustomEnd(tuple_size);
+ return tuple;
}
int PartitionedAggregationNode::GroupingExprsVarlenSize() {
@@ -1079,30 +1072,30 @@ Tuple* PartitionedAggregationNode::GetOutputTuple(
return dst;
}
-Status PartitionedAggregationNode::AppendSpilledRow(BufferedTupleStream* stream,
- TupleRow* row) {
- DCHECK(stream != NULL);
+template <bool AGGREGATED_ROWS>
+Status PartitionedAggregationNode::AppendSpilledRow(
+ Partition* __restrict__ partition, TupleRow* __restrict__ row) {
+ DCHECK(!is_streaming_preagg_);
+ DCHECK(partition->is_spilled());
+ BufferedTupleStreamV2* stream = AGGREGATED_ROWS ?
+ partition->aggregated_row_stream.get() :
+ partition->unaggregated_row_stream.get();
DCHECK(!stream->is_pinned());
- DCHECK(stream->has_write_block());
- if (LIKELY(stream->AddRow(row, &process_batch_status_))) return Status::OK();
+ Status status;
+ if (LIKELY(stream->AddRow(row, &status))) return Status::OK();
+ RETURN_IF_ERROR(status);
- // Adding fails iff either we hit an error or haven't switched to I/O buffers.
- RETURN_IF_ERROR(process_batch_status_);
+ // Keep trying to free memory by spilling until we succeed or hit an error.
+ // Running out of partitions to spill is treated as an error by SpillPartition().
while (true) {
- bool got_buffer;
- RETURN_IF_ERROR(stream->SwitchToIoBuffers(&got_buffer));
- if (got_buffer) break;
- RETURN_IF_ERROR(SpillPartition());
+ RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS));
+ if (stream->AddRow(row, &status)) return Status::OK();
+ RETURN_IF_ERROR(status);
}
-
- // Adding the row should succeed after the I/O buffer switch.
- if (stream->AddRow(row, &process_batch_status_)) return Status::OK();
- DCHECK(!process_batch_status_.ok());
- return process_batch_status_;
}
-void PartitionedAggregationNode::DebugString(int indentation_level,
- stringstream* out) const {
+void PartitionedAggregationNode::DebugString(
+ int indentation_level, stringstream* out) const {
*out << string(indentation_level * 2, ' ');
*out << "PartitionedAggregationNode("
<< "intermediate_tuple_id=" << intermediate_tuple_id_
@@ -1114,85 +1107,100 @@ void PartitionedAggregationNode::DebugString(int indentation_level,
*out << ")";
}
-Status PartitionedAggregationNode::CreateHashPartitions(int level) {
+Status PartitionedAggregationNode::CreateHashPartitions(
+ int level, int single_partition_idx) {
if (is_streaming_preagg_) DCHECK_EQ(level, 0);
if (UNLIKELY(level >= MAX_PARTITION_DEPTH)) {
- return Status(TErrorCode::PARTITIONED_AGG_MAX_PARTITION_DEPTH, id_, MAX_PARTITION_DEPTH);
+ return Status(
+ TErrorCode::PARTITIONED_AGG_MAX_PARTITION_DEPTH, id_, MAX_PARTITION_DEPTH);
}
ht_ctx_->set_level(level);
DCHECK(hash_partitions_.empty());
+ int num_partitions_created = 0;
for (int i = 0; i < PARTITION_FANOUT; ++i) {
- Partition* new_partition = new Partition(this, level);
- DCHECK(new_partition != NULL);
- hash_partitions_.push_back(partition_pool_->Add(new_partition));
- RETURN_IF_ERROR(new_partition->InitStreams());
- hash_tbls_[i] = NULL;
- }
- if (!is_streaming_preagg_) {
- DCHECK_GT(state_->block_mgr()->num_reserved_buffers_remaining(block_mgr_client_), 0);
+ hash_tbls_[i] = nullptr;
+ if (single_partition_idx == -1 || i == single_partition_idx) {
+ Partition* new_partition = partition_pool_->Add(new Partition(this, level, i));
+ ++num_partitions_created;
+ hash_partitions_.push_back(new_partition);
+ RETURN_IF_ERROR(new_partition->InitStreams());
+ } else {
+ hash_partitions_.push_back(nullptr);
+ }
}
-
// Now that all the streams are reserved (meaning we have enough memory to execute
// the algorithm), allocate the hash tables. These can fail and we can still continue.
for (int i = 0; i < PARTITION_FANOUT; ++i) {
- if (UNLIKELY(!hash_partitions_[i]->InitHashTable())) {
- // We don't spill on preaggregations. If we have so little memory that we can't
- // allocate small hash tables, the mem limit is just too low.
- if (is_streaming_preagg_) {
- int64_t alloc_size = PAGG_DEFAULT_HASH_TABLE_SZ * HashTable::BucketSize();
- string details = Substitute("Cannot perform aggregation at node with id $0."
- " Failed to initialize hash table in preaggregation. The memory limit"
- " is too low to execute the query.", id_);
- return mem_tracker()->MemLimitExceeded(state_, details, alloc_size);
+ Partition* partition = hash_partitions_[i];
+ if (partition == nullptr) continue;
+ if (partition->aggregated_row_stream == nullptr) {
+ // Failed to create the aggregated row stream - cannot create a hash table.
+ // Just continue with a NULL hash table so rows will be passed through.
+ DCHECK(is_streaming_preagg_);
+ } else {
+ bool got_memory;
+ RETURN_IF_ERROR(partition->InitHashTable(&got_memory));
+ // Spill the partition if we cannot create a hash table for a merge aggregation.
+ if (UNLIKELY(!got_memory)) {
+ if (is_streaming_preagg_) {
+ partition->DiscardAggregatedRowStream();
+ } else {
+ // If we're repartitioning, we will be writing aggregated rows first.
+ RETURN_IF_ERROR(partition->Spill(level > 0));
+ }
}
- RETURN_IF_ERROR(hash_partitions_[i]->Spill());
}
- hash_tbls_[i] = hash_partitions_[i]->hash_tbl.get();
+ hash_tbls_[i] = partition->hash_tbl.get();
}
- COUNTER_ADD(partitions_created_, hash_partitions_.size());
+ COUNTER_ADD(partitions_created_, num_partitions_created);
if (!is_streaming_preagg_) {
COUNTER_SET(max_partition_level_, level);
}
return Status::OK();
}
-Status PartitionedAggregationNode::CheckAndResizeHashPartitions(int num_rows,
- const HashTableCtx* ht_ctx) {
+Status PartitionedAggregationNode::CheckAndResizeHashPartitions(
+ bool partitioning_aggregated_rows, int num_rows, const HashTableCtx* ht_ctx) {
DCHECK(!is_streaming_preagg_);
for (int i = 0; i < PARTITION_FANOUT; ++i) {
Partition* partition = hash_partitions_[i];
+ if (partition == nullptr) continue;
while (!partition->is_spilled()) {
{
SCOPED_TIMER(ht_resize_timer_);
- if (partition->hash_tbl->CheckAndResize(num_rows, ht_ctx)) break;
+ bool resized;
+ RETURN_IF_ERROR(partition->hash_tbl->CheckAndResize(num_rows, ht_ctx, &resized));
+ if (resized) break;
}
- RETURN_IF_ERROR(SpillPartition());
+ RETURN_IF_ERROR(SpillPartition(partitioning_aggregated_rows));
}
}
return Status::OK();
}
int64_t PartitionedAggregationNode::LargestSpilledPartition() const {
+ DCHECK(!is_streaming_preagg_);
int64_t max_rows = 0;
for (int i = 0; i < hash_partitions_.size(); ++i) {
Partition* partition = hash_partitions_[i];
- if (partition->is_closed || !partition->is_spilled()) continue;
- int64_t rows = partition->aggregated_row_stream->num_rows() +
- partition->unaggregated_row_stream->num_rows();
+ if (partition == nullptr || partition->is_closed || !partition->is_spilled()) {
+ continue;
+ }
+ int64_t rows = partition->aggregated_row_stream->num_rows()
+ + partition->unaggregated_row_stream->num_rows();
if (rows > max_rows) max_rows = rows;
}
return max_rows;
}
Status PartitionedAggregationNode::NextPartition() {
- DCHECK(output_partition_ == NULL);
+ DCHECK(output_partition_ == nullptr);
// Keep looping until we get to a partition that fits in memory.
- Partition* partition = NULL;
+ Partition* partition = nullptr;
while (true) {
- partition = NULL;
// First return partitions that are fully aggregated (and in memory).
if (!aggregated_partitions_.empty()) {
partition = aggregated_partitions_.front();
@@ -1201,56 +1209,23 @@ Status PartitionedAggregationNode::NextPartition() {
break;
}
- if (partition == NULL) {
- DCHECK(!spilled_partitions_.empty());
- DCHECK(!is_streaming_preagg_);
- DCHECK_EQ(state_->block_mgr()->num_pinned_buffers(block_mgr_client_),
- needs_serialize_ ? 1 : 0);
-
- // TODO: we can probably do better than just picking the first partition. We
- // can base this on the amount written to disk, etc.
- partition = spilled_partitions_.front();
- DCHECK(partition->is_spilled());
-
- // Create the new hash partitions to repartition into.
- // TODO: we don't need to repartition here. We are now working on 1 / FANOUT
- // of the input so it's reasonably likely it can fit. We should look at this
- // partitions size and just do the aggregation if it fits in memory.
- RETURN_IF_ERROR(CreateHashPartitions(partition->level + 1));
- COUNTER_ADD(num_repartitions_, 1);
-
- // Rows in this partition could have been spilled into two streams, depending
- // on if it is an aggregated intermediate, or an unaggregated row.
- // Note: we must process the aggregated rows first to save a hash table lookup
- // in ProcessBatch().
- RETURN_IF_ERROR(ProcessStream<true>(partition->aggregated_row_stream.get()));
- RETURN_IF_ERROR(ProcessStream<false>(partition->unaggregated_row_stream.get()));
-
- COUNTER_ADD(num_row_repartitioned_, partition->aggregated_row_stream->num_rows());
- COUNTER_ADD(num_row_repartitioned_,
- partition->unaggregated_row_stream->num_rows());
+ // No aggregated partitions in memory - we should not be using any reservation aside
+ // from 'serialize_stream_'.
+ DCHECK_EQ(serialize_stream_ != nullptr ? serialize_stream_->BytesPinned(false) : 0,
+ buffer_pool_client_.GetUsedReservation()) << buffer_pool_client_.DebugString();
- partition->Close(false);
- spilled_partitions_.pop_front();
-
- // Done processing this partition. Move the new partitions into
- // spilled_partitions_/aggregated_partitions_.
- int64_t num_input_rows = partition->aggregated_row_stream->num_rows() +
- partition->unaggregated_row_stream->num_rows();
-
- // Check if there was any reduction in the size of partitions after repartitioning.
- int64_t largest_partition = LargestSpilledPartition();
- DCHECK_GE(num_input_rows, largest_partition) << "Cannot have a partition with "
- "more rows than the input";
- if (UNLIKELY(num_input_rows == largest_partition)) {
- return Status(TErrorCode::PARTITIONED_AGG_REPARTITION_FAILS, id_,
- partition->level + 1, num_input_rows);
- }
- RETURN_IF_ERROR(MoveHashPartitions(num_input_rows));
- }
- }
+ // Try to fit a single spilled partition in memory. We can often do this because
+ // we only need to fit 1/PARTITION_FANOUT of the data in memory.
+ // TODO: in some cases when the partition probably won't fit in memory it could
+ // be better to skip directly to repartitioning.
+ RETURN_IF_ERROR(BuildSpilledPartition(&partition));
+ if (partition != nullptr) break;
- DCHECK(partition->hash_tbl.get() != NULL);
+ // If we can't fit the partition in memory, repartition it.
+ RETURN_IF_ERROR(RepartitionSpilledPartition());
+ }
+ DCHECK(!partition->is_spilled());
+ DCHECK(partition->hash_tbl.get() != nullptr);
DCHECK(partition->aggregated_row_stream->is_pinned());
output_partition_ = partition;
@@ -1259,8 +1234,105 @@ Status PartitionedAggregationNode::NextPartition() {
return Status::OK();
}
-template<bool AGGREGATED_ROWS>
-Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream* input_stream) {
+Status PartitionedAggregationNode::BuildSpilledPartition(Partition** built_partition) {
+ DCHECK(!spilled_partitions_.empty());
+ DCHECK(!is_streaming_preagg_);
+ // Leave the partition in 'spilled_partitions_' to be closed if we hit an error.
+ Partition* src_partition = spilled_partitions_.front();
+ DCHECK(src_partition->is_spilled());
+
+ // Create a new hash partition from the rows of the spilled partition. This is simpler
+ // than trying to finish building a partially-built partition in place. We only
+ // initialise one hash partition that all rows in 'src_partition' will hash to.
+ RETURN_IF_ERROR(CreateHashPartitions(src_partition->level, src_partition->idx));
+ Partition* dst_partition = hash_partitions_[src_partition->idx];
+ DCHECK(dst_partition != nullptr);
+
+ // Rebuild the hash table over spilled aggregate rows then start adding unaggregated
+ // rows to the hash table. It's possible the partition will spill at either stage.
+ // In that case we need to finish processing 'src_partition' so that all rows are
+ // appended to 'dst_partition'.
+ // TODO: if the partition spills again but the aggregation reduces the input
+ // significantly, we could do better here by keeping the incomplete hash table in
+ // memory and only spilling unaggregated rows that didn't fit in the hash table
+ // (somewhat similar to the passthrough pre-aggregation).
+ RETURN_IF_ERROR(ProcessStream<true>(src_partition->aggregated_row_stream.get()));
+ RETURN_IF_ERROR(ProcessStream<false>(src_partition->unaggregated_row_stream.get()));
+ src_partition->Close(false);
+ spilled_partitions_.pop_front();
+ hash_partitions_.clear();
+
+ if (dst_partition->is_spilled()) {
+ PushSpilledPartition(dst_partition);
+ *built_partition = nullptr;
+ // Spilled the partition - we should not be using any reservation except from
+ // 'serialize_stream_'.
+ DCHECK_EQ(serialize_stream_ != nullptr ? serialize_stream_->BytesPinned(false) : 0,
+ buffer_pool_client_.GetUsedReservation()) << buffer_pool_client_.DebugString();
+ } else {
+ *built_partition = dst_partition;
+ }
+ return Status::OK();
+}
+
+Status PartitionedAggregationNode::RepartitionSpilledPartition() {
+ DCHECK(!spilled_partitions_.empty());
+ DCHECK(!is_streaming_preagg_);
+ // Leave the partition in 'spilled_partitions_' to be closed if we hit an error.
+ Partition* partition = spilled_partitions_.front();
+ DCHECK(partition->is_spilled());
+
+ // Create the new hash partitions to repartition into. This will allocate a
+ // write buffer for each partition's aggregated row stream.
+ RETURN_IF_ERROR(CreateHashPartitions(partition->level + 1));
+ COUNTER_ADD(num_repartitions_, 1);
+
+ // Rows in this partition could have been spilled into two streams, depending
+ // on if it is an aggregated intermediate, or an unaggregated row. Aggregated
+ // rows are processed first to save a hash table lookup in ProcessBatch().
+ RETURN_IF_ERROR(ProcessStream<true>(partition->aggregated_row_stream.get()));
+
+ // Prepare write buffers so we can append spilled rows to unaggregated partitions.
+ for (Partition* hash_partition : hash_partitions_) {
+ if (!hash_partition->is_spilled()) continue;
+ // The aggregated rows have been repartitioned. Free up at least a buffer's worth of
+ // reservation and use it to pin the unaggregated write buffer.
+ hash_partition->aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+ bool got_buffer;
+ RETURN_IF_ERROR(
+ hash_partition->unaggregated_row_stream->PrepareForWrite(&got_buffer));
+ DCHECK(got_buffer)
+ << "Accounted in min reservation" << buffer_pool_client_.DebugString();
+ }
+ RETURN_IF_ERROR(ProcessStream<false>(partition->unaggregated_row_stream.get()));
+
+ COUNTER_ADD(num_row_repartitioned_, partition->aggregated_row_stream->num_rows());
+ COUNTER_ADD(num_row_repartitioned_, partition->unaggregated_row_stream->num_rows());
+
+ partition->Close(false);
+ spilled_partitions_.pop_front();
+
+ // Done processing this partition. Move the new partitions into
+ // spilled_partitions_/aggregated_partitions_.
+ int64_t num_input_rows = partition->aggregated_row_stream->num_rows()
+ + partition->unaggregated_row_stream->num_rows();
+
+ // Check if there was any reduction in the size of partitions after repartitioning.
+ int64_t largest_partition = LargestSpilledPartition();
+ DCHECK_GE(num_input_rows, largest_partition) << "Partition had more rows than input";
+ if (UNLIKELY(num_input_rows == largest_partition)) {
+ stringstream ss;
+ DebugString(2, &ss);
+ return Status(TErrorCode::PARTITIONED_AGG_REPARTITION_FAILS, id_,
+ partition->level + 1, num_input_rows, buffer_pool_client_.DebugString(),
+ ss.str());
+ }
+ RETURN_IF_ERROR(MoveHashPartitions(num_input_rows));
+ return Status::OK();
+}
+
+template <bool AGGREGATED_ROWS>
+Status PartitionedAggregationNode::ProcessStream(BufferedTupleStreamV2* input_stream) {
DCHECK(!is_streaming_preagg_);
if (input_stream->num_rows() > 0) {
while (true) {
@@ -1268,7 +1340,7 @@ Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream* input_stre
RETURN_IF_ERROR(input_stream->PrepareForRead(true, &got_buffer));
if (got_buffer) break;
// Did not have a buffer to read the input stream. Spill and try again.
- RETURN_IF_ERROR(SpillPartition());
+ RETURN_IF_ERROR(SpillPartition(AGGREGATED_ROWS));
}
TPrefetchMode::type prefetch_mode = state_->query_options().prefetch_mode;
@@ -1288,16 +1360,17 @@ Status PartitionedAggregationNode::ProcessStream(BufferedTupleStream* input_stre
return Status::OK();
}
-Status PartitionedAggregationNode::SpillPartition() {
+Status PartitionedAggregationNode::SpillPartition(bool more_aggregate_rows) {
int64_t max_freed_mem = 0;
int partition_idx = -1;
// Iterate over the partitions and pick the largest partition that is not spilled.
for (int i = 0; i < hash_partitions_.size(); ++i) {
+ if (hash_partitions_[i] == nullptr) continue;
if (hash_partitions_[i]->is_closed) continue;
if (hash_partitions_[i]->is_spilled()) continue;
// Pass 'true' because we need to keep the write block pinned. See Partition::Spill().
- int64_t mem = hash_partitions_[i]->aggregated_row_stream->bytes_in_mem(true);
+ int64_t mem = hash_partitions_[i]->aggregated_row_stream->BytesPinned(true);
mem += hash_partitions_[i]->hash_tbl->ByteSize();
mem += hash_partitions_[i]->agg_fn_pool->total_reserved_bytes();
DCHECK_GT(mem, 0); // At least the hash table buckets should occupy memory.
@@ -1306,26 +1379,26 @@ Status PartitionedAggregationNode::SpillPartition() {
partition_idx = i;
}
}
- if (partition_idx == -1) {
- // Could not find a partition to spill. This means the mem limit was just too low.
- return state_->block_mgr()->MemLimitTooLowError(block_mgr_client_, id());
- }
-
+ DCHECK_NE(partition_idx, -1) << "Should have been able to spill a partition to "
+ << "reclaim memory: " << buffer_pool_client_.DebugString();
hash_tbls_[partition_idx] = NULL;
- return hash_partitions_[partition_idx]->Spill();
+ return hash_partitions_[partition_idx]->Spill(more_aggregate_rows);
}
Status PartitionedAggregationNode::MoveHashPartitions(int64_t num_input_rows) {
DCHECK(!hash_partitions_.empty());
stringstream ss;
- ss << "PA(node_id=" << id() << ") partitioned(level="
- << hash_partitions_[0]->level << ") "
- << num_input_rows << " rows into:" << endl;
+ ss << "PA(node_id=" << id() << ") partitioned(level=" << hash_partitions_[0]->level
+ << ") " << num_input_rows << " rows into:" << endl;
for (int i = 0; i < hash_partitions_.size(); ++i) {
Partition* partition = hash_partitions_[i];
- int64_t aggregated_rows = partition->aggregated_row_stream->num_rows();
+ if (partition == nullptr) continue;
+ int64_t aggregated_rows = 0;
+ if (partition->aggregated_row_stream != nullptr) {
+ aggregated_rows = partition->aggregated_row_stream->num_rows();
+ }
int64_t unaggregated_rows = 0;
- if (partition->unaggregated_row_stream != NULL) {
+ if (partition->unaggregated_row_stream != nullptr) {
unaggregated_rows = partition->unaggregated_row_stream->num_rows();
}
double total_rows = aggregated_rows + unaggregated_rows;
@@ -1341,54 +1414,46 @@ Status PartitionedAggregationNode::MoveHashPartitions(int64_t num_input_rows) {
if (total_rows == 0) {
partition->Close(false);
} else if (partition->is_spilled()) {
- DCHECK(partition->hash_tbl.get() == NULL);
- // We need to unpin all the spilled partitions to make room to allocate new
- // hash_partitions_ when we repartition the spilled partitions.
- // TODO: we only need to do this when we have memory pressure. This might be
- // okay though since the block mgr should only write these to disk if there
- // is memory pressure.
- RETURN_IF_ERROR(
- partition->aggregated_row_stream->UnpinStream(BufferedTupleStream::UNPIN_ALL));
- RETURN_IF_ERROR(partition->unaggregated_row_stream->UnpinStream(
- BufferedTupleStream::UNPIN_ALL));
-
- // Push new created partitions at the front. This means a depth first walk
- // (more finely partitioned partitions are processed first). This allows us
- // to delete blocks earlier and bottom out the recursion earlier.
- spilled_partitions_.push_front(partition);
+ PushSpilledPartition(partition);
} else {
aggregated_partitions_.push_back(partition);
}
-
}
VLOG(2) << ss.str();
hash_partitions_.clear();
return Status::OK();
}
+void PartitionedAggregationNode::PushSpilledPartition(Partition* partition) {
+ DCHECK(partition->is_spilled());
+ DCHECK(partition->hash_tbl == nullptr);
+ // Ensure all pages in the spilled partition's streams are unpinned by invalidating
+ // the streams' read and write iterators. We may need all the memory to process the
+ // next spilled partitions.
+ partition->aggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+ partition->unaggregated_row_stream->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
+ spilled_partitions_.push_front(partition);
+}
+
void PartitionedAggregationNode::ClosePartitions() {
- for (int i = 0; i < hash_partitions_.size(); ++i) {
- hash_partitions_[i]->Close(true);
- }
- for (list<Partition*>::iterator it = aggregated_partitions_.begin();
- it != aggregated_partitions_.end(); ++it) {
- (*it)->Close(true);
- }
- for (list<Partition*>::iterator it = spilled_partitions_.begin();
- it != spilled_partitions_.end(); ++it) {
- (*it)->Close(true);
+ for (Partition* partition : hash_partitions_) {
+ if (partition != nullptr) partition->Close(true);
}
+ hash_partitions_.clear();
+ for (Partition* partition : aggregated_partitions_) partition->Close(true);
aggregated_partitions_.clear();
+ for (Partition* partition : spilled_partitions_) partition->Close(true);
spilled_partitions_.clear();
- hash_partitions_.clear();
memset(hash_tbls_, 0, sizeof(hash_tbls_));
partition_pool_->Clear();
}
Status PartitionedAggregationNode::QueryMaintenance(RuntimeState* state) {
AggFnEvaluator::FreeLocalAllocations(agg_fn_evals_);
- for (int i = 0; i < hash_partitions_.size(); ++i) {
- AggFnEvaluator::FreeLocalAllocations(hash_partitions_[i]->agg_fn_evals);
+ for (Partition* partition : hash_partitions_) {
+ if (partition != nullptr) {
+ AggFnEvaluator::FreeLocalAllocations(partition->agg_fn_evals);
+ }
}
if (ht_ctx_.get() != nullptr) ht_ctx_->FreeLocalAllocations();
return ExecNode::QueryMaintenance(state);
@@ -1972,4 +2037,8 @@ Status PartitionedAggregationNode::CodegenProcessBatchStreaming(
return Status::OK();
}
+// Instantiate required templates.
+template Status PartitionedAggregationNode::AppendSpilledRow<false>(
+ Partition*, TupleRow*);
+template Status PartitionedAggregationNode::AppendSpilledRow<true>(Partition*, TupleRow*);
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-aggregation-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.h b/be/src/exec/partitioned-aggregation-node.h
index 066dc28..4f8b622 100644
--- a/be/src/exec/partitioned-aggregation-node.h
+++ b/be/src/exec/partitioned-aggregation-node.h
@@ -19,13 +19,15 @@
#ifndef IMPALA_EXEC_PARTITIONED_AGGREGATION_NODE_H
#define IMPALA_EXEC_PARTITIONED_AGGREGATION_NODE_H
+#include <deque>
+
#include <boost/scoped_ptr.hpp>
#include "exec/exec-node.h"
#include "exec/hash-table.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/buffered-tuple-stream.h"
-#include "runtime/descriptors.h" // for TupleId
+#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/bufferpool/suballocator.h"
+#include "runtime/descriptors.h" // for TupleId
#include "runtime/mem-pool.h"
#include "runtime/string-value.h"
@@ -229,7 +231,9 @@ class PartitionedAggregationNode : public ExecNode {
std::vector<int> string_grouping_exprs_;
RuntimeState* state_;
- BufferedBlockMgr::Client* block_mgr_client_;
+
+ /// Allocator for hash table memory.
+ boost::scoped_ptr<Suballocator> ht_allocator_;
/// MemPool used to allocate memory for when we don't have grouping and don't initialize
/// the partitioning structures, or during Close() when creating new output tuples.
@@ -337,12 +341,12 @@ class PartitionedAggregationNode : public ExecNode {
HashTable* hash_tbls_[PARTITION_FANOUT];
/// All partitions that have been spilled and need further processing.
- std::list<Partition*> spilled_partitions_;
+ std::deque<Partition*> spilled_partitions_;
/// All partitions that are aggregated and can just return the results in GetNext().
/// After consuming all the input, hash_partitions_ is split into spilled_partitions_
/// and aggregated_partitions_, depending on if it was spilled or not.
- std::list<Partition*> aggregated_partitions_;
+ std::deque<Partition*> aggregated_partitions_;
/// END: Members that must be Reset()
/////////////////////////////////////////
@@ -352,31 +356,42 @@ class PartitionedAggregationNode : public ExecNode {
/// initially use small buffers. Streaming pre-aggregations do not spill and do not
/// require an unaggregated stream.
struct Partition {
- Partition(PartitionedAggregationNode* parent, int level)
- : parent(parent), is_closed(false), level(level) {}
+ Partition(PartitionedAggregationNode* parent, int level, int idx)
+ : parent(parent), is_closed(false), level(level), idx(idx) {}
~Partition();
/// Initializes aggregated_row_stream and unaggregated_row_stream (if a spilling
- /// aggregation), reserving one buffer for each. The buffers backing these streams
- /// are reserved, so this function will not fail with a continuable OOM. If we fail
- /// to init these buffers, the mem limit is too low to run this algorithm.
- Status InitStreams();
+ /// aggregation), allocating one buffer for each. Spilling merge aggregations must
+ /// have enough reservation for the initial buffer for the stream, so this should
+ /// not fail due to OOM. Preaggregations do not reserve any buffers: if does not
+ /// have enough reservation for the initial buffer, the aggregated row stream is not
+ /// created and an OK status is returned.
+ Status InitStreams() WARN_UNUSED_RESULT;
- /// Initializes the hash table. Returns false on OOM.
- bool InitHashTable();
+ /// Initializes the hash table. 'aggregated_row_stream' must be non-NULL.
+ /// Sets 'got_memory' to true if the hash table was initialised or false on OOM.
+ Status InitHashTable(bool* got_memory) WARN_UNUSED_RESULT;
/// Called in case we need to serialize aggregated rows. This step effectively does
/// a merge aggregation in this node.
- Status SerializeStreamForSpilling();
+ Status SerializeStreamForSpilling() WARN_UNUSED_RESULT;
/// Closes this partition. If finalize_rows is true, this iterates over all rows
/// in aggregated_row_stream and finalizes them (this is only used in the cancellation
/// path).
void Close(bool finalize_rows);
- /// Spills this partition, unpinning streams and cleaning up hash tables as necessary.
- Status Spill();
+ /// Spill this partition. 'more_aggregate_rows' = true means that more aggregate rows
+ /// may be appended to the partition before appending unaggregated rows. On
+ /// success, one of the streams is left with a write iterator: the aggregated stream
+ /// if 'more_aggregate_rows' is true or the unaggregated stream otherwise.
+ Status Spill(bool more_aggregate_rows) WARN_UNUSED_RESULT;
+
+ /// Discards the aggregated row stream and hash table. Only valid to call if this is
+ /// a streaming preaggregation and the initial memory allocation for hash tables or
+ /// the aggregated stream failed. The aggregated stream must have 0 rows.
+ void DiscardAggregatedRowStream();
bool is_spilled() const { return hash_tbl.get() == NULL; }
@@ -390,9 +405,12 @@ class PartitionedAggregationNode : public ExecNode {
/// etc.
const int level;
+ /// The index of this partition within 'hash_partitions_' at its level.
+ const int idx;
+
/// Hash table for this partition.
/// Can be NULL if this partition is no longer maintaining a hash table (i.e.
- /// is spilled).
+ /// is spilled or we are passing through all rows for this partition).
boost::scoped_ptr<HashTable> hash_tbl;
/// Clone of parent's agg_fn_evals_ and backing MemPool.
@@ -401,18 +419,24 @@ class PartitionedAggregationNode : public ExecNode {
/// Tuple stream used to store aggregated rows. When the partition is not spilled,
/// (meaning the hash table is maintained), this stream is pinned and contains the
- /// memory referenced by the hash table. When it is spilled, aggregate rows are
- /// just appended to this stream.
- boost::scoped_ptr<BufferedTupleStream> aggregated_row_stream;
+ /// memory referenced by the hash table. When it is spilled, this consumes reservation
+ /// for a write buffer only during repartitioning of aggregated rows.
+ ///
+ /// For streaming preaggs, this may be NULL if sufficient memory is not available.
+ /// In that case hash_tbl is also NULL and all rows for the partition will be passed
+ /// through.
+ boost::scoped_ptr<BufferedTupleStreamV2> aggregated_row_stream;
/// Unaggregated rows that are spilled. Always NULL for streaming pre-aggregations.
- boost::scoped_ptr<BufferedTupleStream> unaggregated_row_stream;
+ /// Always unpinned. Has a write buffer allocated when the partition is spilled and
+ /// unaggregated rows are being processed.
+ boost::scoped_ptr<BufferedTupleStreamV2> unaggregated_row_stream;
};
/// Stream used to store serialized spilled rows. Only used if needs_serialize_
  /// is set. This stream is never pinned and only used in Partition::Spill as a
  /// temporary buffer.
- boost::scoped_ptr<BufferedTupleStream> serialize_stream_;
+ boost::scoped_ptr<BufferedTupleStreamV2> serialize_stream_;
/// Accessor for 'hash_tbls_' that verifies consistency with the partitions.
HashTable* ALWAYS_INLINE GetHashTable(int partition_idx) {
@@ -447,7 +471,7 @@ class PartitionedAggregationNode : public ExecNode {
/// FunctionContexts, so is stored outside the stream. If stream's small buffers get
/// full, it will attempt to switch to IO-buffers.
Tuple* ConstructIntermediateTuple(const std::vector<AggFnEvaluator*>& agg_fn_evals,
- BufferedTupleStream* stream, Status* status) noexcept;
+ BufferedTupleStreamV2* stream, Status* status) noexcept;
/// Constructs intermediate tuple, allocating memory from pool instead of the stream.
/// Returns NULL and sets status if there is not enough memory to allocate the tuple.
@@ -495,7 +519,7 @@ class PartitionedAggregationNode : public ExecNode {
/// Do the aggregation for all tuple rows in the batch when there is no grouping.
/// This function is replaced by codegen.
- Status ProcessBatchNoGrouping(RowBatch* batch);
+ Status ProcessBatchNoGrouping(RowBatch* batch) WARN_UNUSED_RESULT;
/// Processes a batch of rows. This is the core function of the algorithm. We partition
/// the rows into hash_partitions_, spilling as necessary.
@@ -507,9 +531,9 @@ class PartitionedAggregationNode : public ExecNode {
//
/// This function is replaced by codegen. We pass in ht_ctx_.get() as an argument for
/// performance.
- template<bool AGGREGATED_ROWS>
- Status IR_ALWAYS_INLINE ProcessBatch(RowBatch* batch,
- TPrefetchMode::type prefetch_mode, HashTableCtx* ht_ctx);
+ template <bool AGGREGATED_ROWS>
+ Status IR_ALWAYS_INLINE ProcessBatch(RowBatch* batch, TPrefetchMode::type prefetch_mode,
+ HashTableCtx* ht_ctx) WARN_UNUSED_RESULT;
/// Evaluates the rows in 'batch' starting at 'start_row_idx' and stores the results in
/// the expression values cache in 'ht_ctx'. The number of rows evaluated depends on
@@ -524,7 +548,8 @@ class PartitionedAggregationNode : public ExecNode {
/// ProcessBatch for codegen to substitute function calls with codegen'd versions.
/// May spill partitions if not enough memory is available.
template <bool AGGREGATED_ROWS>
- Status IR_ALWAYS_INLINE ProcessRow(TupleRow* row, HashTableCtx* ht_ctx);
+ Status IR_ALWAYS_INLINE ProcessRow(
+ TupleRow* row, HashTableCtx* ht_ctx) WARN_UNUSED_RESULT;
/// Create a new intermediate tuple in partition, initialized with row. ht_ctx is
/// the context for the partition's hash table and hash is the precomputed hash of
@@ -533,35 +558,33 @@ class PartitionedAggregationNode : public ExecNode {
/// tuple to the partition's stream. Must be inlined into ProcessBatch for codegen
/// to substitute function calls with codegen'd versions. insert_it is an iterator
/// for insertion returned from HashTable::FindBuildRowBucket().
- template<bool AGGREGATED_ROWS>
- Status IR_ALWAYS_INLINE AddIntermediateTuple(Partition* partition,
- TupleRow* row, uint32_t hash, HashTable::Iterator insert_it);
-
- /// Append a row to a spilled partition. May spill partitions if needed to switch to
- /// I/O buffers. Selects the correct stream according to the argument. Inlined into
- /// ProcessBatch().
- template<bool AGGREGATED_ROWS>
- Status IR_ALWAYS_INLINE AppendSpilledRow(Partition* partition, TupleRow* row);
+ template <bool AGGREGATED_ROWS>
+ Status IR_ALWAYS_INLINE AddIntermediateTuple(Partition* partition, TupleRow* row,
+ uint32_t hash, HashTable::Iterator insert_it) WARN_UNUSED_RESULT;
- /// Append a row to a stream of a spilled partition. May spill partitions if needed
- /// to append the row.
- Status AppendSpilledRow(BufferedTupleStream* stream, TupleRow* row);
+ /// Append a row to a spilled partition. The row may be aggregated or unaggregated
+ /// according to AGGREGATED_ROWS. May spill partitions if needed to append the row
+ /// buffers.
+ template <bool AGGREGATED_ROWS>
+ Status IR_ALWAYS_INLINE AppendSpilledRow(
+ Partition* partition, TupleRow* row) WARN_UNUSED_RESULT;
/// Reads all the rows from input_stream and process them by calling ProcessBatch().
- template<bool AGGREGATED_ROWS>
- Status ProcessStream(BufferedTupleStream* input_stream);
+ template <bool AGGREGATED_ROWS>
+ Status ProcessStream(BufferedTupleStreamV2* input_stream) WARN_UNUSED_RESULT;
/// Output 'singleton_output_tuple_' and transfer memory to 'row_batch'.
void GetSingletonOutput(RowBatch* row_batch);
/// Get rows for the next rowbatch from the next partition. Sets 'partition_eos_' to
/// true if all rows from all partitions have been returned or the limit is reached.
- Status GetRowsFromPartition(RuntimeState* state, RowBatch* row_batch);
+ Status GetRowsFromPartition(
+ RuntimeState* state, RowBatch* row_batch) WARN_UNUSED_RESULT;
/// Get output rows from child for streaming pre-aggregation. Aggregates some rows with
/// hash table and passes through other rows converted into the intermediate
/// tuple format. Sets 'child_eos_' once all rows from child have been returned.
- Status GetRowsStreaming(RuntimeState* state, RowBatch* row_batch);
+ Status GetRowsStreaming(RuntimeState* state, RowBatch* row_batch) WARN_UNUSED_RESULT;
/// Return true if we should keep expanding hash tables in the preagg. If false,
/// the preagg should pass through any rows it can't fit in its tables.
@@ -582,7 +605,7 @@ class PartitionedAggregationNode : public ExecNode {
/// 'ht_ctx' is passed in as a way to avoid aliasing of 'this' confusing the optimiser.
Status ProcessBatchStreaming(bool needs_serialize, TPrefetchMode::type prefetch_mode,
RowBatch* in_batch, RowBatch* out_batch, HashTableCtx* ht_ctx,
- int remaining_capacity[PARTITION_FANOUT]);
+ int remaining_capacity[PARTITION_FANOUT]) WARN_UNUSED_RESULT;
/// Tries to add intermediate to the hash table 'hash_tbl' of 'partition' for streaming
/// aggregation. The input row must have been evaluated with 'ht_ctx', with 'hash' set
@@ -592,18 +615,24 @@ class PartitionedAggregationNode : public ExecNode {
/// keeps track of how many more entries can be added to the hash table so we can avoid
/// retrying inserts. It is decremented if an insert succeeds and set to zero if an
/// insert fails. If an error occurs, returns false and sets 'status'.
- bool IR_ALWAYS_INLINE TryAddToHashTable(HashTableCtx* ht_ctx,
- Partition* partition, HashTable* hash_tbl, TupleRow* in_row, uint32_t hash,
- int* remaining_capacity, Status* status);
+ bool IR_ALWAYS_INLINE TryAddToHashTable(HashTableCtx* ht_ctx, Partition* partition,
+ HashTable* hash_tbl, TupleRow* in_row, uint32_t hash, int* remaining_capacity,
+ Status* status) WARN_UNUSED_RESULT;
/// Initializes hash_partitions_. 'level' is the level for the partitions to create.
+ /// If 'single_partition_idx' is provided, it must be a number in range
+ /// [0, PARTITION_FANOUT), and only that partition is created - the others are
+ /// initialized to NULL.
/// Also sets ht_ctx_'s level to 'level'.
- Status CreateHashPartitions(int level);
+ Status CreateHashPartitions(
+ int level, int single_partition_idx = -1) WARN_UNUSED_RESULT;
/// Ensure that hash tables for all in-memory partitions are large enough to fit
/// 'num_rows' additional hash table entries. If there is not enough memory to
- /// resize the hash tables, may spill partitions.
- Status CheckAndResizeHashPartitions(int num_rows, const HashTableCtx* ht_ctx);
+ /// resize the hash tables, may spill partitions. 'aggregated_rows' is true if
+ /// we're currently partitioning aggregated rows.
+ Status CheckAndResizeHashPartitions(
+ bool aggregated_rows, int num_rows, const HashTableCtx* ht_ctx) WARN_UNUSED_RESULT;
/// Iterates over all the partitions in hash_partitions_ and returns the number of rows
/// of the largest spilled partition (in terms of number of aggregated and unaggregated
@@ -614,16 +643,39 @@ class PartitionedAggregationNode : public ExecNode {
/// initializes output_iterator_ and output_partition_. This either removes
/// a partition from aggregated_partitions_ (and is done) or removes the next
/// partition from aggregated_partitions_ and repartitions it.
- Status NextPartition();
-
- /// Picks a partition from hash_partitions_ to spill.
- Status SpillPartition();
+ Status NextPartition() WARN_UNUSED_RESULT;
+
+ /// Tries to build the first partition in 'spilled_partitions_'.
+ /// If successful, set *built_partition to the partition. The caller owns the partition
+ /// and is responsible for closing it. If unsuccessful because the partition could not
+ /// fit in memory, set *built_partition to NULL and append the spilled partition to the
+ /// head of 'spilled_partitions_' so it can be processed by
+ /// RepartitionSpilledPartition().
+ Status BuildSpilledPartition(Partition** built_partition) WARN_UNUSED_RESULT;
+
+ /// Repartitions the first partition in 'spilled_partitions_' into PARTITION_FANOUT
+ /// output partitions. On success, each output partition is either:
+ /// * closed, if no rows were added to the partition.
+ /// * in 'spilled_partitions_', if the partition spilled.
+ /// * in 'aggregated_partitions_', if the output partition was not spilled.
+ Status RepartitionSpilledPartition() WARN_UNUSED_RESULT;
+
+ /// Picks a partition from 'hash_partitions_' to spill. 'more_aggregate_rows' is passed
+ /// to Partition::Spill() when spilling the partition. See the Partition::Spill()
+ /// comment for further explanation.
+ Status SpillPartition(bool more_aggregate_rows) WARN_UNUSED_RESULT;
/// Moves the partitions in hash_partitions_ to aggregated_partitions_ or
/// spilled_partitions_. Partitions moved to spilled_partitions_ are unpinned.
/// input_rows is the number of input rows that have been repartitioned.
/// Used for diagnostics.
- Status MoveHashPartitions(int64_t input_rows);
+ Status MoveHashPartitions(int64_t input_rows) WARN_UNUSED_RESULT;
+
+ /// Adds a partition to the front of 'spilled_partitions_' for later processing.
+ /// 'spilled_partitions_' uses LIFO so more finely partitioned partitions are processed
+  /// first. This allows us to delete pages earlier and bottom out the recursion
+ /// earlier and also improves time locality of access to spilled data on disk.
+ void PushSpilledPartition(Partition* partition);
/// Calls Close() on every Partition in 'aggregated_partitions_',
/// 'spilled_partitions_', and 'hash_partitions_' and then resets the lists,
@@ -638,7 +690,7 @@ class PartitionedAggregationNode : public ExecNode {
/// and returns the IR function in 'fn'. Returns non-OK status if codegen
/// is unsuccessful.
Status CodegenUpdateSlot(LlvmCodeGen* codegen, int agg_fn_idx,
- SlotDescriptor* slot_desc, llvm::Function** fn);
+ SlotDescriptor* slot_desc, llvm::Function** fn) WARN_UNUSED_RESULT;
/// Codegen a call to a function implementing the UDA interface with input values
/// from 'input_vals'. 'dst_val' should contain the previous value of the aggregate
@@ -647,10 +699,10 @@ class PartitionedAggregationNode : public ExecNode {
/// the insert position of 'builder'.
Status CodegenCallUda(LlvmCodeGen* codegen, LlvmBuilder* builder, AggFn* agg_fn,
llvm::Value* agg_fn_ctx_arg, const std::vector<CodegenAnyVal>& input_vals,
- const CodegenAnyVal& dst_val, CodegenAnyVal* updated_dst_val);
+ const CodegenAnyVal& dst_val, CodegenAnyVal* updated_dst_val) WARN_UNUSED_RESULT;
/// Codegen UpdateTuple(). Returns non-OK status if codegen is unsuccessful.
- Status CodegenUpdateTuple(LlvmCodeGen* codegen, llvm::Function** fn);
+ Status CodegenUpdateTuple(LlvmCodeGen* codegen, llvm::Function** fn) WARN_UNUSED_RESULT;
/// Codegen the non-streaming process row batch loop. The loop has already been
/// compiled to IR and loaded into the codegen object. UpdateAggTuple has also been
@@ -659,26 +711,28 @@ class PartitionedAggregationNode : public ExecNode {
/// 'process_batch_no_grouping_fn_' will be updated with the codegened function
/// depending on whether this is a grouping or non-grouping aggregation.
/// Assumes AGGREGATED_ROWS = false.
- Status CodegenProcessBatch(LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode);
+ Status CodegenProcessBatch(
+ LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode) WARN_UNUSED_RESULT;
/// Codegen the materialization loop for streaming preaggregations.
/// 'process_batch_streaming_fn_' will be updated with the codegened function.
Status CodegenProcessBatchStreaming(
- LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode);
+ LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode) WARN_UNUSED_RESULT;
- /// We need two buffers per partition, one for the aggregated stream and one
- /// for the unaggregated stream. We need an additional buffer to read the stream
- /// we are currently repartitioning.
+ /// Compute minimum buffer requirement for grouping aggregations.
+ /// We need one buffer per partition, which is used either as the write buffer for the
+ /// aggregated stream or the unaggregated stream. We need an additional buffer to read
+ /// the stream we are currently repartitioning.
/// If we need to serialize, we need an additional buffer while spilling a partition
  /// as the partition's aggregate stream needs to be serialized and rewritten.
/// We do not spill streaming preaggregations, so we do not need to reserve any buffers.
int MinRequiredBuffers() const {
- // Must be kept in sync with AggregationNode.computeResourceProfile() in fe.
- if (is_streaming_preagg_) return 0;
- return 2 * PARTITION_FANOUT + 1 + (needs_serialize_ ? 1 : 0);
+ DCHECK(!grouping_exprs_.empty());
+ // Must be kept in sync with AggregationNode.computeNodeResourceProfile() in fe.
+ if (is_streaming_preagg_) return 0; // Need 0 buffers to pass through rows.
+ return PARTITION_FANOUT + 1 + (needs_serialize_ ? 1 : 0);
}
};
-
}
#endif
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-builder-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder-ir.cc b/be/src/exec/partitioned-hash-join-builder-ir.cc
index e5f649e..df58036 100644
--- a/be/src/exec/partitioned-hash-join-builder-ir.cc
+++ b/be/src/exec/partitioned-hash-join-builder-ir.cc
@@ -19,7 +19,7 @@
#include "codegen/impala-ir.h"
#include "exec/hash-table.inline.h"
-#include "runtime/buffered-tuple-stream.inline.h"
+#include "runtime/buffered-tuple-stream-v2.inline.h"
#include "runtime/raw-value.inline.h"
#include "runtime/row-batch.h"
#include "runtime/runtime-filter.h"
@@ -30,7 +30,7 @@
using namespace impala;
inline bool PhjBuilder::AppendRow(
- BufferedTupleStream* stream, TupleRow* row, Status* status) {
+ BufferedTupleStreamV2* stream, TupleRow* row, Status* status) {
if (LIKELY(stream->AddRow(row, status))) return true;
if (UNLIKELY(!status->ok())) return false;
return AppendRowStreamFull(stream, row, status);
@@ -73,12 +73,12 @@ Status PhjBuilder::ProcessBuildBatch(
bool PhjBuilder::Partition::InsertBatch(TPrefetchMode::type prefetch_mode,
HashTableCtx* ht_ctx, RowBatch* batch,
- const vector<BufferedTupleStream::RowIdx>& indices) {
+ const vector<BufferedTupleStreamV2::FlatRowPtr>& flat_rows, Status* status) {
// Compute the hash values and prefetch the hash table buckets.
const int num_rows = batch->num_rows();
HashTableCtx::ExprValuesCache* expr_vals_cache = ht_ctx->expr_values_cache();
const int prefetch_size = expr_vals_cache->capacity();
- const BufferedTupleStream::RowIdx* row_indices = indices.data();
+ const BufferedTupleStreamV2::FlatRowPtr* flat_rows_data = flat_rows.data();
for (int prefetch_group_row = 0; prefetch_group_row < num_rows;
prefetch_group_row += prefetch_size) {
int cur_row = prefetch_group_row;
@@ -97,9 +97,9 @@ bool PhjBuilder::Partition::InsertBatch(TPrefetchMode::type prefetch_mode,
expr_vals_cache->ResetForRead();
FOREACH_ROW_LIMIT(batch, cur_row, prefetch_size, batch_iter) {
TupleRow* row = batch_iter.Get();
- BufferedTupleStream::RowIdx row_idx = row_indices[cur_row];
+ BufferedTupleStreamV2::FlatRowPtr flat_row = flat_rows_data[cur_row];
if (!expr_vals_cache->IsRowNull()
- && UNLIKELY(!hash_tbl_->Insert(ht_ctx, row_idx, row))) {
+ && UNLIKELY(!hash_tbl_->Insert(ht_ctx, flat_row, row, status))) {
return false;
}
expr_vals_cache->NextRow();
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/exec/partitioned-hash-join-builder.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-builder.cc b/be/src/exec/partitioned-hash-join-builder.cc
index 4a5885b..a2f7c96 100644
--- a/be/src/exec/partitioned-hash-join-builder.cc
+++ b/be/src/exec/partitioned-hash-join-builder.cc
@@ -25,8 +25,10 @@
#include "exec/hash-table.inline.h"
#include "exprs/scalar-expr.h"
#include "exprs/scalar-expr-evaluator.h"
-#include "runtime/buffered-tuple-stream.h"
+#include "runtime/buffered-tuple-stream-v2.h"
+#include "runtime/exec-env.h"
#include "runtime/mem-tracker.h"
+#include "runtime/query-state.h"
#include "runtime/row-batch.h"
#include "runtime/runtime-filter-bank.h"
#include "runtime/runtime-filter.h"
@@ -44,19 +46,23 @@ static const string PREPARE_FOR_READ_FAILED_ERROR_MSG =
"the memory limit may help this query to complete successfully.";
using namespace impala;
-using namespace llvm;
-using namespace strings;
-using std::unique_ptr;
+using llvm::ConstantInt;
+using llvm::Function;
+using llvm::Type;
+using llvm::Value;
+using strings::Substitute;
PhjBuilder::PhjBuilder(int join_node_id, TJoinOp::type join_op,
const RowDescriptor* probe_row_desc, const RowDescriptor* build_row_desc,
- RuntimeState* state)
+ RuntimeState* state, BufferPool::ClientHandle* buffer_pool_client,
+ int64_t spillable_buffer_size)
: DataSink(build_row_desc),
runtime_state_(state),
join_node_id_(join_node_id),
join_op_(join_op),
probe_row_desc_(probe_row_desc),
- block_mgr_client_(NULL),
+ buffer_pool_client_(buffer_pool_client),
+ spillable_buffer_size_(spillable_buffer_size),
non_empty_build_(false),
partitions_created_(NULL),
largest_partition_percent_(NULL),
@@ -137,9 +143,6 @@ Status PhjBuilder::Prepare(RuntimeState* state, MemTracker* parent_mem_tracker)
RETURN_IF_ERROR(ScalarExprEvaluator::Create(*filter_exprs_[i], state, &pool_,
expr_mem_pool(), &filter_ctxs_[i].expr_eval));
}
- RETURN_IF_ERROR(state->block_mgr()->RegisterClient(
- Substitute("PartitionedHashJoin id=$0 builder=$1", join_node_id_, this),
- MinRequiredBuffers(), true, mem_tracker_.get(), state, &block_mgr_client_));
partitions_created_ = ADD_COUNTER(profile(), "PartitionsCreated", TUnit::UNIT);
largest_partition_percent_ =
@@ -169,6 +172,11 @@ Status PhjBuilder::Open(RuntimeState* state) {
for (const FilterContext& ctx : filter_ctxs_) {
RETURN_IF_ERROR(ctx.expr_eval->Open(state));
}
+ if (ht_allocator_ == nullptr) {
+ // Create 'ht_allocator_' on the first call to Open().
+ ht_allocator_.reset(new Suballocator(
+ state->exec_env()->buffer_pool(), buffer_pool_client_, spillable_buffer_size_));
+ }
RETURN_IF_ERROR(CreateHashPartitions(0));
AllocateRuntimeFilters();
@@ -248,7 +256,6 @@ void PhjBuilder::Close(RuntimeState* state) {
if (ctx.expr_eval != nullptr) ctx.expr_eval->Close(state);
}
ScalarExpr::Close(filter_exprs_);
- if (block_mgr_client_ != NULL) state->block_mgr()->ClearReservations(block_mgr_client_);
ScalarExpr::Close(build_exprs_);
pool_.Clear();
DataSink::Close(state);
@@ -264,13 +271,11 @@ void PhjBuilder::Reset() {
Status PhjBuilder::CreateAndPreparePartition(int level, Partition** partition) {
all_partitions_.emplace_back(new Partition(runtime_state_, this, level));
*partition = all_partitions_.back().get();
- RETURN_IF_ERROR((*partition)->build_rows()->Init(join_node_id_, profile(), true));
+ RETURN_IF_ERROR((*partition)->build_rows()->Init(join_node_id_, true));
bool got_buffer;
RETURN_IF_ERROR((*partition)->build_rows()->PrepareForWrite(&got_buffer));
- if (!got_buffer) {
- return runtime_state_->block_mgr()->MemLimitTooLowError(
- block_mgr_client_, join_node_id_);
- }
+ DCHECK(got_buffer)
+ << "Accounted in min reservation" << buffer_pool_client_->DebugString();
return Status::OK();
}
@@ -288,22 +293,11 @@ Status PhjBuilder::CreateHashPartitions(int level) {
}
bool PhjBuilder::AppendRowStreamFull(
- BufferedTupleStream* stream, TupleRow* row, Status* status) noexcept {
+ BufferedTupleStreamV2* stream, TupleRow* row, Status* status) noexcept {
while (true) {
- // Check if the stream is still using small buffers and try to switch to IO-buffers.
- if (stream->using_small_buffers()) {
- bool got_buffer;
- *status = stream->SwitchToIoBuffers(&got_buffer);
- if (!status->ok()) return false;
-
- if (got_buffer) {
- if (LIKELY(stream->AddRow(row, status))) return true;
- if (!status->ok()) return false;
- }
- }
// We ran out of memory. Pick a partition to spill. If we ran out of unspilled
// partitions, SpillPartition() will return an error status.
- *status = SpillPartition(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT);
+ *status = SpillPartition(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT);
if (!status->ok()) return false;
if (stream->AddRow(row, status)) return true;
if (!status->ok()) return false;
@@ -313,7 +307,7 @@ bool PhjBuilder::AppendRowStreamFull(
}
// TODO: can we do better with a different spilling heuristic?
-Status PhjBuilder::SpillPartition(BufferedTupleStream::UnpinMode mode) {
+Status PhjBuilder::SpillPartition(BufferedTupleStreamV2::UnpinMode mode) {
DCHECK_EQ(hash_partitions_.size(), PARTITION_FANOUT);
int64_t max_freed_mem = 0;
int partition_idx = -1;
@@ -323,7 +317,7 @@ Status PhjBuilder::SpillPartition(BufferedTupleStream::UnpinMode mode) {
Partition* candidate = hash_partitions_[i];
if (candidate->IsClosed()) continue;
if (candidate->is_spilled()) continue;
- int64_t mem = candidate->build_rows()->bytes_in_mem(false);
+ int64_t mem = candidate->build_rows()->BytesPinned(false);
if (candidate->hash_tbl() != NULL) {
// The hash table should not have matches, since we have not probed it yet.
// Losing match info would lead to incorrect results (IMPALA-1488).
@@ -337,9 +331,9 @@ Status PhjBuilder::SpillPartition(BufferedTupleStream::UnpinMode mode) {
}
if (partition_idx == -1) {
- // Could not find a partition to spill. This means the mem limit was just too low.
- return runtime_state_->block_mgr()->MemLimitTooLowError(
- block_mgr_client_, join_node_id_);
+ return Status(Substitute("Internal error: could not find a partition to spill in "
+        "hash join $0: \n$1\nClient:\n$2",
+ join_node_id_, DebugString(), buffer_pool_client_->DebugString()));
}
VLOG(2) << "Spilling partition: " << partition_idx << endl << DebugString();
@@ -373,8 +367,7 @@ Status PhjBuilder::BuildHashTablesAndPrepareProbeStreams() {
partition->Close(NULL);
} else if (partition->is_spilled()) {
// We don't need any build-side data for spilled partitions in memory.
- RETURN_IF_ERROR(
- partition->build_rows()->UnpinStream(BufferedTupleStream::UNPIN_ALL));
+ partition->build_rows()->UnpinStream(BufferedTupleStreamV2::UNPIN_ALL);
}
}
@@ -393,7 +386,7 @@ Status PhjBuilder::BuildHashTablesAndPrepareProbeStreams() {
RETURN_IF_ERROR(partition->BuildHashTable(&built));
// If we did not have enough memory to build this hash table, we need to spill this
// partition (clean up the hash table, unpin build).
- if (!built) RETURN_IF_ERROR(partition->Spill(BufferedTupleStream::UNPIN_ALL));
+ if (!built) RETURN_IF_ERROR(partition->Spill(BufferedTupleStreamV2::UNPIN_ALL));
}
// We may have spilled additional partitions while building hash tables, we need to
@@ -429,11 +422,11 @@ Status PhjBuilder::InitSpilledPartitionProbeStreams() {
while (probe_streams_to_create > 0) {
// Create stream in vector, so that it will be cleaned up after any failure.
- spilled_partition_probe_streams_.emplace_back(std::make_unique<BufferedTupleStream>(
- runtime_state_, probe_row_desc_, runtime_state_->block_mgr(), block_mgr_client_,
- false /* use_initial_small_buffers */, false /* read_write */));
- BufferedTupleStream* probe_stream = spilled_partition_probe_streams_.back().get();
- RETURN_IF_ERROR(probe_stream->Init(join_node_id_, profile(), false));
+ spilled_partition_probe_streams_.emplace_back(
+ make_unique<BufferedTupleStreamV2>(runtime_state_, probe_row_desc_,
+ buffer_pool_client_, spillable_buffer_size_, spillable_buffer_size_));
+ BufferedTupleStreamV2* probe_stream = spilled_partition_probe_streams_.back().get();
+ RETURN_IF_ERROR(probe_stream->Init(join_node_id_, false));
// Loop until either the stream gets a buffer or all partitions are spilled (in which
// case SpillPartition() returns an error).
@@ -442,7 +435,7 @@ Status PhjBuilder::InitSpilledPartitionProbeStreams() {
RETURN_IF_ERROR(probe_stream->PrepareForWrite(&got_buffer));
if (got_buffer) break;
- RETURN_IF_ERROR(SpillPartition(BufferedTupleStream::UNPIN_ALL));
+ RETURN_IF_ERROR(SpillPartition(BufferedTupleStreamV2::UNPIN_ALL));
++probe_streams_to_create;
}
--probe_streams_to_create;
@@ -450,7 +443,7 @@ Status PhjBuilder::InitSpilledPartitionProbeStreams() {
return Status::OK();
}
-vector<unique_ptr<BufferedTupleStream>> PhjBuilder::TransferProbeStreams() {
+vector<unique_ptr<BufferedTupleStreamV2>> PhjBuilder::TransferProbeStreams() {
return std::move(spilled_partition_probe_streams_);
}
@@ -460,7 +453,7 @@ void PhjBuilder::CloseAndDeletePartitions() {
all_partitions_.clear();
hash_partitions_.clear();
null_aware_partition_ = NULL;
- for (unique_ptr<BufferedTupleStream>& stream : spilled_partition_probe_streams_) {
+ for (unique_ptr<BufferedTupleStreamV2>& stream : spilled_partition_probe_streams_) {
stream->Close(NULL, RowBatch::FlushMode::NO_FLUSH_RESOURCES);
}
spilled_partition_probe_streams_.clear();
@@ -512,14 +505,14 @@ void PhjBuilder::PublishRuntimeFilters(int64_t num_build_rows) {
}
Status PhjBuilder::RepartitionBuildInput(
- Partition* input_partition, int level, BufferedTupleStream* input_probe_rows) {
+ Partition* input_partition, int level, BufferedTupleStreamV2* input_probe_rows) {
DCHECK_GE(level, 1);
SCOPED_TIMER(repartition_timer_);
COUNTER_ADD(num_repartitions_, 1);
RuntimeState* state = runtime_state_;
// Setup the read buffer and the new partitions.
- BufferedTupleStream* build_rows = input_partition->build_rows();
+ BufferedTupleStreamV2* build_rows = input_partition->build_rows();
DCHECK(build_rows != NULL);
bool got_read_buffer;
RETURN_IF_ERROR(build_rows->PrepareForRead(true, &got_read_buffer));
@@ -552,7 +545,7 @@ Status PhjBuilder::RepartitionBuildInput(
bool got_buffer;
RETURN_IF_ERROR(input_probe_rows->PrepareForRead(true, &got_buffer));
if (got_buffer) break;
- RETURN_IF_ERROR(SpillPartition(BufferedTupleStream::UNPIN_ALL_EXCEPT_CURRENT));
+ RETURN_IF_ERROR(SpillPartition(BufferedTupleStreamV2::UNPIN_ALL_EXCEPT_CURRENT));
}
RETURN_IF_ERROR(FlushFinal(state));
@@ -580,12 +573,9 @@ bool PhjBuilder::HashTableStoresNulls() const {
PhjBuilder::Partition::Partition(RuntimeState* state, PhjBuilder* parent, int level)
: parent_(parent), is_spilled_(false), level_(level) {
- // If we're repartitioning, we can assume the build input is fairly large and small
- // buffers will most likely just waste memory.
- bool use_initial_small_buffers = level == 0;
- build_rows_ =
- std::make_unique<BufferedTupleStream>(state, parent_->row_desc_, state->block_mgr(),
- parent_->block_mgr_client_, use_initial_small_buffers, false /* read_write */);
+ build_rows_ = make_unique<BufferedTupleStreamV2>(state, parent_->row_desc_,
+ parent_->buffer_pool_client_, parent->spillable_buffer_size_,
+ parent->spillable_buffer_size_);
}
PhjBuilder::Partition::~Partition() {
@@ -612,30 +602,15 @@ void PhjBuilder::Partition::Close(RowBatch* batch) {
}
}
-Status PhjBuilder::Partition::Spill(BufferedTupleStream::UnpinMode mode) {
+Status PhjBuilder::Partition::Spill(BufferedTupleStreamV2::UnpinMode mode) {
DCHECK(!IsClosed());
- // Close the hash table as soon as possible to release memory.
+ RETURN_IF_ERROR(parent_->runtime_state_->StartSpilling(parent_->mem_tracker()));
+ // Close the hash table and unpin the stream backing it to free memory.
if (hash_tbl() != NULL) {
hash_tbl_->Close();
hash_tbl_.reset();
}
-
- // Unpin the stream as soon as possible to increase the chances that the
- // SwitchToIoBuffers() call below will succeed.
- RETURN_IF_ERROR(build_rows_->UnpinStream(mode));
-
- if (build_rows_->using_small_buffers()) {
- bool got_buffer;
- RETURN_IF_ERROR(build_rows_->SwitchToIoBuffers(&got_buffer));
- if (!got_buffer) {
- // We'll try again to get the buffers when the stream fills up the small buffers.
- VLOG_QUERY << "Not enough memory to switch to IO-sized buffer for partition "
- << this << " of join=" << parent_->join_node_id_
- << " build small buffers=" << build_rows_->using_small_buffers();
- VLOG_FILE << GetStackTrace();
- }
- }
-
+ build_rows_->UnpinStream(mode);
if (!is_spilled_) {
COUNTER_ADD(parent_->num_spilled_partitions_, 1);
if (parent_->num_spilled_partitions_->value() == 1) {
@@ -652,14 +627,14 @@ Status PhjBuilder::Partition::BuildHashTable(bool* built) {
*built = false;
// Before building the hash table, we need to pin the rows in memory.
- RETURN_IF_ERROR(build_rows_->PinStream(false, built));
+ RETURN_IF_ERROR(build_rows_->PinStream(built));
if (!*built) return Status::OK();
RuntimeState* state = parent_->runtime_state_;
HashTableCtx* ctx = parent_->ht_ctx_.get();
ctx->set_level(level()); // Set the hash function for building the hash table.
RowBatch batch(parent_->row_desc_, state->batch_size(), parent_->mem_tracker());
- vector<BufferedTupleStream::RowIdx> indices;
+ vector<BufferedTupleStreamV2::FlatRowPtr> flat_rows;
bool eos = false;
// Allocate the partition-local hash table. Initialize the number of buckets based on
@@ -674,22 +649,22 @@ Status PhjBuilder::Partition::BuildHashTable(bool* built) {
//
// TODO: Try to allocate the hash table before pinning the stream to avoid needlessly
// reading all of the spilled rows from disk when we won't succeed anyway.
- int64_t estimated_num_buckets = build_rows()->RowConsumesMemory() ?
- HashTable::EstimateNumBuckets(build_rows()->num_rows()) :
- state->batch_size() * 2;
- hash_tbl_.reset(HashTable::Create(state, parent_->block_mgr_client_,
+ int64_t estimated_num_buckets = HashTable::EstimateNumBuckets(build_rows()->num_rows());
+ hash_tbl_.reset(HashTable::Create(parent_->ht_allocator_.get(),
true /* store_duplicates */, parent_->row_desc_->tuple_descriptors().size(),
build_rows(), 1 << (32 - NUM_PARTITIONING_BITS), estimated_num_buckets));
- if (!hash_tbl_->Init()) goto not_built;
+ bool success;
+ Status status = hash_tbl_->Init(&success);
+ if (!status.ok() || !success) goto not_built;
+ status = build_rows_->PrepareForRead(false, &success);
+ if (!status.ok()) goto not_built;
+ DCHECK(success) << "Stream was already pinned.";
- bool got_read_buffer;
- RETURN_IF_ERROR(build_rows_->PrepareForRead(false, &got_read_buffer));
- DCHECK(got_read_buffer) << "Stream was already pinned.";
do {
- RETURN_IF_ERROR(build_rows_->GetNext(&batch, &eos, &indices));
- DCHECK_EQ(batch.num_rows(), indices.size());
- DCHECK_LE(batch.num_rows(), hash_tbl_->EmptyBuckets())
- << build_rows()->RowConsumesMemory();
+ status = build_rows_->GetNext(&batch, &eos, &flat_rows);
+ if (!status.ok()) goto not_built;
+ DCHECK_EQ(batch.num_rows(), flat_rows.size());
+ DCHECK_LE(batch.num_rows(), hash_tbl_->EmptyBuckets());
TPrefetchMode::type prefetch_mode = state->query_options().prefetch_mode;
if (parent_->insert_batch_fn_ != NULL) {
InsertBatchFn insert_batch_fn;
@@ -699,11 +674,12 @@ Status PhjBuilder::Partition::BuildHashTable(bool* built) {
insert_batch_fn = parent_->insert_batch_fn_;
}
DCHECK(insert_batch_fn != NULL);
- if (UNLIKELY(!insert_batch_fn(this, prefetch_mode, ctx, &batch, indices))) {
+ if (UNLIKELY(
+ !insert_batch_fn(this, prefetch_mode, ctx, &batch, flat_rows, &status))) {
goto not_built;
}
- } else {
- if (UNLIKELY(!InsertBatch(prefetch_mode, ctx, &batch, indices))) goto not_built;
+ } else if (UNLIKELY(!InsertBatch(prefetch_mode, ctx, &batch, flat_rows, &status))) {
+ goto not_built;
}
RETURN_IF_CANCELLED(state);
RETURN_IF_ERROR(state->GetQueryStatus());
@@ -725,7 +701,7 @@ not_built:
hash_tbl_->Close();
hash_tbl_.reset();
}
- return Status::OK();
+ return status;
}
void PhjBuilder::Codegen(LlvmCodeGen* codegen) {
@@ -774,7 +750,8 @@ string PhjBuilder::DebugString() const {
DCHECK(partition->build_rows() != NULL);
ss << endl
<< " Build Rows: " << partition->build_rows()->num_rows()
- << " (Blocks pinned: " << partition->build_rows()->blocks_pinned() << ")" << endl;
+ << " (Bytes pinned: " << partition->build_rows()->BytesPinned(false) << ")"
+ << endl;
if (partition->hash_tbl() != NULL) {
ss << " Hash Table Rows: " << partition->hash_tbl()->size() << endl;
}
[08/11] incubator-impala git commit: IMPALA-4674: Part 2: port
backend exec to BufferPool
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a98b90bd/be/src/runtime/buffered-block-mgr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/buffered-block-mgr-test.cc b/be/src/runtime/buffered-block-mgr-test.cc
deleted file mode 100644
index cb294c2..0000000
--- a/be/src/runtime/buffered-block-mgr-test.cc
+++ /dev/null
@@ -1,1547 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <boost/bind.hpp>
-#include <boost/date_time/posix_time/posix_time.hpp>
-#include <boost/filesystem.hpp>
-#include <boost/regex.hpp>
-#include <boost/scoped_ptr.hpp>
-#include <boost/thread/thread.hpp>
-#include <gutil/strings/substitute.h>
-#include <sys/stat.h>
-
-#include "codegen/llvm-codegen.h"
-#include "common/init.h"
-#include "common/object-pool.h"
-#include "runtime/buffered-block-mgr.h"
-#include "runtime/disk-io-mgr.h"
-#include "runtime/exec-env.h"
-#include "runtime/mem-tracker.h"
-#include "runtime/query-state.h"
-#include "runtime/runtime-state.h"
-#include "runtime/test-env.h"
-#include "runtime/tmp-file-mgr.h"
-#include "service/fe-support.h"
-#include "testutil/gtest-util.h"
-#include "util/cpu-info.h"
-#include "util/disk-info.h"
-#include "util/error-util.h"
-#include "util/filesystem-util.h"
-#include "util/promise.h"
-#include "util/test-info.h"
-#include "util/time.h"
-
-#include "gen-cpp/Types_types.h"
-#include "gen-cpp/ImpalaInternalService_types.h"
-
-#include "common/names.h"
-
-using boost::filesystem::directory_iterator;
-using boost::filesystem::remove;
-using boost::regex;
-
-// Note: This is the default scratch dir created by impala.
-// FLAGS_scratch_dirs + TmpFileMgr::TMP_SUB_DIR_NAME.
-const string SCRATCH_DIR = "/tmp/impala-scratch";
-
-// This suffix is appended to a tmp dir
-const string SCRATCH_SUFFIX = "/impala-scratch";
-
-// Number of millieconds to wait to ensure write completes. We don't know for sure how
-// slow the disk will be, so this is much higher than we expect the writes to take.
-const static int WRITE_WAIT_MILLIS = 10000;
-
-// How often to check for write completion
-const static int WRITE_CHECK_INTERVAL_MILLIS = 10;
-
-DECLARE_bool(disk_spill_encryption);
-
-namespace impala {
-
-class BufferedBlockMgrTest : public ::testing::Test {
- protected:
- const static int block_size_ = 1024;
-
- virtual void SetUp() {
- test_env_.reset(new TestEnv());
- ASSERT_OK(test_env_->Init());
- }
-
- virtual void TearDown() {
- TearDownMgrs();
- test_env_.reset();
-
- // Tests modify permissions, so make sure we can delete if they didn't clean up.
- for (int i = 0; i < created_tmp_dirs_.size(); ++i) {
- chmod((created_tmp_dirs_[i] + SCRATCH_SUFFIX).c_str(), S_IRWXU);
- }
- FileSystemUtil::RemovePaths(created_tmp_dirs_);
- created_tmp_dirs_.clear();
- pool_.Clear();
- }
-
- /// Reinitialize test_env_ to have multiple temporary directories.
- vector<string> InitMultipleTmpDirs(int num_dirs) {
- vector<string> tmp_dirs;
- for (int i = 0; i < num_dirs; ++i) {
- const string& dir = Substitute("/tmp/buffered-block-mgr-test.$0", i);
- // Fix permissions in case old directories were left from previous runs of test.
- chmod((dir + SCRATCH_SUFFIX).c_str(), S_IRWXU);
- EXPECT_OK(FileSystemUtil::RemoveAndCreateDirectory(dir));
- tmp_dirs.push_back(dir);
- created_tmp_dirs_.push_back(dir);
- }
- test_env_.reset(new TestEnv);
- test_env_->SetTmpFileMgrArgs(tmp_dirs, false);
- EXPECT_OK(test_env_->Init());
- EXPECT_EQ(num_dirs, test_env_->tmp_file_mgr()->NumActiveTmpDevices());
- return tmp_dirs;
- }
-
- static void ValidateBlock(BufferedBlockMgr::Block* block, int32_t data) {
- ASSERT_EQ(block->valid_data_len(), sizeof(int32_t));
- ASSERT_EQ(*reinterpret_cast<int32_t*>(block->buffer()), data);
- }
-
- static int32_t* MakeRandomSizeData(BufferedBlockMgr::Block* block) {
- // Format is int32_t size, followed by size bytes of data
- int32_t size = (rand() % 252) + 4; // So blocks have 4-256 bytes of data
- uint8_t* data = block->Allocate<uint8_t>(size);
- *(reinterpret_cast<int32_t*>(data)) = size;
- int i;
- for (i = 4; i < size-5; ++i) {
- data[i] = i;
- }
- for (; i < size; ++i) { // End marker of at least 5 0xff's
- data[i] = 0xff;
- }
- return reinterpret_cast<int32_t*>(data); // Really returns a pointer to size
- }
-
- static void ValidateRandomSizeData(BufferedBlockMgr::Block* block, int32_t size) {
- int32_t bsize = *(reinterpret_cast<int32_t*>(block->buffer()));
- uint8_t* data = reinterpret_cast<uint8_t*>(block->buffer());
- int i;
- ASSERT_EQ(block->valid_data_len(), size);
- ASSERT_EQ(size, bsize);
- for (i = 4; i < size - 5; ++i) {
- ASSERT_EQ(data[i], i);
- }
- for (; i < size; ++i) {
- ASSERT_EQ(data[i], 0xff);
- }
- }
-
- /// Helper to create a simple block manager.
- BufferedBlockMgr* CreateMgr(int64_t query_id, int max_buffers, int block_size,
- RuntimeState** query_state = NULL, TQueryOptions* query_options = NULL) {
- RuntimeState* state;
- EXPECT_OK(test_env_->CreateQueryStateWithBlockMgr(
- query_id, max_buffers, block_size, query_options, &state));
- if (query_state != NULL) *query_state = state;
- return state->block_mgr();
- }
-
- /// Create a new client tracker as a child of the RuntimeState's instance tracker.
- MemTracker* NewClientTracker(RuntimeState* state) {
- return pool_.Add(new MemTracker(-1, "client", state->instance_mem_tracker()));
- }
-
- BufferedBlockMgr* CreateMgrAndClient(int64_t query_id, int max_buffers, int block_size,
- int reserved_blocks, bool tolerates_oversubscription,
- BufferedBlockMgr::Client** client, RuntimeState** query_state = NULL,
- TQueryOptions* query_options = NULL) {
- RuntimeState* state;
- BufferedBlockMgr* mgr =
- CreateMgr(query_id, max_buffers, block_size, &state, query_options);
-
- MemTracker* client_tracker = NewClientTracker(state);
- EXPECT_OK(mgr->RegisterClient(Substitute("Client for query $0", query_id),
- reserved_blocks, tolerates_oversubscription, client_tracker, state, client));
- EXPECT_TRUE(client != NULL);
- if (query_state != NULL) *query_state = state;
- return mgr;
- }
-
- void CreateMgrsAndClients(int64_t start_query_id, int num_mgrs, int buffers_per_mgr,
- int block_size, int reserved_blocks_per_client, bool tolerates_oversubscription,
- vector<BufferedBlockMgr*>* mgrs, vector<BufferedBlockMgr::Client*>* clients) {
- for (int i = 0; i < num_mgrs; ++i) {
- BufferedBlockMgr::Client* client;
- BufferedBlockMgr* mgr = CreateMgrAndClient(start_query_id + i, buffers_per_mgr,
- block_size_, reserved_blocks_per_client, tolerates_oversubscription, &client);
- mgrs->push_back(mgr);
- clients->push_back(client);
- }
- }
-
- // Destroy all created query states and associated block managers.
- void TearDownMgrs() {
- // Tear down the query states, which DCHECKs that the memory consumption of
- // the query's trackers is zero.
- test_env_->TearDownQueries();
- }
-
- void AllocateBlocks(BufferedBlockMgr* block_mgr, BufferedBlockMgr::Client* client,
- int num_blocks, vector<BufferedBlockMgr::Block*>* blocks) {
- int32_t* data;
- Status status;
- BufferedBlockMgr::Block* new_block;
- for (int i = 0; i < num_blocks; ++i) {
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
- ASSERT_TRUE(new_block != NULL);
- data = new_block->Allocate<int32_t>(sizeof(int32_t));
- *data = blocks->size();
- blocks->push_back(new_block);
- }
- }
-
- // Pin all blocks, expecting they are pinned successfully.
- void PinBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
- for (int i = 0; i < blocks.size(); ++i) {
- bool pinned;
- ASSERT_OK(blocks[i]->Pin(&pinned));
- ASSERT_TRUE(pinned);
- }
- }
-
- // Pin all blocks. By default, expect no errors from Unpin() calls. If
- // expected_error_codes is non-NULL, returning one of the error codes is
- // also allowed.
- void UnpinBlocks(const vector<BufferedBlockMgr::Block*>& blocks,
- const vector<TErrorCode::type>* expected_error_codes = nullptr,
- int delay_between_unpins_ms = 0) {
- for (int i = 0; i < blocks.size(); ++i) {
- Status status = blocks[i]->Unpin();
- if (!status.ok() && expected_error_codes != nullptr) {
- // Check if it's one of the expected errors.
- bool is_expected_error = false;
- for (TErrorCode::type code : *expected_error_codes) {
- if (status.code() == code) {
- is_expected_error = true;
- break;
- }
- }
- ASSERT_TRUE(is_expected_error) << status.msg().msg();
- } else {
- ASSERT_TRUE(status.ok()) << status.msg().msg();
- }
- if (delay_between_unpins_ms > 0) SleepForMs(delay_between_unpins_ms);
- }
- }
-
- void DeleteBlocks(const vector<BufferedBlockMgr::Block*>& blocks) {
- for (int i = 0; i < blocks.size(); ++i) {
- blocks[i]->Delete();
- }
- }
-
- void DeleteBlocks(const vector<pair<BufferedBlockMgr::Block*, int32_t>>& blocks) {
- for (int i = 0; i < blocks.size(); ++i) {
- blocks[i].first->Delete();
- }
- }
-
- static void WaitForWrites(BufferedBlockMgr* block_mgr) {
- vector<BufferedBlockMgr*> block_mgrs;
- block_mgrs.push_back(block_mgr);
- WaitForWrites(block_mgrs);
- }
-
- // Wait for writes issued through block managers to complete.
- static void WaitForWrites(const vector<BufferedBlockMgr*>& block_mgrs) {
- int max_attempts = WRITE_WAIT_MILLIS / WRITE_CHECK_INTERVAL_MILLIS;
- for (int i = 0; i < max_attempts; ++i) {
- SleepForMs(WRITE_CHECK_INTERVAL_MILLIS);
- if (AllWritesComplete(block_mgrs)) return;
- }
- ASSERT_TRUE(false) << "Writes did not complete after " << WRITE_WAIT_MILLIS << "ms";
- }
-
- static bool AllWritesComplete(BufferedBlockMgr* block_mgr) {
- return block_mgr->GetNumWritesOutstanding() == 0;
- }
-
- static bool AllWritesComplete(const vector<BufferedBlockMgr*>& block_mgrs) {
- for (int i = 0; i < block_mgrs.size(); ++i) {
- if (!AllWritesComplete(block_mgrs[i])) return false;
- }
- return true;
- }
-
- // Remove permissions for the temporary file at 'path' - all subsequent writes
- // to the file should fail. Expects backing file has already been allocated.
- static void DisableBackingFile(const string& path) {
- EXPECT_GT(path.size(), 0);
- EXPECT_EQ(0, chmod(path.c_str(), 0));
- LOG(INFO) << "Injected fault by removing file permissions " << path;
- }
-
- // Check that the file backing the block has dir as a prefix of its path.
- static bool BlockInDir(BufferedBlockMgr::Block* block, const string& dir) {
- return block->TmpFilePath().find(dir) == 0;
- }
-
- // Find a block in the list that is backed by a file with the given directory as prefix
- // of its path.
- static BufferedBlockMgr::Block* FindBlockForDir(
- const vector<BufferedBlockMgr::Block*>& blocks, const string& dir) {
- for (int i = 0; i < blocks.size(); ++i) {
- if (BlockInDir(blocks[i], dir)) return blocks[i];
- }
- return NULL;
- }
-
- void TestGetNewBlockImpl(int block_size) {
- Status status;
- int max_num_blocks = 5;
- vector<BufferedBlockMgr::Block*> blocks;
- BufferedBlockMgr* block_mgr;
- BufferedBlockMgr::Client* client;
- block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, false, &client);
- ASSERT_EQ(test_env_->TotalQueryMemoryConsumption(), 0);
-
- // Allocate blocks until max_num_blocks, they should all succeed and memory
- // usage should go up.
- BufferedBlockMgr::Block* new_block;
- BufferedBlockMgr::Block* first_block = NULL;
- for (int i = 0; i < max_num_blocks; ++i) {
- status = block_mgr->GetNewBlock(client, NULL, &new_block);
- ASSERT_TRUE(new_block != NULL);
- ASSERT_EQ(block_mgr->bytes_allocated(), (i + 1) * block_size);
- if (first_block == NULL) first_block = new_block;
- blocks.push_back(new_block);
- }
-
- // Trying to allocate a new one should fail.
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
- ASSERT_TRUE(new_block == NULL);
- ASSERT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size);
-
- // We can allocate a new block by transferring an already allocated one.
- uint8_t* old_buffer = first_block->buffer();
- ASSERT_OK(block_mgr->GetNewBlock(client, first_block, &new_block));
- ASSERT_TRUE(new_block != NULL);
- ASSERT_EQ(old_buffer, new_block->buffer());
- ASSERT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size);
- ASSERT_TRUE(!first_block->is_pinned());
- blocks.push_back(new_block);
-
- // Trying to allocate a new one should still fail.
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
- ASSERT_TRUE(new_block == NULL);
- ASSERT_EQ(block_mgr->bytes_allocated(), max_num_blocks * block_size);
-
- ASSERT_EQ(block_mgr->writes_issued(), 1);
-
- DeleteBlocks(blocks);
- TearDownMgrs();
- }
-
- void TestEvictionImpl(int block_size) {
- ASSERT_GT(block_size, 0);
- int max_num_buffers = 5;
- BufferedBlockMgr* block_mgr;
- BufferedBlockMgr::Client* client;
- block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, false, &client);
-
- // Check counters.
- RuntimeProfile* profile = block_mgr->profile();
- RuntimeProfile::Counter* buffered_pin = profile->GetCounter("BufferedPins");
-
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
- ASSERT_EQ(block_mgr->bytes_allocated(), max_num_buffers * block_size);
- for (BufferedBlockMgr::Block* block : blocks) block->Unpin();
-
- // Re-pinning all blocks
- for (int i = 0; i < blocks.size(); ++i) {
- bool pinned;
- ASSERT_OK(blocks[i]->Pin(&pinned));
- ASSERT_TRUE(pinned);
- ValidateBlock(blocks[i], i);
- }
- int buffered_pins_expected = blocks.size();
- ASSERT_EQ(buffered_pin->value(), buffered_pins_expected);
-
- // Unpin all blocks
- for (BufferedBlockMgr::Block* block : blocks) block->Unpin();
- // Get two new blocks.
- AllocateBlocks(block_mgr, client, 2, &blocks);
- // At least two writes must be issued. The first (num_blocks - 2) must be in memory.
- ASSERT_GE(block_mgr->writes_issued(), 2);
- for (int i = 0; i < (max_num_buffers - 2); ++i) {
- bool pinned;
- ASSERT_OK(blocks[i]->Pin(&pinned));
- ASSERT_TRUE(pinned);
- ValidateBlock(blocks[i], i);
- }
- ASSERT_GE(buffered_pin->value(), buffered_pins_expected);
- DeleteBlocks(blocks);
- TearDownMgrs();
- }
-
- // Test that randomly issues GetFreeBlock(), Pin(), Unpin(), Delete() and Close()
- // calls. All calls made are legal - error conditions are not expected until the first
- // call to Close(). This is called 2 times with encryption+integrity on/off.
- // When executed in single-threaded mode 'tid' should be SINGLE_THREADED_TID.
- static const int SINGLE_THREADED_TID = -1;
- void TestRandomInternalImpl(RuntimeState* state, BufferedBlockMgr* block_mgr,
- int num_buffers, int tid) {
- ASSERT_TRUE(block_mgr != NULL);
- const int num_iterations = 10000;
- const int iters_before_close = num_iterations - 1000;
- bool close_called = false;
- unordered_map<BufferedBlockMgr::Block*, int> pinned_block_map;
- vector<pair<BufferedBlockMgr::Block*, int32_t>> pinned_blocks;
- unordered_map<BufferedBlockMgr::Block*, int> unpinned_block_map;
- vector<pair<BufferedBlockMgr::Block*, int32_t>> unpinned_blocks;
-
- typedef enum { Pin, New, Unpin, Delete, Close } ApiFunction;
- ApiFunction api_function;
-
- BufferedBlockMgr::Client* client;
- ASSERT_OK(
- block_mgr->RegisterClient("", 0, false, NewClientTracker(state), state, &client));
- ASSERT_TRUE(client != NULL);
-
- pinned_blocks.reserve(num_buffers);
- BufferedBlockMgr::Block* new_block;
- for (int i = 0; i < num_iterations; ++i) {
- if ((i % 20000) == 0) LOG (ERROR) << " Iteration " << i << endl;
- if (i > iters_before_close && (rand() % 5 == 0)) {
- api_function = Close;
- } else if (pinned_blocks.size() == 0 && unpinned_blocks.size() == 0) {
- api_function = New;
- } else if (pinned_blocks.size() == 0) {
- // Pin or New. Can't unpin or delete.
- api_function = static_cast<ApiFunction>(rand() % 2);
- } else if (pinned_blocks.size() >= num_buffers) {
- // Unpin or delete. Can't pin or get new.
- api_function = static_cast<ApiFunction>(2 + (rand() % 2));
- } else if (unpinned_blocks.size() == 0) {
- // Can't pin. Unpin, new or delete.
- api_function = static_cast<ApiFunction>(1 + (rand() % 3));
- } else {
- // Any api function.
- api_function = static_cast<ApiFunction>(rand() % 4);
- }
-
- pair<BufferedBlockMgr::Block*, int32_t> block_data;
- int rand_pick = 0;
- int32_t* data = NULL;
- bool pinned = false;
- Status status;
- switch (api_function) {
- case New:
- status = block_mgr->GetNewBlock(client, NULL, &new_block);
- if (close_called || (tid != SINGLE_THREADED_TID && status.IsCancelled())) {
- ASSERT_TRUE(new_block == NULL);
- ASSERT_TRUE(status.IsCancelled());
- continue;
- }
- ASSERT_OK(status);
- ASSERT_TRUE(new_block != NULL);
- data = MakeRandomSizeData(new_block);
- block_data = make_pair(new_block, *data);
-
- pinned_blocks.push_back(block_data);
- pinned_block_map.insert(make_pair(block_data.first, pinned_blocks.size() - 1));
- break;
- case Pin:
- rand_pick = rand() % unpinned_blocks.size();
- block_data = unpinned_blocks[rand_pick];
- status = block_data.first->Pin(&pinned);
- if (close_called || (tid != SINGLE_THREADED_TID && status.IsCancelled())) {
- ASSERT_TRUE(status.IsCancelled());
- // In single-threaded runs the block should not have been pinned.
- // In multi-threaded runs Pin() may return the block pinned but the status to
- // be cancelled. In this case we could move the block from unpinned_blocks
- // to pinned_blocks. We do not do that because after IsCancelled() no actual
- // block operations should take place.
- if (tid == SINGLE_THREADED_TID) ASSERT_FALSE(pinned);
- continue;
- }
- ASSERT_OK(status);
- ASSERT_TRUE(pinned);
- ValidateRandomSizeData(block_data.first, block_data.second);
- unpinned_blocks[rand_pick] = unpinned_blocks.back();
- unpinned_blocks.pop_back();
- unpinned_block_map[unpinned_blocks[rand_pick].first] = rand_pick;
-
- pinned_blocks.push_back(block_data);
- pinned_block_map.insert(make_pair(block_data.first, pinned_blocks.size() - 1));
- break;
- case Unpin:
- rand_pick = rand() % pinned_blocks.size();
- block_data = pinned_blocks[rand_pick];
- status = block_data.first->Unpin();
- if (close_called || (tid != SINGLE_THREADED_TID && status.IsCancelled())) {
- ASSERT_TRUE(status.IsCancelled());
- continue;
- }
- ASSERT_OK(status);
- pinned_blocks[rand_pick] = pinned_blocks.back();
- pinned_blocks.pop_back();
- pinned_block_map[pinned_blocks[rand_pick].first] = rand_pick;
-
- unpinned_blocks.push_back(block_data);
- unpinned_block_map.insert(make_pair(block_data.first,
- unpinned_blocks.size() - 1));
- break;
- case Delete:
- rand_pick = rand() % pinned_blocks.size();
- block_data = pinned_blocks[rand_pick];
- block_data.first->Delete();
- pinned_blocks[rand_pick] = pinned_blocks.back();
- pinned_blocks.pop_back();
- pinned_block_map[pinned_blocks[rand_pick].first] = rand_pick;
- break;
- case Close:
- block_mgr->Cancel();
- close_called = true;
- break;
- }
- }
-
- // The client needs to delete all its blocks.
- DeleteBlocks(pinned_blocks);
- DeleteBlocks(unpinned_blocks);
- }
-
- // Single-threaded execution of the TestRandomInternalImpl.
- void TestRandomInternalSingle(int block_size) {
- ASSERT_GT(block_size, 0);
- ASSERT_TRUE(test_env_.get() != NULL);
- const int max_num_buffers = 100;
- RuntimeState* state;
- BufferedBlockMgr* block_mgr = CreateMgr(0, max_num_buffers, block_size, &state);
- TestRandomInternalImpl(state, block_mgr, max_num_buffers, SINGLE_THREADED_TID);
- TearDownMgrs();
- }
-
- // Multi-threaded execution of the TestRandomInternalImpl.
- void TestRandomInternalMulti(int num_threads, int block_size) {
- ASSERT_GT(num_threads, 0);
- ASSERT_GT(block_size, 0);
- ASSERT_TRUE(test_env_.get() != NULL);
- const int max_num_buffers = 100;
- RuntimeState* state;
- BufferedBlockMgr* block_mgr = CreateMgr(0, num_threads * max_num_buffers, block_size,
- &state);
-
- thread_group workers;
- for (int i = 0; i < num_threads; ++i) {
- thread* t = new thread(bind(&BufferedBlockMgrTest::TestRandomInternalImpl, this,
- state, block_mgr, max_num_buffers, i));
- workers.add_thread(t);
- }
- workers.join_all();
- TearDownMgrs();
- }
-
- // Repeatedly call BufferedBlockMgr::Create() and BufferedBlockMgr::~BufferedBlockMgr().
- void CreateDestroyThread(RuntimeState* state) {
- const int num_buffers = 10;
- const int iters = 10000;
- for (int i = 0; i < iters; ++i) {
- shared_ptr<BufferedBlockMgr> mgr;
- Status status = BufferedBlockMgr::Create(state, state->query_mem_tracker(),
- state->runtime_profile(), test_env_->tmp_file_mgr(), block_size_ * num_buffers,
- block_size_, &mgr);
- }
- }
-
- // IMPALA-2286: Test for races between BufferedBlockMgr::Create() and
- // BufferedBlockMgr::~BufferedBlockMgr().
- void CreateDestroyMulti() {
- const int num_threads = 8;
- thread_group workers;
- // Create a shared RuntimeState with no BufferedBlockMgr.
- RuntimeState shared_state(TQueryCtx(), test_env_->exec_env());
-
- for (int i = 0; i < num_threads; ++i) {
- thread* t = new thread(
- bind(&BufferedBlockMgrTest::CreateDestroyThread, this, &shared_state));
- workers.add_thread(t);
- }
- workers.join_all();
- shared_state.ReleaseResources();
- }
-
- // Test that in-flight IO operations are correctly handled on tear down.
- // write: if true, tear down while write operations are in flight, otherwise tear down
- // during read operations.
- void TestDestructDuringIO(bool write);
-
- /// Test for IMPALA-2252: race when tearing down runtime state and block mgr after query
- /// cancellation. Simulates query cancellation while writes are in flight. Forces the
- /// block mgr to have a longer lifetime than the runtime state. If write_error is true,
- /// force writes to hit errors. If wait_for_writes is true, wait for writes to complete
- /// before destroying block mgr.
- void TestRuntimeStateTeardown(bool write_error, bool wait_for_writes);
-
- void TestWriteError(int write_delay_ms);
-
- scoped_ptr<TestEnv> test_env_;
- ObjectPool pool_;
- vector<string> created_tmp_dirs_;
-};
-
-TEST_F(BufferedBlockMgrTest, GetNewBlock) {
- TestGetNewBlockImpl(1024);
- TestGetNewBlockImpl(8 * 1024);
- TestGetNewBlockImpl(8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, GetNewBlockSmallBlocks) {
- const int block_size = 1024;
- int max_num_blocks = 3;
- BufferedBlockMgr* block_mgr;
- BufferedBlockMgr::Client* client;
- block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, false, &client);
- MemTracker* client_tracker = block_mgr->get_tracker(client);
- ASSERT_EQ(0, test_env_->TotalQueryMemoryConsumption());
-
- vector<BufferedBlockMgr::Block*> blocks;
-
- // Allocate a small block.
- BufferedBlockMgr::Block* new_block = NULL;
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block, 128));
- ASSERT_TRUE(new_block != NULL);
- ASSERT_EQ(block_mgr->bytes_allocated(), 0);
- ASSERT_EQ(block_mgr->mem_tracker()->consumption(), 0);
- ASSERT_EQ(client_tracker->consumption(), 128);
- ASSERT_TRUE(new_block->is_pinned());
- ASSERT_EQ(new_block->BytesRemaining(), 128);
- ASSERT_TRUE(new_block->buffer() != NULL);
- blocks.push_back(new_block);
-
- // Allocate a normal block
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
- ASSERT_TRUE(new_block != NULL);
- ASSERT_EQ(block_mgr->bytes_allocated(), block_mgr->max_block_size());
- ASSERT_EQ(block_mgr->mem_tracker()->consumption(), block_mgr->max_block_size());
- ASSERT_EQ(client_tracker->consumption(), 128 + block_mgr->max_block_size());
- ASSERT_TRUE(new_block->is_pinned());
- ASSERT_EQ(new_block->BytesRemaining(), block_mgr->max_block_size());
- ASSERT_TRUE(new_block->buffer() != NULL);
- blocks.push_back(new_block);
-
- // Allocate another small block.
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block, 512));
- ASSERT_TRUE(new_block != NULL);
- ASSERT_EQ(block_mgr->bytes_allocated(), block_mgr->max_block_size());
- ASSERT_EQ(block_mgr->mem_tracker()->consumption(), block_mgr->max_block_size());
- ASSERT_EQ(client_tracker->consumption(), 128 + 512 + block_mgr->max_block_size());
- ASSERT_TRUE(new_block->is_pinned());
- ASSERT_EQ(new_block->BytesRemaining(), 512);
- ASSERT_TRUE(new_block->buffer() != NULL);
- blocks.push_back(new_block);
-
- // Should be able to unpin and pin the middle block
- ASSERT_OK(blocks[1]->Unpin());
-
- bool pinned;
- ASSERT_OK(blocks[1]->Pin(&pinned));
- ASSERT_TRUE(pinned);
-
- DeleteBlocks(blocks);
- TearDownMgrs();
-}
-
-// Test that pinning more blocks than the max available buffers.
-TEST_F(BufferedBlockMgrTest, Pin) {
- int max_num_blocks = 5;
- const int block_size = 1024;
- BufferedBlockMgr* block_mgr;
- BufferedBlockMgr::Client* client;
- block_mgr = CreateMgrAndClient(0, max_num_blocks, block_size, 0, false, &client);
-
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_blocks, &blocks);
-
- // Unpin them all.
- for (int i = 0; i < blocks.size(); ++i) {
- ASSERT_OK(blocks[i]->Unpin());
- }
-
- // Allocate more, this should work since we just unpinned some blocks.
- AllocateBlocks(block_mgr, client, max_num_blocks, &blocks);
-
- // Try to pin a unpinned block, this should not be possible.
- bool pinned;
- ASSERT_OK(blocks[0]->Pin(&pinned));
- ASSERT_FALSE(pinned);
-
- // Unpin all blocks.
- for (int i = 0; i < blocks.size(); ++i) {
- ASSERT_OK(blocks[i]->Unpin());
- }
-
- // Should be able to pin max_num_blocks blocks.
- for (int i = 0; i < max_num_blocks; ++i) {
- ASSERT_OK(blocks[i]->Pin(&pinned));
- ASSERT_TRUE(pinned);
- }
-
- // Can't pin any more though.
- ASSERT_OK(blocks[max_num_blocks]->Pin(&pinned));
- ASSERT_FALSE(pinned);
-
- DeleteBlocks(blocks);
- TearDownMgrs();
-}
-
-// Test the eviction policy of the block mgr. No writes issued until more than
-// the max available buffers are allocated. Writes must be issued in LIFO order.
-TEST_F(BufferedBlockMgrTest, Eviction) {
- TestEvictionImpl(1024);
- TestEvictionImpl(8 * 1024 * 1024);
-}
-
-// Test deletion and reuse of blocks.
-TEST_F(BufferedBlockMgrTest, Deletion) {
- int max_num_buffers = 5;
- const int block_size = 1024;
- BufferedBlockMgr* block_mgr;
- BufferedBlockMgr::Client* client;
- block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, false, &client);
-
- // Check counters.
- RuntimeProfile* profile = block_mgr->profile();
- RuntimeProfile::Counter* recycled_cnt = profile->GetCounter("BlocksRecycled");
- RuntimeProfile::Counter* created_cnt = profile->GetCounter("BlocksCreated");
-
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
- ASSERT_EQ(created_cnt->value(), max_num_buffers);
-
- DeleteBlocks(blocks);
- blocks.clear();
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
- ASSERT_EQ(created_cnt->value(), max_num_buffers);
- ASSERT_EQ(recycled_cnt->value(), max_num_buffers);
-
- DeleteBlocks(blocks);
- TearDownMgrs();
-}
-
-// Delete blocks of various sizes and statuses to exercise the different code paths.
-// This relies on internal validation in block manager to detect many errors.
-TEST_F(BufferedBlockMgrTest, DeleteSingleBlocks) {
- int max_num_buffers = 16;
- BufferedBlockMgr::Client* client;
- BufferedBlockMgr* block_mgr =
- CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client);
- MemTracker* client_tracker = block_mgr->get_tracker(client);
-
- // Pinned I/O block.
- BufferedBlockMgr::Block* new_block;
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
- ASSERT_TRUE(new_block != NULL);
- ASSERT_TRUE(new_block->is_pinned());
- ASSERT_TRUE(new_block->is_max_size());
- new_block->Delete();
- ASSERT_EQ(0, client_tracker->consumption());
-
- // Pinned non-I/O block.
- int small_block_size = 128;
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block, small_block_size));
- ASSERT_TRUE(new_block != NULL);
- ASSERT_TRUE(new_block->is_pinned());
- ASSERT_EQ(small_block_size, client_tracker->consumption());
- new_block->Delete();
- ASSERT_EQ(0, client_tracker->consumption());
-
- // Unpinned I/O block - delete after written to disk.
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
- ASSERT_TRUE(new_block != NULL);
- ASSERT_TRUE(new_block->is_pinned());
- ASSERT_TRUE(new_block->is_max_size());
- new_block->Unpin();
- ASSERT_FALSE(new_block->is_pinned());
- WaitForWrites(block_mgr);
- new_block->Delete();
- ASSERT_EQ(client_tracker->consumption(), 0);
-
- // Unpinned I/O block - delete before written to disk.
- ASSERT_OK(block_mgr->GetNewBlock(client, NULL, &new_block));
- ASSERT_TRUE(new_block != NULL);
- ASSERT_TRUE(new_block->is_pinned());
- ASSERT_TRUE(new_block->is_max_size());
- new_block->Unpin();
- ASSERT_FALSE(new_block->is_pinned());
- new_block->Delete();
- WaitForWrites(block_mgr);
- ASSERT_EQ(client_tracker->consumption(), 0);
-
- TearDownMgrs();
-}
-
-// This exercises a code path where:
-// 1. A block A is unpinned.
-// 2. A block B is unpinned.
-// 3. A write for block A is initiated.
-// 4. Block A is pinned.
-// 5. Block B is pinned, with block A passed in to be deleted.
-// Block A's buffer will be transferred to block B.
-// 6. The write for block A completes.
-// Previously there was a bug (IMPALA-3936) where the buffer transfer happened before the
-// write completed. There were also various hangs related to missing condition variable
-// notifications.
-TEST_F(BufferedBlockMgrTest, TransferBufferDuringWrite) {
- const int trials = 5;
- const int max_num_buffers = 2;
- BufferedBlockMgr::Client* client;
- RuntimeState* query_state;
- BufferedBlockMgr* block_mgr = CreateMgrAndClient(
- 0, max_num_buffers, block_size_, 1, false, &client, &query_state);
-
- for (int trial = 0; trial < trials; ++trial) {
- for (int delay_ms = 0; delay_ms <= 10; delay_ms += 5) {
- // Force writes to be delayed to enlarge window of opportunity for bug.
- block_mgr->set_debug_write_delay_ms(delay_ms);
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, 2, &blocks);
-
- // Force the second block to be written and have its buffer freed.
- // We only have one buffer to share between the first and second blocks now.
- ASSERT_OK(blocks[1]->Unpin());
-
- // Create another client. Reserving different numbers of buffers can send it
- // down different code paths because the original client is entitled to different
- // number of buffers.
- int reserved_buffers = trial % max_num_buffers;
- BufferedBlockMgr::Client* tmp_client;
- ASSERT_OK(block_mgr->RegisterClient("tmp_client", reserved_buffers, false,
- NewClientTracker(query_state), query_state, &tmp_client));
- BufferedBlockMgr::Block* tmp_block;
- ASSERT_OK(block_mgr->GetNewBlock(tmp_client, NULL, &tmp_block));
-
- // Initiate the write, repin the block, then immediately try to swap the buffer to
- // the second block while the write is still in flight.
- ASSERT_OK(blocks[0]->Unpin());
- bool pinned;
- ASSERT_OK(blocks[0]->Pin(&pinned));
- ASSERT_TRUE(pinned);
- ASSERT_OK(blocks[1]->Pin(&pinned, blocks[0], false));
- ASSERT_TRUE(pinned);
-
- blocks[1]->Delete();
- tmp_block->Delete();
- block_mgr->ClearReservations(tmp_client);
- }
- }
-}
-
-// Test that all APIs return cancelled after close.
-TEST_F(BufferedBlockMgrTest, Close) {
- int max_num_buffers = 5;
- const int block_size = 1024;
- BufferedBlockMgr* block_mgr;
- BufferedBlockMgr::Client* client;
- block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, false, &client);
-
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
- block_mgr->Cancel();
-
- BufferedBlockMgr::Block* new_block;
- Status status = block_mgr->GetNewBlock(client, NULL, &new_block);
- ASSERT_TRUE(status.IsCancelled());
- ASSERT_TRUE(new_block == NULL);
- status = blocks[0]->Unpin();
- ASSERT_TRUE(status.IsCancelled());
- bool pinned;
- status = blocks[0]->Pin(&pinned);
- ASSERT_TRUE(status.IsCancelled());
-
- DeleteBlocks(blocks);
- TearDownMgrs();
-}
-
-TEST_F(BufferedBlockMgrTest, DestructDuringWrite) {
- const int trials = 20;
- const int max_num_buffers = 5;
-
- for (int trial = 0; trial < trials; ++trial) {
- BufferedBlockMgr::Client* client;
- BufferedBlockMgr* block_mgr =
- CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client);
-
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
- // Unpin will initiate writes.
- UnpinBlocks(blocks);
-
- // Writes should still be in flight when blocks are deleted.
- DeleteBlocks(blocks);
-
- // Destruct block manager while blocks are deleted and writes are in flight.
- TearDownMgrs();
- }
- // Destroying test environment will check that all writes have completed.
-}
-
-void BufferedBlockMgrTest::TestRuntimeStateTeardown(
- bool write_error, bool wait_for_writes) {
- const int max_num_buffers = 10;
- RuntimeState* state;
- BufferedBlockMgr::Client* client;
- CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client, &state);
-
- // Hold extra references to block mgr and query state so they outlive RuntimeState.
- shared_ptr<BufferedBlockMgr> block_mgr;
- QueryState::ScopedRef qs(state->query_id());
- Status status = BufferedBlockMgr::Create(state, state->query_mem_tracker(),
- state->runtime_profile(), test_env_->tmp_file_mgr(), 0, block_size_, &block_mgr);
- ASSERT_TRUE(status.ok());
- ASSERT_TRUE(block_mgr != NULL);
-
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr.get(), client, max_num_buffers, &blocks);
-
- if (write_error) {
- // Force flushing blocks to disk then remove temporary file to force writes to fail.
- UnpinBlocks(blocks);
- vector<BufferedBlockMgr::Block*> more_blocks;
- AllocateBlocks(block_mgr.get(), client, max_num_buffers, &more_blocks);
-
- const string& tmp_file_path = blocks[0]->TmpFilePath();
- DeleteBlocks(more_blocks);
- PinBlocks(blocks);
- DisableBackingFile(tmp_file_path);
- }
-
- // Unpin will initiate writes. If the write error propagates fast enough, some Unpin()
- // calls may see a cancelled block mgr.
- vector<TErrorCode::type> cancelled_code = {TErrorCode::CANCELLED};
- UnpinBlocks(blocks, write_error ? &cancelled_code : nullptr);
-
- // Tear down while writes are in flight. The block mgr may outlive the runtime state
- // because it may be referenced by other runtime states. This test simulates this
- // scenario by holding onto a reference to the block mgr. This should be safe so
- // long as blocks are properly deleted before the runtime state is torn down.
- DeleteBlocks(blocks);
- test_env_->TearDownQueries();
-
- // Optionally wait for writes to complete after cancellation.
- if (wait_for_writes) WaitForWrites(block_mgr.get());
- block_mgr.reset();
-
- ASSERT_EQ(test_env_->TotalQueryMemoryConsumption(), 0);
-}
-
-TEST_F(BufferedBlockMgrTest, RuntimeStateTeardown) {
- TestRuntimeStateTeardown(false, false);
-}
-
-TEST_F(BufferedBlockMgrTest, RuntimeStateTeardownWait) {
- TestRuntimeStateTeardown(false, true);
-}
-
-TEST_F(BufferedBlockMgrTest, RuntimeStateTeardownWriteError) {
- TestRuntimeStateTeardown(true, true);
-}
-
-// Regression test for IMPALA-2927 write complete with cancelled runtime state
-TEST_F(BufferedBlockMgrTest, WriteCompleteWithCancelledRuntimeState) {
- const int max_num_buffers = 10;
- RuntimeState* state;
- BufferedBlockMgr::Client* client;
- BufferedBlockMgr* block_mgr =
- CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client, &state);
-
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
-
- // Force flushing blocks to disk so that more writes are in flight.
- UnpinBlocks(blocks);
-
- // Cancel the runtime state and re-pin the blocks while writes are in flight to check
- // that WriteComplete() handles the case ok.
- state->set_is_cancelled();
- PinBlocks(blocks);
-
- WaitForWrites(block_mgr);
- DeleteBlocks(blocks);
-}
-
-// Remove write permissions on scratch files. Return # of scratch files.
-static int remove_scratch_perms() {
- int num_files = 0;
- directory_iterator dir_it(SCRATCH_DIR);
- for (; dir_it != directory_iterator(); ++dir_it) {
- ++num_files;
- chmod(dir_it->path().c_str(), 0);
- }
-
- return num_files;
-}
-
-// Test that the block manager behaves correctly after a write error. Delete the scratch
-// directory before an operation that would cause a write and test that subsequent API
-// calls return 'CANCELLED' correctly.
-void BufferedBlockMgrTest::TestWriteError(int write_delay_ms) {
- int max_num_buffers = 2;
- const int block_size = 1024;
- BufferedBlockMgr* block_mgr;
- BufferedBlockMgr::Client* client;
- block_mgr = CreateMgrAndClient(0, max_num_buffers, block_size, 0, false, &client);
- block_mgr->set_debug_write_delay_ms(write_delay_ms);
-
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
- // Unpin two blocks here, to ensure that backing storage is allocated in tmp file.
- UnpinBlocks(blocks);
- WaitForWrites(block_mgr);
- // Repin the blocks
- PinBlocks(blocks);
- // Remove the backing storage so that future writes will fail
- int num_files = remove_scratch_perms();
- ASSERT_GT(num_files, 0);
- vector<TErrorCode::type> expected_error_codes = {TErrorCode::CANCELLED,
- TErrorCode::SCRATCH_ALLOCATION_FAILED};
- // Give the first write a chance to fail before the second write starts.
- int interval_ms = 10;
- UnpinBlocks(blocks, &expected_error_codes, interval_ms);
- WaitForWrites(block_mgr);
- // Subsequent calls should fail.
- DeleteBlocks(blocks);
- BufferedBlockMgr::Block* new_block;
- ASSERT_TRUE(block_mgr->GetNewBlock(client, NULL, &new_block).IsCancelled());
- ASSERT_TRUE(new_block == NULL);
-
- TearDownMgrs();
-}
-
-TEST_F(BufferedBlockMgrTest, WriteError) {
- TestWriteError(0);
-}
-
-// Regression test for IMPALA-4842 - inject a delay in the write to
-// reproduce the issue.
-TEST_F(BufferedBlockMgrTest, WriteErrorWriteDelay) {
- TestWriteError(100);
-}
-
-// Test block manager error handling when temporary file space cannot be allocated to
-// back an unpinned buffer.
-TEST_F(BufferedBlockMgrTest, TmpFileAllocateError) {
- int max_num_buffers = 2;
- BufferedBlockMgr::Client* client;
- BufferedBlockMgr* block_mgr =
- CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client);
-
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
- // Unpin a block, forcing a write.
- ASSERT_OK(blocks[0]->Unpin());
- WaitForWrites(block_mgr);
- // Remove temporary files - subsequent operations will fail.
- int num_files = remove_scratch_perms();
- ASSERT_TRUE(num_files > 0);
- // Current implementation will not fail here until it attempts to write the file.
- // This behavior is not contractual but we want to know if it changes accidentally.
- ASSERT_OK(blocks[1]->Unpin());
-
- // Write failure should cancel query
- WaitForWrites(block_mgr);
- ASSERT_TRUE(block_mgr->IsCancelled());
-
- DeleteBlocks(blocks);
- TearDownMgrs();
-}
-
-// Test that the block manager is able to blacklist a temporary device correctly after a
-// write error. The query that encountered the write error should not allocate more
-// blocks on that device, but existing blocks on the device will remain in use and future
-// queries will use the device.
-TEST_F(BufferedBlockMgrTest, WriteErrorBlacklist) {
- // Set up two buffered block managers with two temporary dirs.
- vector<string> tmp_dirs = InitMultipleTmpDirs(2);
- // Simulate two concurrent queries.
- const int NUM_BLOCK_MGRS = 2;
- const int MAX_NUM_BLOCKS = 4;
- int blocks_per_mgr = MAX_NUM_BLOCKS / NUM_BLOCK_MGRS;
- vector<BufferedBlockMgr*> block_mgrs;
- vector<BufferedBlockMgr::Client*> clients;
- CreateMgrsAndClients(
- 0, NUM_BLOCK_MGRS, blocks_per_mgr, block_size_, 0, false, &block_mgrs, &clients);
-
- // Allocate files for all 2x2 combinations by unpinning blocks.
- vector<vector<BufferedBlockMgr::Block*>> blocks;
- vector<BufferedBlockMgr::Block*> all_blocks;
- for (int i = 0; i < NUM_BLOCK_MGRS; ++i) {
- vector<BufferedBlockMgr::Block*> mgr_blocks;
- AllocateBlocks(block_mgrs[i], clients[i], blocks_per_mgr, &mgr_blocks);
- UnpinBlocks(mgr_blocks);
- for (int j = 0; j < blocks_per_mgr; ++j) {
- LOG(INFO) << "Manager " << i << " Block " << j << " backed by file "
- << mgr_blocks[j]->TmpFilePath();
- }
- blocks.push_back(mgr_blocks);
- all_blocks.insert(all_blocks.end(), mgr_blocks.begin(), mgr_blocks.end());
- }
- WaitForWrites(block_mgrs);
- int error_mgr = 0;
- int no_error_mgr = 1;
- const string& error_dir = tmp_dirs[0];
- const string& good_dir = tmp_dirs[1];
- // Delete one file from first scratch dir for first block manager.
- BufferedBlockMgr::Block* error_block = FindBlockForDir(blocks[error_mgr], error_dir);
- ASSERT_TRUE(error_block != NULL) << "Expected a tmp file in dir " << error_dir;
- const string& error_file_path = error_block->TmpFilePath();
- PinBlocks(all_blocks);
- DisableBackingFile(error_file_path);
- UnpinBlocks(all_blocks); // Should succeed since writes occur asynchronously
- WaitForWrites(block_mgrs);
- // Both block managers have a usable tmp directory so should still be usable.
- ASSERT_FALSE(block_mgrs[error_mgr]->IsCancelled());
- ASSERT_FALSE(block_mgrs[no_error_mgr]->IsCancelled());
- // Temporary device with error should still be active.
- vector<TmpFileMgr::DeviceId> active_tmp_devices =
- test_env_->tmp_file_mgr()->ActiveTmpDevices();
- ASSERT_EQ(tmp_dirs.size(), active_tmp_devices.size());
- for (int i = 0; i < active_tmp_devices.size(); ++i) {
- const string& device_path =
- test_env_->tmp_file_mgr()->GetTmpDirPath(active_tmp_devices[i]);
- ASSERT_EQ(string::npos, error_dir.find(device_path));
- }
-
- // The error block manager should only allocate from the device that had no error.
- // The non-error block manager should continue using both devices, since it didn't
- // encounter a write error itself.
- vector<BufferedBlockMgr::Block*> error_new_blocks;
- AllocateBlocks(
- block_mgrs[error_mgr], clients[error_mgr], blocks_per_mgr, &error_new_blocks);
- UnpinBlocks(error_new_blocks);
- WaitForWrites(block_mgrs);
- EXPECT_TRUE(FindBlockForDir(error_new_blocks, good_dir) != NULL);
- EXPECT_TRUE(FindBlockForDir(error_new_blocks, error_dir) == NULL);
- for (int i = 0; i < error_new_blocks.size(); ++i) {
- LOG(INFO) << "Newly created block backed by file "
- << error_new_blocks[i]->TmpFilePath();
- EXPECT_TRUE(BlockInDir(error_new_blocks[i], good_dir));
- }
- DeleteBlocks(error_new_blocks);
-
- PinBlocks(blocks[no_error_mgr]);
- UnpinBlocks(blocks[no_error_mgr]);
- WaitForWrites(block_mgrs);
- EXPECT_TRUE(FindBlockForDir(blocks[no_error_mgr], good_dir) != NULL);
- EXPECT_TRUE(FindBlockForDir(blocks[no_error_mgr], error_dir) != NULL);
-
- // The second block manager should use the bad directory for new blocks since
- // blacklisting is per-manager, not global.
- vector<BufferedBlockMgr::Block*> no_error_new_blocks;
- AllocateBlocks(block_mgrs[no_error_mgr], clients[no_error_mgr], blocks_per_mgr,
- &no_error_new_blocks);
- UnpinBlocks(no_error_new_blocks);
- WaitForWrites(block_mgrs);
- EXPECT_TRUE(FindBlockForDir(no_error_new_blocks, good_dir) != NULL);
- EXPECT_TRUE(FindBlockForDir(no_error_new_blocks, error_dir) != NULL);
- DeleteBlocks(no_error_new_blocks);
-
- // A new block manager should use the both dirs for backing storage.
- BufferedBlockMgr::Client* new_client;
- BufferedBlockMgr* new_block_mgr =
- CreateMgrAndClient(9999, blocks_per_mgr, block_size_, 0, false, &new_client);
- vector<BufferedBlockMgr::Block*> new_mgr_blocks;
- AllocateBlocks(new_block_mgr, new_client, blocks_per_mgr, &new_mgr_blocks);
- UnpinBlocks(new_mgr_blocks);
- WaitForWrites(block_mgrs);
- EXPECT_TRUE(FindBlockForDir(new_mgr_blocks, good_dir) != NULL);
- EXPECT_TRUE(FindBlockForDir(new_mgr_blocks, error_dir) != NULL);
- DeleteBlocks(new_mgr_blocks);
-
- DeleteBlocks(all_blocks);
-}
-
-// Check that allocation error resulting from removal of directory results in blocks
-/// being allocated in other directories.
-TEST_F(BufferedBlockMgrTest, AllocationErrorHandling) {
- // Set up two buffered block managers with two temporary dirs.
- vector<string> tmp_dirs = InitMultipleTmpDirs(2);
- // Simulate two concurrent queries.
- int num_block_mgrs = 2;
- int max_num_blocks = 4;
- int blocks_per_mgr = max_num_blocks / num_block_mgrs;
- vector<RuntimeState*> runtime_states;
- vector<BufferedBlockMgr*> block_mgrs;
- vector<BufferedBlockMgr::Client*> clients;
- CreateMgrsAndClients(
- 0, num_block_mgrs, blocks_per_mgr, block_size_, 0, false, &block_mgrs, &clients);
-
- // Allocate files for all 2x2 combinations by unpinning blocks.
- vector<vector<BufferedBlockMgr::Block*>> blocks;
- for (int i = 0; i < num_block_mgrs; ++i) {
- vector<BufferedBlockMgr::Block*> mgr_blocks;
- LOG(INFO) << "Iter " << i;
- AllocateBlocks(block_mgrs[i], clients[i], blocks_per_mgr, &mgr_blocks);
- blocks.push_back(mgr_blocks);
- }
- const string& bad_dir = tmp_dirs[0];
- const string& bad_scratch_subdir = bad_dir + SCRATCH_SUFFIX;
- chmod(bad_scratch_subdir.c_str(), 0);
- // The block mgr should attempt to allocate space in bad dir for one block, which will
- // cause an error when it tries to create/expand the file. It should recover and just
- // use the good dir.
- UnpinBlocks(blocks[0]);
- // Directories remain on active list even when they experience errors.
- ASSERT_EQ(2, test_env_->tmp_file_mgr()->NumActiveTmpDevices());
- // Blocks should not be written to bad dir even if it remains non-writable.
- UnpinBlocks(blocks[1]);
- // All writes should succeed.
- WaitForWrites(block_mgrs);
- for (int i = 0; i < blocks.size(); ++i) {
- DeleteBlocks(blocks[i]);
- }
-}
-
-// Test that block manager fails cleanly when all directories are inaccessible at runtime.
-TEST_F(BufferedBlockMgrTest, NoDirsAllocationError) {
- vector<string> tmp_dirs = InitMultipleTmpDirs(2);
- int max_num_buffers = 2;
- RuntimeState* runtime_state;
- BufferedBlockMgr::Client* client;
- BufferedBlockMgr* block_mgr = CreateMgrAndClient(
- 0, max_num_buffers, block_size_, 0, false, &client, &runtime_state);
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
- for (int i = 0; i < tmp_dirs.size(); ++i) {
- const string& tmp_scratch_subdir = tmp_dirs[i] + SCRATCH_SUFFIX;
- chmod(tmp_scratch_subdir.c_str(), 0);
- }
- ErrorLogMap error_log;
- runtime_state->GetErrors(&error_log);
- ASSERT_TRUE(error_log.empty());
- // Unpin the blocks. Unpinning may fail if it hits a write error before this thread is
- // done unpinning.
- vector<TErrorCode::type> cancelled_code = {TErrorCode::CANCELLED};
- UnpinBlocks(blocks, &cancelled_code);
-
- LOG(INFO) << "Waiting for writes.";
- // Write failure should cancel query.
- WaitForWrites(block_mgr);
- LOG(INFO) << "writes done.";
- ASSERT_TRUE(block_mgr->IsCancelled());
- runtime_state->GetErrors(&error_log);
- ASSERT_FALSE(error_log.empty());
- stringstream error_string;
- PrintErrorMap(&error_string, error_log);
- LOG(INFO) << "Errors: " << error_string.str();
- // SCRATCH_ALLOCATION_FAILED error should exist in the error log.
- ErrorLogMap::const_iterator it = error_log.find(TErrorCode::SCRATCH_ALLOCATION_FAILED);
- ASSERT_NE(it, error_log.end());
- ASSERT_GT(it->second.count, 0);
- DeleteBlocks(blocks);
-}
-
-// Test that block manager can still allocate buffers when spilling is disabled.
-TEST_F(BufferedBlockMgrTest, NoTmpDirs) {
- InitMultipleTmpDirs(0);
- int max_num_buffers = 3;
- BufferedBlockMgr::Client* client;
- BufferedBlockMgr* block_mgr =
- CreateMgrAndClient(0, max_num_buffers, block_size_, 0, false, &client);
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
- DeleteBlocks(blocks);
-}
-
-// Test that block manager can still allocate buffers when spilling is disabled by
-// setting scratch_limit = 0.
-TEST_F(BufferedBlockMgrTest, ScratchLimitZero) {
- int max_num_buffers = 3;
- BufferedBlockMgr::Client* client;
- TQueryOptions query_options;
- query_options.scratch_limit = 0;
- BufferedBlockMgr* block_mgr = CreateMgrAndClient(
- 0, max_num_buffers, block_size_, 0, false, &client, NULL, &query_options);
- vector<BufferedBlockMgr::Block*> blocks;
- AllocateBlocks(block_mgr, client, max_num_buffers, &blocks);
- DeleteBlocks(blocks);
-}
-
-// Create two clients with different number of reserved buffers.
-TEST_F(BufferedBlockMgrTest, MultipleClients) {
- int client1_buffers = 3;
- int client2_buffers = 5;
- int max_num_buffers = client1_buffers + client2_buffers;
- const int block_size = 1024;
- RuntimeState* runtime_state;
- BufferedBlockMgr* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
- BufferedBlockMgr::Client* client1 = NULL;
- BufferedBlockMgr::Client* client2 = NULL;
- ASSERT_OK(block_mgr->RegisterClient("", client1_buffers, false,
- NewClientTracker(runtime_state), runtime_state, &client1));
- ASSERT_TRUE(client1 != NULL);
- ASSERT_OK(block_mgr->RegisterClient("", client2_buffers, false,
- NewClientTracker(runtime_state), runtime_state, &client2));
- ASSERT_TRUE(client2 != NULL);
-
- // Reserve client 1's and 2's buffers. They should succeed.
- bool reserved = block_mgr->TryAcquireTmpReservation(client1, 1);
- ASSERT_TRUE(reserved);
- reserved = block_mgr->TryAcquireTmpReservation(client2, 1);
- ASSERT_TRUE(reserved);
-
- vector<BufferedBlockMgr::Block*> client1_blocks;
- // Allocate all of client1's reserved blocks, they should all succeed.
- AllocateBlocks(block_mgr, client1, client1_buffers, &client1_blocks);
-
- // Try allocating one more, that should fail.
- BufferedBlockMgr::Block* block;
- ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
- ASSERT_TRUE(block == NULL);
-
- // Trying to reserve should also fail.
- reserved = block_mgr->TryAcquireTmpReservation(client1, 1);
- ASSERT_FALSE(reserved);
-
- // Allocate all of client2's reserved blocks, these should succeed.
- vector<BufferedBlockMgr::Block*> client2_blocks;
- AllocateBlocks(block_mgr, client2, client2_buffers, &client2_blocks);
-
- // Try allocating one more from client 2, that should fail.
- ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
- ASSERT_TRUE(block == NULL);
-
- // Unpin one block from client 1.
- ASSERT_OK(client1_blocks[0]->Unpin());
-
- // Client 2 should still not be able to allocate.
- ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
- ASSERT_TRUE(block == NULL);
-
- // Client 2 should still not be able to reserve.
- reserved = block_mgr->TryAcquireTmpReservation(client2, 1);
- ASSERT_FALSE(reserved);
-
- // Client 1 should be able to though.
- ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
- ASSERT_TRUE(block != NULL);
- client1_blocks.push_back(block);
-
- // Unpin two of client 1's blocks (client 1 should have 3 unpinned blocks now).
- ASSERT_OK(client1_blocks[1]->Unpin());
- ASSERT_OK(client1_blocks[2]->Unpin());
-
- // Clear client 1's reservation
- block_mgr->ClearReservations(client1);
-
- // Client 2 should be able to reserve 1 buffers now (there are 2 left);
- reserved = block_mgr->TryAcquireTmpReservation(client2, 1);
- ASSERT_TRUE(reserved);
-
- // Client one can only pin 1.
- bool pinned;
- ASSERT_OK(client1_blocks[0]->Pin(&pinned));
- ASSERT_TRUE(pinned);
- // Can't get this one.
- ASSERT_OK(client1_blocks[1]->Pin(&pinned));
- ASSERT_FALSE(pinned);
-
- // Client 2 can pick up the one reserved buffer
- ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
- ASSERT_TRUE(block != NULL);
- client2_blocks.push_back(block);
-
- // But not a second
- BufferedBlockMgr::Block* block2;
- ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block2));
- ASSERT_TRUE(block2 == NULL);
-
- // Unpin client 2's block it got from the reservation. Sine this is a tmp
- // reservation, client 1 can pick it up again (it is not longer reserved).
- ASSERT_OK(block->Unpin());
- ASSERT_OK(client1_blocks[1]->Pin(&pinned));
- ASSERT_TRUE(pinned);
-
- DeleteBlocks(client1_blocks);
- DeleteBlocks(client2_blocks);
- TearDownMgrs();
-}
-
-// Create two clients with different number of reserved buffers and some additional.
-TEST_F(BufferedBlockMgrTest, MultipleClientsExtraBuffers) {
- int client1_buffers = 1;
- int client2_buffers = 1;
- int max_num_buffers = client1_buffers + client2_buffers + 2;
- const int block_size = 1024;
- RuntimeState* runtime_state;
- BufferedBlockMgr* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
-
- BufferedBlockMgr::Client* client1 = NULL;
- BufferedBlockMgr::Client* client2 = NULL;
- BufferedBlockMgr::Block* block = NULL;
- ASSERT_OK(block_mgr->RegisterClient("", client1_buffers, false,
- NewClientTracker(runtime_state), runtime_state, &client1));
- ASSERT_TRUE(client1 != NULL);
- ASSERT_OK(block_mgr->RegisterClient("", client2_buffers, false,
- NewClientTracker(runtime_state), runtime_state, &client2));
- ASSERT_TRUE(client2 != NULL);
-
- vector<BufferedBlockMgr::Block*> client1_blocks;
- // Allocate all of client1's reserved blocks, they should all succeed.
- AllocateBlocks(block_mgr, client1, client1_buffers, &client1_blocks);
-
- // Allocate all of client2's reserved blocks, these should succeed.
- vector<BufferedBlockMgr::Block*> client2_blocks;
- AllocateBlocks(block_mgr, client2, client2_buffers, &client2_blocks);
-
- // We have two spare buffers now. Each client should be able to allocate it.
- ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
- ASSERT_TRUE(block != NULL);
- client1_blocks.push_back(block);
- ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
- ASSERT_TRUE(block != NULL);
- client2_blocks.push_back(block);
-
- // Now we are completely full, no one should be able to allocate a new block.
- ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
- ASSERT_TRUE(block == NULL);
- ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
- ASSERT_TRUE(block == NULL);
-
- DeleteBlocks(client1_blocks);
- DeleteBlocks(client2_blocks);
- TearDownMgrs();
-}
-
-// Create multiple clients causing oversubscription.
-TEST_F(BufferedBlockMgrTest, ClientOversubscription) {
- Status status;
- int client1_buffers = 1;
- int client2_buffers = 2;
- int client3_buffers = 2;
- int max_num_buffers = 2;
- const int block_size = 1024;
- RuntimeState* runtime_state;
- BufferedBlockMgr* block_mgr = CreateMgr(0, max_num_buffers, block_size, &runtime_state);
- vector<BufferedBlockMgr::Block*> blocks;
-
- BufferedBlockMgr::Client* client1 = NULL;
- BufferedBlockMgr::Client* client2 = NULL;
- BufferedBlockMgr::Client* client3 = NULL;
- BufferedBlockMgr::Block* block = NULL;
- ASSERT_OK(block_mgr->RegisterClient("", client1_buffers, false,
- NewClientTracker(runtime_state), runtime_state, &client1));
- ASSERT_TRUE(client1 != NULL);
- ASSERT_OK(block_mgr->RegisterClient("", client2_buffers, false,
- NewClientTracker(runtime_state), runtime_state, &client2));
- ASSERT_TRUE(client2 != NULL);
- ASSERT_OK(block_mgr->RegisterClient("", client3_buffers, true,
- NewClientTracker(runtime_state), runtime_state, &client3));
- ASSERT_TRUE(client3 != NULL);
-
- // Client one allocates first block, should work.
- ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
- ASSERT_TRUE(block != NULL);
- blocks.push_back(block);
-
- // Client two allocates first block, should work.
- ASSERT_OK(block_mgr->GetNewBlock(client2, NULL, &block));
- ASSERT_TRUE(block != NULL);
- blocks.push_back(block);
-
- // At this point we've used both buffers. Client one reserved one so subsequent
- // calls should fail with no error (but returns no block).
- ASSERT_OK(block_mgr->GetNewBlock(client1, NULL, &block));
- ASSERT_TRUE(block == NULL);
-
- // Allocate with client two. Since client two reserved 2 buffers, this should fail
- // with MEM_LIMIT_EXCEEDED.
- ASSERT_TRUE(block_mgr->GetNewBlock(client2, NULL, &block).IsMemLimitExceeded());
-
- // Allocate with client three. Since client three can tolerate oversubscription,
- // this should fail with no error even though it was a reserved request.
- ASSERT_OK(block_mgr->GetNewBlock(client3, NULL, &block));
- ASSERT_TRUE(block == NULL);
-
- DeleteBlocks(blocks);
- TearDownMgrs();
-}
-
-TEST_F(BufferedBlockMgrTest, SingleRandom_plain) {
- FLAGS_disk_spill_encryption = false;
- TestRandomInternalSingle(1024);
- TestRandomInternalSingle(8 * 1024);
- TestRandomInternalSingle(8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi2Random_plain) {
- FLAGS_disk_spill_encryption = false;
- TestRandomInternalMulti(2, 1024);
- TestRandomInternalMulti(2, 8 * 1024);
- TestRandomInternalMulti(2, 8 * 1024 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi4Random_plain) {
- FLAGS_disk_spill_encryption = false;
- TestRandomInternalMulti(4, 1024);
- TestRandomInternalMulti(4, 8 * 1024);
- TestRandomInternalMulti(4, 8 * 1024 * 1024);
-}
-
-// TODO: Enable when we improve concurrency/scalability of block mgr.
-// TEST_F(BufferedBlockMgrTest, Multi8Random_plain) {
-// FLAGS_disk_spill_encryption = false;
-// TestRandomInternalMulti(8);
-// }
-
-TEST_F(BufferedBlockMgrTest, SingleRandom_encryption) {
- FLAGS_disk_spill_encryption = true;
- TestRandomInternalSingle(8 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi2Random_encryption) {
- FLAGS_disk_spill_encryption = true;
- TestRandomInternalMulti(2, 8 * 1024);
-}
-
-TEST_F(BufferedBlockMgrTest, Multi4Random_encryption) {
- FLAGS_disk_spill_encryption = true;
- TestRandomInternalMulti(4, 8 * 1024);
-}
-
-// TODO: Enable when we improve concurrency/scalability of block mgr.
-// TEST_F(BufferedBlockMgrTest, Multi8Random_encryption) {
-// FLAGS_disk_spill_encryption = true;
-// TestRandomInternalMulti(8);
-// }
-
-
-TEST_F(BufferedBlockMgrTest, CreateDestroyMulti) {
- CreateDestroyMulti();
-}
-
-}
-
-int main(int argc, char** argv) {
- ::testing::InitGoogleTest(&argc, argv);
- impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
- impala::InitFeSupport();
- impala::LlvmCodeGen::InitializeLlvm();
- return RUN_ALL_TESTS();
-}