You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sz...@apache.org on 2019/09/09 11:42:04 UTC

[hive] branch master updated: HIVE-21397: BloomFilter for hive Managed [ACID] table does not work as expected (Denys Kuzmenko, reviewed by Gopal Vijayaraghavan and Adam Szita)

This is an automated email from the ASF dual-hosted git repository.

szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 6dbc115  HIVE-21397: BloomFilter for hive Managed [ACID] table does not work as expected (Denys Kuzmenko, reviewed by Gopal Vijayaraghavan and Adam Szita)
6dbc115 is described below

commit 6dbc115d4a35361c07ad2f3f0175825149b8ef35
Author: Denys Kuzmenko <dk...@cloudera.com>
AuthorDate: Mon Sep 9 13:39:02 2019 +0200

    HIVE-21397: BloomFilter for hive Managed [ACID] table does not work as expected (Denys Kuzmenko, reviewed by Gopal Vijayaraghavan and Adam Szita)
---
 .../acid_bloom_filter_orc_file_dump.q              |  33 +++
 .../acid_bloom_filter_orc_file_dump.q.out          | 309 +++++++++++++++++++++
 2 files changed, 342 insertions(+)

diff --git a/ql/src/test/queries/clientpositive/acid_bloom_filter_orc_file_dump.q b/ql/src/test/queries/clientpositive/acid_bloom_filter_orc_file_dump.q
new file mode 100644
index 0000000..30daaf8
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/acid_bloom_filter_orc_file_dump.q
@@ -0,0 +1,33 @@
+SET hive.vectorized.execution.enabled=FALSE;
+SET hive.mapred.mode=nonstrict;
+
+SET hive.support.concurrency=TRUE;
+SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+
+DROP TABLE if exists bloomTest;
+
+CREATE TABLE bloomTest(
+  msisdn  STRING,
+  imsi    VARCHAR(20),
+  imei    BIGINT,
+  cell_id BIGINT)
+
+ROW FORMAT SERDE
+  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+
+TBLPROPERTIES (
+  'bucketing_version'='2',
+  'orc.bloom.filter.columns'='msisdn,cell_id,imsi',
+  'orc.bloom.filter.fpp'='0.02',
+  'transactional'='true'
+);
+
+INSERT INTO bloomTest VALUES ('12345', '12345', 12345, 12345);
+INSERT INTO bloomTest VALUES ('2345', '2345', 2345, 2345);
+
+SET hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecOrcFileDump;
+SELECT * FROM bloomTest LIMIT 1;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/acid_bloom_filter_orc_file_dump.q.out b/ql/src/test/results/clientpositive/acid_bloom_filter_orc_file_dump.q.out
new file mode 100644
index 0000000..cfbe9cc
--- /dev/null
+++ b/ql/src/test/results/clientpositive/acid_bloom_filter_orc_file_dump.q.out
@@ -0,0 +1,309 @@
+PREHOOK: query: DROP TABLE if exists bloomTest
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE if exists bloomTest
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE bloomTest(
+  msisdn  STRING,
+  imsi    VARCHAR(20),
+  imei    BIGINT,
+  cell_id BIGINT)
+
+ROW FORMAT SERDE
+  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+
+TBLPROPERTIES (
+  'bucketing_version'='2',
+  'orc.bloom.filter.columns'='msisdn,cell_id,imsi',
+  'orc.bloom.filter.fpp'='0.02',
+  'transactional'='true'
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@bloomTest
+POSTHOOK: query: CREATE TABLE bloomTest(
+  msisdn  STRING,
+  imsi    VARCHAR(20),
+  imei    BIGINT,
+  cell_id BIGINT)
+
+ROW FORMAT SERDE
+  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+
+TBLPROPERTIES (
+  'bucketing_version'='2',
+  'orc.bloom.filter.columns'='msisdn,cell_id,imsi',
+  'orc.bloom.filter.fpp'='0.02',
+  'transactional'='true'
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@bloomTest
+PREHOOK: query: INSERT INTO bloomTest VALUES ('12345', '12345', 12345, 12345)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@bloomtest
+POSTHOOK: query: INSERT INTO bloomTest VALUES ('12345', '12345', 12345, 12345)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@bloomtest
+POSTHOOK: Lineage: bloomtest.cell_id SCRIPT []
+POSTHOOK: Lineage: bloomtest.imei SCRIPT []
+POSTHOOK: Lineage: bloomtest.imsi SCRIPT []
+POSTHOOK: Lineage: bloomtest.msisdn SCRIPT []
+PREHOOK: query: INSERT INTO bloomTest VALUES ('2345', '2345', 2345, 2345)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@bloomtest
+POSTHOOK: query: INSERT INTO bloomTest VALUES ('2345', '2345', 2345, 2345)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@bloomtest
+POSTHOOK: Lineage: bloomtest.cell_id SCRIPT []
+POSTHOOK: Lineage: bloomtest.imei SCRIPT []
+POSTHOOK: Lineage: bloomtest.imsi SCRIPT []
+POSTHOOK: Lineage: bloomtest.msisdn SCRIPT []
+PREHOOK: query: SELECT * FROM bloomTest LIMIT 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bloomtest
+#### A masked pattern was here ####
+-- BEGIN ORC FILE DUMP --
+#### A masked pattern was here ####
+File Version: 0.12 with ORC_517
+Rows: 1
+Compression: ZLIB
+Compression size: 32768
+Type: struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,currentTransaction:bigint,row:struct<msisdn:string,imsi:varchar(20),imei:bigint,cell_id:bigint>>
+
+Stripe Statistics:
+  Stripe 1:
+    Column 0: count: 1 hasNull: false
+    Column 1: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+    Column 2: count: 1 hasNull: false bytesOnDisk: 6 min: 2 max: 2 sum: 2
+    Column 3: count: 1 hasNull: false bytesOnDisk: 9 min: 536870912 max: 536870912 sum: 536870912
+    Column 4: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+    Column 5: count: 1 hasNull: false bytesOnDisk: 6 min: 2 max: 2 sum: 2
+    Column 6: count: 1 hasNull: false
+    Column 7: count: 1 hasNull: false bytesOnDisk: 13 min: 2345 max: 2345 sum: 4
+    Column 8: count: 1 hasNull: false bytesOnDisk: 13 min: 2345 max: 2345 sum: 4
+    Column 9: count: 1 hasNull: false bytesOnDisk: 7 min: 2345 max: 2345 sum: 2345
+    Column 10: count: 1 hasNull: false bytesOnDisk: 7 min: 2345 max: 2345 sum: 2345
+
+File Statistics:
+  Column 0: count: 1 hasNull: false
+  Column 1: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+  Column 2: count: 1 hasNull: false bytesOnDisk: 6 min: 2 max: 2 sum: 2
+  Column 3: count: 1 hasNull: false bytesOnDisk: 9 min: 536870912 max: 536870912 sum: 536870912
+  Column 4: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+  Column 5: count: 1 hasNull: false bytesOnDisk: 6 min: 2 max: 2 sum: 2
+  Column 6: count: 1 hasNull: false
+  Column 7: count: 1 hasNull: false bytesOnDisk: 13 min: 2345 max: 2345 sum: 4
+  Column 8: count: 1 hasNull: false bytesOnDisk: 13 min: 2345 max: 2345 sum: 4
+  Column 9: count: 1 hasNull: false bytesOnDisk: 7 min: 2345 max: 2345 sum: 2345
+  Column 10: count: 1 hasNull: false bytesOnDisk: 7 min: 2345 max: 2345 sum: 2345
+
+Stripes:
+  Stripe: offset: 3 data: 73 rows: 1 tail: 103 index: 595
+    Stream: column 0 section ROW_INDEX start: 3 length 11
+    Stream: column 1 section ROW_INDEX start: 14 length 24
+    Stream: column 2 section ROW_INDEX start: 38 length 24
+    Stream: column 3 section ROW_INDEX start: 62 length 29
+    Stream: column 4 section ROW_INDEX start: 91 length 24
+    Stream: column 5 section ROW_INDEX start: 115 length 24
+    Stream: column 6 section ROW_INDEX start: 139 length 11
+    Stream: column 7 section ROW_INDEX start: 150 length 30
+    Stream: column 7 section BLOOM_FILTER_UTF8 start: 180 length 112
+    Stream: column 8 section ROW_INDEX start: 292 length 30
+    Stream: column 8 section BLOOM_FILTER_UTF8 start: 322 length 112
+    Stream: column 9 section ROW_INDEX start: 434 length 27
+    Stream: column 10 section ROW_INDEX start: 461 length 27
+    Stream: column 10 section BLOOM_FILTER_UTF8 start: 488 length 110
+    Stream: column 1 section DATA start: 598 length 6
+    Stream: column 2 section DATA start: 604 length 6
+    Stream: column 3 section DATA start: 610 length 9
+    Stream: column 4 section DATA start: 619 length 6
+    Stream: column 5 section DATA start: 625 length 6
+    Stream: column 7 section DATA start: 631 length 7
+    Stream: column 7 section LENGTH start: 638 length 6
+    Stream: column 8 section DATA start: 644 length 7
+    Stream: column 8 section LENGTH start: 651 length 6
+    Stream: column 9 section DATA start: 657 length 7
+    Stream: column 10 section DATA start: 664 length 7
+    Encoding column 0: DIRECT
+    Encoding column 1: DIRECT_V2
+    Encoding column 2: DIRECT_V2
+    Encoding column 3: DIRECT_V2
+    Encoding column 4: DIRECT_V2
+    Encoding column 5: DIRECT_V2
+    Encoding column 6: DIRECT
+    Encoding column 7: DIRECT_V2
+    Encoding column 8: DIRECT_V2
+    Encoding column 9: DIRECT_V2
+    Encoding column 10: DIRECT_V2
+    Row group indices for column 0:
+      Entry 0: count: 1 hasNull: false positions: 
+    Row group indices for column 1:
+      Entry 0: count: 1 hasNull: false min: 0 max: 0 sum: 0 positions: 0,0,0
+    Row group indices for column 2:
+      Entry 0: count: 1 hasNull: false min: 2 max: 2 sum: 2 positions: 0,0,0
+    Row group indices for column 3:
+      Entry 0: count: 1 hasNull: false min: 536870912 max: 536870912 sum: 536870912 positions: 0,0,0
+    Row group indices for column 4:
+      Entry 0: count: 1 hasNull: false min: 0 max: 0 sum: 0 positions: 0,0,0
+    Row group indices for column 5:
+      Entry 0: count: 1 hasNull: false min: 2 max: 2 sum: 2 positions: 0,0,0
+    Row group indices for column 6:
+      Entry 0: count: 1 hasNull: false positions: 
+    Row group indices for column 7:
+      Entry 0: count: 1 hasNull: false min: 2345 max: 2345 sum: 4 positions: 0,0,0,0,0
+    Bloom filters for column 7:
+      Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+      Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+    Row group indices for column 8:
+      Entry 0: count: 1 hasNull: false min: 2345 max: 2345 sum: 4 positions: 0,0,0,0,0
+    Bloom filters for column 8:
+      Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+      Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+    Row group indices for column 9:
+      Entry 0: count: 1 hasNull: false min: 2345 max: 2345 sum: 2345 positions: 0,0,0
+    Row group indices for column 10:
+      Entry 0: count: 1 hasNull: false min: 2345 max: 2345 sum: 2345 positions: 0,0,0
+    Bloom filters for column 10:
+      Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+      Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+
+File length: 1203 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+
+User Metadata:
+  hive.acid.key.index=2,536870912,0;
+  hive.acid.stats=1,0,0
+  hive.acid.version=2
+________________________________________________________________________________________________________________________
+
+-- END ORC FILE DUMP --
+-- BEGIN ORC FILE DUMP --
+#### A masked pattern was here ####
+File Version: 0.12 with ORC_517
+Rows: 1
+Compression: ZLIB
+Compression size: 32768
+Type: struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,currentTransaction:bigint,row:struct<msisdn:string,imsi:varchar(20),imei:bigint,cell_id:bigint>>
+
+Stripe Statistics:
+  Stripe 1:
+    Column 0: count: 1 hasNull: false
+    Column 1: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+    Column 2: count: 1 hasNull: false bytesOnDisk: 6 min: 1 max: 1 sum: 1
+    Column 3: count: 1 hasNull: false bytesOnDisk: 9 min: 536870912 max: 536870912 sum: 536870912
+    Column 4: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+    Column 5: count: 1 hasNull: false bytesOnDisk: 6 min: 1 max: 1 sum: 1
+    Column 6: count: 1 hasNull: false
+    Column 7: count: 1 hasNull: false bytesOnDisk: 14 min: 12345 max: 12345 sum: 5
+    Column 8: count: 1 hasNull: false bytesOnDisk: 14 min: 12345 max: 12345 sum: 5
+    Column 9: count: 1 hasNull: false bytesOnDisk: 7 min: 12345 max: 12345 sum: 12345
+    Column 10: count: 1 hasNull: false bytesOnDisk: 7 min: 12345 max: 12345 sum: 12345
+
+File Statistics:
+  Column 0: count: 1 hasNull: false
+  Column 1: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+  Column 2: count: 1 hasNull: false bytesOnDisk: 6 min: 1 max: 1 sum: 1
+  Column 3: count: 1 hasNull: false bytesOnDisk: 9 min: 536870912 max: 536870912 sum: 536870912
+  Column 4: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+  Column 5: count: 1 hasNull: false bytesOnDisk: 6 min: 1 max: 1 sum: 1
+  Column 6: count: 1 hasNull: false
+  Column 7: count: 1 hasNull: false bytesOnDisk: 14 min: 12345 max: 12345 sum: 5
+  Column 8: count: 1 hasNull: false bytesOnDisk: 14 min: 12345 max: 12345 sum: 5
+  Column 9: count: 1 hasNull: false bytesOnDisk: 7 min: 12345 max: 12345 sum: 12345
+  Column 10: count: 1 hasNull: false bytesOnDisk: 7 min: 12345 max: 12345 sum: 12345
+
+Stripes:
+  Stripe: offset: 3 data: 75 rows: 1 tail: 100 index: 597
+    Stream: column 0 section ROW_INDEX start: 3 length 11
+    Stream: column 1 section ROW_INDEX start: 14 length 24
+    Stream: column 2 section ROW_INDEX start: 38 length 24
+    Stream: column 3 section ROW_INDEX start: 62 length 29
+    Stream: column 4 section ROW_INDEX start: 91 length 24
+    Stream: column 5 section ROW_INDEX start: 115 length 24
+    Stream: column 6 section ROW_INDEX start: 139 length 11
+    Stream: column 7 section ROW_INDEX start: 150 length 31
+    Stream: column 7 section BLOOM_FILTER_UTF8 start: 181 length 111
+    Stream: column 8 section ROW_INDEX start: 292 length 31
+    Stream: column 8 section BLOOM_FILTER_UTF8 start: 323 length 111
+    Stream: column 9 section ROW_INDEX start: 434 length 29
+    Stream: column 10 section ROW_INDEX start: 463 length 29
+    Stream: column 10 section BLOOM_FILTER_UTF8 start: 492 length 108
+    Stream: column 1 section DATA start: 600 length 6
+    Stream: column 2 section DATA start: 606 length 6
+    Stream: column 3 section DATA start: 612 length 9
+    Stream: column 4 section DATA start: 621 length 6
+    Stream: column 5 section DATA start: 627 length 6
+    Stream: column 7 section DATA start: 633 length 8
+    Stream: column 7 section LENGTH start: 641 length 6
+    Stream: column 8 section DATA start: 647 length 8
+    Stream: column 8 section LENGTH start: 655 length 6
+    Stream: column 9 section DATA start: 661 length 7
+    Stream: column 10 section DATA start: 668 length 7
+    Encoding column 0: DIRECT
+    Encoding column 1: DIRECT_V2
+    Encoding column 2: DIRECT_V2
+    Encoding column 3: DIRECT_V2
+    Encoding column 4: DIRECT_V2
+    Encoding column 5: DIRECT_V2
+    Encoding column 6: DIRECT
+    Encoding column 7: DIRECT_V2
+    Encoding column 8: DIRECT_V2
+    Encoding column 9: DIRECT_V2
+    Encoding column 10: DIRECT_V2
+    Row group indices for column 0:
+      Entry 0: count: 1 hasNull: false positions: 
+    Row group indices for column 1:
+      Entry 0: count: 1 hasNull: false min: 0 max: 0 sum: 0 positions: 0,0,0
+    Row group indices for column 2:
+      Entry 0: count: 1 hasNull: false min: 1 max: 1 sum: 1 positions: 0,0,0
+    Row group indices for column 3:
+      Entry 0: count: 1 hasNull: false min: 536870912 max: 536870912 sum: 536870912 positions: 0,0,0
+    Row group indices for column 4:
+      Entry 0: count: 1 hasNull: false min: 0 max: 0 sum: 0 positions: 0,0,0
+    Row group indices for column 5:
+      Entry 0: count: 1 hasNull: false min: 1 max: 1 sum: 1 positions: 0,0,0
+    Row group indices for column 6:
+      Entry 0: count: 1 hasNull: false positions: 
+    Row group indices for column 7:
+      Entry 0: count: 1 hasNull: false min: 12345 max: 12345 sum: 5 positions: 0,0,0,0,0
+    Bloom filters for column 7:
+      Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+      Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+    Row group indices for column 8:
+      Entry 0: count: 1 hasNull: false min: 12345 max: 12345 sum: 5 positions: 0,0,0,0,0
+    Bloom filters for column 8:
+      Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+      Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+    Row group indices for column 9:
+      Entry 0: count: 1 hasNull: false min: 12345 max: 12345 sum: 12345 positions: 0,0,0
+    Row group indices for column 10:
+      Entry 0: count: 1 hasNull: false min: 12345 max: 12345 sum: 12345 positions: 0,0,0
+    Bloom filters for column 10:
+      Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+      Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+
+File length: 1211 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+
+User Metadata:
+  hive.acid.key.index=1,536870912,0;
+  hive.acid.stats=1,0,0
+  hive.acid.version=2
+________________________________________________________________________________________________________________________
+
+-- END ORC FILE DUMP --
+12345	12345	12345	12345