You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sz...@apache.org on 2019/09/09 11:42:04 UTC
[hive] branch master updated: HIVE-21397: BloomFilter for hive
Managed [ACID] table does not work as expected (Denys Kuzmenko,
reviewed by Gopal Vijayaraghavan and Adam Szita)
This is an automated email from the ASF dual-hosted git repository.
szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 6dbc115 HIVE-21397: BloomFilter for hive Managed [ACID] table does not work as expected (Denys Kuzmenko, reviewed by Gopal Vijayaraghavan and Adam Szita)
6dbc115 is described below
commit 6dbc115d4a35361c07ad2f3f0175825149b8ef35
Author: Denys Kuzmenko <dk...@cloudera.com>
AuthorDate: Mon Sep 9 13:39:02 2019 +0200
HIVE-21397: BloomFilter for hive Managed [ACID] table does not work as expected (Denys Kuzmenko, reviewed by Gopal Vijayaraghavan and Adam Szita)
---
.../acid_bloom_filter_orc_file_dump.q | 33 +++
.../acid_bloom_filter_orc_file_dump.q.out | 309 +++++++++++++++++++++
2 files changed, 342 insertions(+)
diff --git a/ql/src/test/queries/clientpositive/acid_bloom_filter_orc_file_dump.q b/ql/src/test/queries/clientpositive/acid_bloom_filter_orc_file_dump.q
new file mode 100644
index 0000000..30daaf8
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/acid_bloom_filter_orc_file_dump.q
@@ -0,0 +1,33 @@
+SET hive.vectorized.execution.enabled=FALSE;
+SET hive.mapred.mode=nonstrict;
+
+SET hive.support.concurrency=TRUE;
+SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+
+DROP TABLE if exists bloomTest;
+
+CREATE TABLE bloomTest(
+ msisdn STRING,
+ imsi VARCHAR(20),
+ imei BIGINT,
+ cell_id BIGINT)
+
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+
+TBLPROPERTIES (
+ 'bucketing_version'='2',
+ 'orc.bloom.filter.columns'='msisdn,cell_id,imsi',
+ 'orc.bloom.filter.fpp'='0.02',
+ 'transactional'='true'
+);
+
+INSERT INTO bloomTest VALUES ('12345', '12345', 12345, 12345);
+INSERT INTO bloomTest VALUES ('2345', '2345', 2345, 2345);
+
+SET hive.exec.post.hooks=org.apache.hadoop.hive.ql.hooks.PostExecOrcFileDump;
+SELECT * FROM bloomTest LIMIT 1;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/acid_bloom_filter_orc_file_dump.q.out b/ql/src/test/results/clientpositive/acid_bloom_filter_orc_file_dump.q.out
new file mode 100644
index 0000000..cfbe9cc
--- /dev/null
+++ b/ql/src/test/results/clientpositive/acid_bloom_filter_orc_file_dump.q.out
@@ -0,0 +1,309 @@
+PREHOOK: query: DROP TABLE if exists bloomTest
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE if exists bloomTest
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE bloomTest(
+ msisdn STRING,
+ imsi VARCHAR(20),
+ imei BIGINT,
+ cell_id BIGINT)
+
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+
+TBLPROPERTIES (
+ 'bucketing_version'='2',
+ 'orc.bloom.filter.columns'='msisdn,cell_id,imsi',
+ 'orc.bloom.filter.fpp'='0.02',
+ 'transactional'='true'
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@bloomTest
+POSTHOOK: query: CREATE TABLE bloomTest(
+ msisdn STRING,
+ imsi VARCHAR(20),
+ imei BIGINT,
+ cell_id BIGINT)
+
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
+
+TBLPROPERTIES (
+ 'bucketing_version'='2',
+ 'orc.bloom.filter.columns'='msisdn,cell_id,imsi',
+ 'orc.bloom.filter.fpp'='0.02',
+ 'transactional'='true'
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@bloomTest
+PREHOOK: query: INSERT INTO bloomTest VALUES ('12345', '12345', 12345, 12345)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@bloomtest
+POSTHOOK: query: INSERT INTO bloomTest VALUES ('12345', '12345', 12345, 12345)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@bloomtest
+POSTHOOK: Lineage: bloomtest.cell_id SCRIPT []
+POSTHOOK: Lineage: bloomtest.imei SCRIPT []
+POSTHOOK: Lineage: bloomtest.imsi SCRIPT []
+POSTHOOK: Lineage: bloomtest.msisdn SCRIPT []
+PREHOOK: query: INSERT INTO bloomTest VALUES ('2345', '2345', 2345, 2345)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@bloomtest
+POSTHOOK: query: INSERT INTO bloomTest VALUES ('2345', '2345', 2345, 2345)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@bloomtest
+POSTHOOK: Lineage: bloomtest.cell_id SCRIPT []
+POSTHOOK: Lineage: bloomtest.imei SCRIPT []
+POSTHOOK: Lineage: bloomtest.imsi SCRIPT []
+POSTHOOK: Lineage: bloomtest.msisdn SCRIPT []
+PREHOOK: query: SELECT * FROM bloomTest LIMIT 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@bloomtest
+#### A masked pattern was here ####
+-- BEGIN ORC FILE DUMP --
+#### A masked pattern was here ####
+File Version: 0.12 with ORC_517
+Rows: 1
+Compression: ZLIB
+Compression size: 32768
+Type: struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,currentTransaction:bigint,row:struct<msisdn:string,imsi:varchar(20),imei:bigint,cell_id:bigint>>
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 1 hasNull: false
+ Column 1: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+ Column 2: count: 1 hasNull: false bytesOnDisk: 6 min: 2 max: 2 sum: 2
+ Column 3: count: 1 hasNull: false bytesOnDisk: 9 min: 536870912 max: 536870912 sum: 536870912
+ Column 4: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+ Column 5: count: 1 hasNull: false bytesOnDisk: 6 min: 2 max: 2 sum: 2
+ Column 6: count: 1 hasNull: false
+ Column 7: count: 1 hasNull: false bytesOnDisk: 13 min: 2345 max: 2345 sum: 4
+ Column 8: count: 1 hasNull: false bytesOnDisk: 13 min: 2345 max: 2345 sum: 4
+ Column 9: count: 1 hasNull: false bytesOnDisk: 7 min: 2345 max: 2345 sum: 2345
+ Column 10: count: 1 hasNull: false bytesOnDisk: 7 min: 2345 max: 2345 sum: 2345
+
+File Statistics:
+ Column 0: count: 1 hasNull: false
+ Column 1: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+ Column 2: count: 1 hasNull: false bytesOnDisk: 6 min: 2 max: 2 sum: 2
+ Column 3: count: 1 hasNull: false bytesOnDisk: 9 min: 536870912 max: 536870912 sum: 536870912
+ Column 4: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+ Column 5: count: 1 hasNull: false bytesOnDisk: 6 min: 2 max: 2 sum: 2
+ Column 6: count: 1 hasNull: false
+ Column 7: count: 1 hasNull: false bytesOnDisk: 13 min: 2345 max: 2345 sum: 4
+ Column 8: count: 1 hasNull: false bytesOnDisk: 13 min: 2345 max: 2345 sum: 4
+ Column 9: count: 1 hasNull: false bytesOnDisk: 7 min: 2345 max: 2345 sum: 2345
+ Column 10: count: 1 hasNull: false bytesOnDisk: 7 min: 2345 max: 2345 sum: 2345
+
+Stripes:
+ Stripe: offset: 3 data: 73 rows: 1 tail: 103 index: 595
+ Stream: column 0 section ROW_INDEX start: 3 length 11
+ Stream: column 1 section ROW_INDEX start: 14 length 24
+ Stream: column 2 section ROW_INDEX start: 38 length 24
+ Stream: column 3 section ROW_INDEX start: 62 length 29
+ Stream: column 4 section ROW_INDEX start: 91 length 24
+ Stream: column 5 section ROW_INDEX start: 115 length 24
+ Stream: column 6 section ROW_INDEX start: 139 length 11
+ Stream: column 7 section ROW_INDEX start: 150 length 30
+ Stream: column 7 section BLOOM_FILTER_UTF8 start: 180 length 112
+ Stream: column 8 section ROW_INDEX start: 292 length 30
+ Stream: column 8 section BLOOM_FILTER_UTF8 start: 322 length 112
+ Stream: column 9 section ROW_INDEX start: 434 length 27
+ Stream: column 10 section ROW_INDEX start: 461 length 27
+ Stream: column 10 section BLOOM_FILTER_UTF8 start: 488 length 110
+ Stream: column 1 section DATA start: 598 length 6
+ Stream: column 2 section DATA start: 604 length 6
+ Stream: column 3 section DATA start: 610 length 9
+ Stream: column 4 section DATA start: 619 length 6
+ Stream: column 5 section DATA start: 625 length 6
+ Stream: column 7 section DATA start: 631 length 7
+ Stream: column 7 section LENGTH start: 638 length 6
+ Stream: column 8 section DATA start: 644 length 7
+ Stream: column 8 section LENGTH start: 651 length 6
+ Stream: column 9 section DATA start: 657 length 7
+ Stream: column 10 section DATA start: 664 length 7
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DIRECT_V2
+ Encoding column 4: DIRECT_V2
+ Encoding column 5: DIRECT_V2
+ Encoding column 6: DIRECT
+ Encoding column 7: DIRECT_V2
+ Encoding column 8: DIRECT_V2
+ Encoding column 9: DIRECT_V2
+ Encoding column 10: DIRECT_V2
+ Row group indices for column 0:
+ Entry 0: count: 1 hasNull: false positions:
+ Row group indices for column 1:
+ Entry 0: count: 1 hasNull: false min: 0 max: 0 sum: 0 positions: 0,0,0
+ Row group indices for column 2:
+ Entry 0: count: 1 hasNull: false min: 2 max: 2 sum: 2 positions: 0,0,0
+ Row group indices for column 3:
+ Entry 0: count: 1 hasNull: false min: 536870912 max: 536870912 sum: 536870912 positions: 0,0,0
+ Row group indices for column 4:
+ Entry 0: count: 1 hasNull: false min: 0 max: 0 sum: 0 positions: 0,0,0
+ Row group indices for column 5:
+ Entry 0: count: 1 hasNull: false min: 2 max: 2 sum: 2 positions: 0,0,0
+ Row group indices for column 6:
+ Entry 0: count: 1 hasNull: false positions:
+ Row group indices for column 7:
+ Entry 0: count: 1 hasNull: false min: 2345 max: 2345 sum: 4 positions: 0,0,0,0,0
+ Bloom filters for column 7:
+ Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Row group indices for column 8:
+ Entry 0: count: 1 hasNull: false min: 2345 max: 2345 sum: 4 positions: 0,0,0,0,0
+ Bloom filters for column 8:
+ Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Row group indices for column 9:
+ Entry 0: count: 1 hasNull: false min: 2345 max: 2345 sum: 2345 positions: 0,0,0
+ Row group indices for column 10:
+ Entry 0: count: 1 hasNull: false min: 2345 max: 2345 sum: 2345 positions: 0,0,0
+ Bloom filters for column 10:
+ Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+
+File length: 1203 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+
+User Metadata:
+ hive.acid.key.index=2,536870912,0;
+ hive.acid.stats=1,0,0
+ hive.acid.version=2
+________________________________________________________________________________________________________________________
+
+-- END ORC FILE DUMP --
+-- BEGIN ORC FILE DUMP --
+#### A masked pattern was here ####
+File Version: 0.12 with ORC_517
+Rows: 1
+Compression: ZLIB
+Compression size: 32768
+Type: struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,currentTransaction:bigint,row:struct<msisdn:string,imsi:varchar(20),imei:bigint,cell_id:bigint>>
+
+Stripe Statistics:
+ Stripe 1:
+ Column 0: count: 1 hasNull: false
+ Column 1: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+ Column 2: count: 1 hasNull: false bytesOnDisk: 6 min: 1 max: 1 sum: 1
+ Column 3: count: 1 hasNull: false bytesOnDisk: 9 min: 536870912 max: 536870912 sum: 536870912
+ Column 4: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+ Column 5: count: 1 hasNull: false bytesOnDisk: 6 min: 1 max: 1 sum: 1
+ Column 6: count: 1 hasNull: false
+ Column 7: count: 1 hasNull: false bytesOnDisk: 14 min: 12345 max: 12345 sum: 5
+ Column 8: count: 1 hasNull: false bytesOnDisk: 14 min: 12345 max: 12345 sum: 5
+ Column 9: count: 1 hasNull: false bytesOnDisk: 7 min: 12345 max: 12345 sum: 12345
+ Column 10: count: 1 hasNull: false bytesOnDisk: 7 min: 12345 max: 12345 sum: 12345
+
+File Statistics:
+ Column 0: count: 1 hasNull: false
+ Column 1: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+ Column 2: count: 1 hasNull: false bytesOnDisk: 6 min: 1 max: 1 sum: 1
+ Column 3: count: 1 hasNull: false bytesOnDisk: 9 min: 536870912 max: 536870912 sum: 536870912
+ Column 4: count: 1 hasNull: false bytesOnDisk: 6 min: 0 max: 0 sum: 0
+ Column 5: count: 1 hasNull: false bytesOnDisk: 6 min: 1 max: 1 sum: 1
+ Column 6: count: 1 hasNull: false
+ Column 7: count: 1 hasNull: false bytesOnDisk: 14 min: 12345 max: 12345 sum: 5
+ Column 8: count: 1 hasNull: false bytesOnDisk: 14 min: 12345 max: 12345 sum: 5
+ Column 9: count: 1 hasNull: false bytesOnDisk: 7 min: 12345 max: 12345 sum: 12345
+ Column 10: count: 1 hasNull: false bytesOnDisk: 7 min: 12345 max: 12345 sum: 12345
+
+Stripes:
+ Stripe: offset: 3 data: 75 rows: 1 tail: 100 index: 597
+ Stream: column 0 section ROW_INDEX start: 3 length 11
+ Stream: column 1 section ROW_INDEX start: 14 length 24
+ Stream: column 2 section ROW_INDEX start: 38 length 24
+ Stream: column 3 section ROW_INDEX start: 62 length 29
+ Stream: column 4 section ROW_INDEX start: 91 length 24
+ Stream: column 5 section ROW_INDEX start: 115 length 24
+ Stream: column 6 section ROW_INDEX start: 139 length 11
+ Stream: column 7 section ROW_INDEX start: 150 length 31
+ Stream: column 7 section BLOOM_FILTER_UTF8 start: 181 length 111
+ Stream: column 8 section ROW_INDEX start: 292 length 31
+ Stream: column 8 section BLOOM_FILTER_UTF8 start: 323 length 111
+ Stream: column 9 section ROW_INDEX start: 434 length 29
+ Stream: column 10 section ROW_INDEX start: 463 length 29
+ Stream: column 10 section BLOOM_FILTER_UTF8 start: 492 length 108
+ Stream: column 1 section DATA start: 600 length 6
+ Stream: column 2 section DATA start: 606 length 6
+ Stream: column 3 section DATA start: 612 length 9
+ Stream: column 4 section DATA start: 621 length 6
+ Stream: column 5 section DATA start: 627 length 6
+ Stream: column 7 section DATA start: 633 length 8
+ Stream: column 7 section LENGTH start: 641 length 6
+ Stream: column 8 section DATA start: 647 length 8
+ Stream: column 8 section LENGTH start: 655 length 6
+ Stream: column 9 section DATA start: 661 length 7
+ Stream: column 10 section DATA start: 668 length 7
+ Encoding column 0: DIRECT
+ Encoding column 1: DIRECT_V2
+ Encoding column 2: DIRECT_V2
+ Encoding column 3: DIRECT_V2
+ Encoding column 4: DIRECT_V2
+ Encoding column 5: DIRECT_V2
+ Encoding column 6: DIRECT
+ Encoding column 7: DIRECT_V2
+ Encoding column 8: DIRECT_V2
+ Encoding column 9: DIRECT_V2
+ Encoding column 10: DIRECT_V2
+ Row group indices for column 0:
+ Entry 0: count: 1 hasNull: false positions:
+ Row group indices for column 1:
+ Entry 0: count: 1 hasNull: false min: 0 max: 0 sum: 0 positions: 0,0,0
+ Row group indices for column 2:
+ Entry 0: count: 1 hasNull: false min: 1 max: 1 sum: 1 positions: 0,0,0
+ Row group indices for column 3:
+ Entry 0: count: 1 hasNull: false min: 536870912 max: 536870912 sum: 536870912 positions: 0,0,0
+ Row group indices for column 4:
+ Entry 0: count: 1 hasNull: false min: 0 max: 0 sum: 0 positions: 0,0,0
+ Row group indices for column 5:
+ Entry 0: count: 1 hasNull: false min: 1 max: 1 sum: 1 positions: 0,0,0
+ Row group indices for column 6:
+ Entry 0: count: 1 hasNull: false positions:
+ Row group indices for column 7:
+ Entry 0: count: 1 hasNull: false min: 12345 max: 12345 sum: 5 positions: 0,0,0,0,0
+ Bloom filters for column 7:
+ Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Row group indices for column 8:
+ Entry 0: count: 1 hasNull: false min: 12345 max: 12345 sum: 5 positions: 0,0,0,0,0
+ Bloom filters for column 8:
+ Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Row group indices for column 9:
+ Entry 0: count: 1 hasNull: false min: 12345 max: 12345 sum: 12345 positions: 0,0,0
+ Row group indices for column 10:
+ Entry 0: count: 1 hasNull: false min: 12345 max: 12345 sum: 12345 positions: 0,0,0
+ Bloom filters for column 10:
+ Entry 0: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+ Stripe level merge: numHashFunctions: 6 bitCount: 81472 popCount: 6 loadFactor: 0.0001 expectedFpp: 1.5953551E-25
+
+File length: 1211 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
+
+User Metadata:
+ hive.acid.key.index=1,536870912,0;
+ hive.acid.stats=1,0,0
+ hive.acid.version=2
+________________________________________________________________________________________________________________________
+
+-- END ORC FILE DUMP --
+12345 12345 12345 12345