Posted to issues-all@impala.apache.org by "Tianyi Wang (JIRA)" <ji...@apache.org> on 2018/06/07 20:24:00 UTC

[jira] [Comment Edited] (IMPALA-3040) test_caching_ddl failed with unexpected get_num_cache_requests

    [ https://issues.apache.org/jira/browse/IMPALA-3040?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16504136#comment-16504136 ] 

Tianyi Wang edited comment on IMPALA-3040 at 6/7/18 8:23 PM:
-------------------------------------------------------------

In the case I looked into, the problem is that the cache directive /test-warehouse/cachedb.db/cached_tbl_local/j=1 never got removed. The relevant namenode log:
{noformat}
2018-05-29 22:31:55,636 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Rescanning after 3000 milliseconds
2018-05-29 22:31:55,637 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Scanned 14 directive(s) and 7 block(s) in 1 millisecond(s).
2018-05-29 22:31:58,636 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Rescanning after 3000 milliseconds
2018-05-29 22:31:58,637 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Scanned 14 directive(s) and 7 block(s) in 1 millisecond(s).
2018-05-29 22:32:01,636 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Rescanning after 3000 milliseconds
2018-05-29 22:32:01,637 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Scanned 14 directive(s) and 7 block(s) in 1 millisecond(s).
2018-05-29 22:32:03,483 INFO org.apache.hadoop.hdfs.server.namenode.CacheManager: addDirective of {path: /test-warehouse/cachedb.db/cached_tbl_local/j=1, replication: 1, pool: testPool, expiration: 26687997791:19:48:13.951} successful.
2018-05-29 22:32:03,679 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Rescanning because of pending operations
2018-05-29 22:32:03,680 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Scanned 15 directive(s) and 7 block(s) in 1 millisecond(s).
2018-05-29 22:32:04,738 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* allocateBlock: /hbase/WALs/localhost,16203,1527647516612/localhost%2C16203%2C1527647516612.default.1527658324725. BP-1128346793-127.0.0.1-1527647465315 blk_1073755283_14459{blockUCState=UNDER_CONSTRUCTION, primaryNodeIndex=-1, replicas=[ReplicaUnderConstruction[[DISK]DS-45a1fee3-50a1-44cc-8230-916d08f9b968:NORMAL:127.0.0.1:31001|RBW]]}
2018-05-29 22:32:04,740 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* allocateBlock: /hbase/WALs/localhost,16201,1527647513859/localhost%2C16201%2C1527647513859.default.1527658324731. BP-1128346793-127.0.0.1-1527647465315 blk_1073755284_14460{blockUCState=UNDER_CONSTRUCTION, primaryNodeIndex=-1, replicas=[ReplicaUnderConstruction[[DISK]DS-223dfda7-738f-4020-bf4c-29a4642750e1:NORMAL:127.0.0.1:31002|RBW]]}
2018-05-29 22:32:04,748 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* fsync: /hbase/WALs/localhost,16203,1527647516612/localhost%2C16203%2C1527647516612.default.1527658324725 for DFSClient_NONMAPREDUCE_-454483218_1
2018-05-29 22:32:04,748 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* fsync: /hbase/WALs/localhost,16201,1527647513859/localhost%2C16201%2C1527647513859.default.1527658324731 for DFSClient_NONMAPREDUCE_1562981572_1
2018-05-29 22:32:04,755 INFO BlockStateChange: BLOCK* addStoredBlock: blockMap updated: 127.0.0.1:31002 is added to blk_1073755263_14439{blockUCState=UNDER_CONSTRUCTION, primaryNodeIndex=-1, replicas=[ReplicaUnderConstruction[[DISK]DS-223dfda7-738f-4020-bf4c-29a4642750e1:NORMAL:127.0.0.1:31002|RBW]]} size 83
2018-05-29 22:32:04,755 INFO BlockStateChange: BLOCK* addStoredBlock: blockMap updated: 127.0.0.1:31001 is added to blk_1073755264_14440{blockUCState=UNDER_CONSTRUCTION, primaryNodeIndex=-1, replicas=[ReplicaUnderConstruction[[DISK]DS-45a1fee3-50a1-44cc-8230-916d08f9b968:NORMAL:127.0.0.1:31001|RBW]]} size 83
2018-05-29 22:32:04,756 INFO org.apache.hadoop.hdfs.StateChange: DIR* completeFile: /hbase/WALs/localhost,16203,1527647516612/localhost%2C16203%2C1527647516612.default.1527654724633 is closed by DFSClient_NONMAPREDUCE_-454483218_1
2018-05-29 22:32:04,756 INFO org.apache.hadoop.hdfs.StateChange: DIR* completeFile: /hbase/WALs/localhost,16201,1527647513859/localhost%2C16201%2C1527647513859.default.1527654724642 is closed by DFSClient_NONMAPREDUCE_1562981572_1
2018-05-29 22:32:04,765 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* allocateBlock: /hbase/WALs/localhost,16202,1527647515279/localhost%2C16202%2C1527647515279.default.1527658324757. BP-1128346793-127.0.0.1-1527647465315 blk_1073755285_14461{blockUCState=UNDER_CONSTRUCTION, primaryNodeIndex=-1, replicas=[ReplicaUnderConstruction[[DISK]DS-223dfda7-738f-4020-bf4c-29a4642750e1:NORMAL:127.0.0.1:31002|RBW]]}
2018-05-29 22:32:04,771 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* fsync: /hbase/WALs/localhost,16202,1527647515279/localhost%2C16202%2C1527647515279.default.1527658324757 for DFSClient_NONMAPREDUCE_-1203988066_1
2018-05-29 22:32:04,778 INFO BlockStateChange: BLOCK* addStoredBlock: blockMap updated: 127.0.0.1:31000 is added to blk_1073755265_14441{blockUCState=UNDER_CONSTRUCTION, primaryNodeIndex=-1, replicas=[ReplicaUnderConstruction[[DISK]DS-32944451-dc57-41dc-8020-5c9c1e27be16:NORMAL:127.0.0.1:31000|RBW]]} size 83
2018-05-29 22:32:04,778 INFO org.apache.hadoop.hdfs.StateChange: DIR* completeFile: /hbase/WALs/localhost,16202,1527647515279/localhost%2C16202%2C1527647515279.default.1527654724676 is closed by DFSClient_NONMAPREDUCE_-1203988066_1
2018-05-29 22:32:06,679 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Rescanning after 3000 milliseconds
2018-05-29 22:32:06,680 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Scanned 15 directive(s) and 7 block(s) in 1 millisecond(s).
2018-05-29 22:32:09,495 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* allocateBlock: /hbase/WALs/localhost,16202,1527647515279/localhost%2C16202%2C1527647515279.meta.1527658329232.meta. BP-1128346793-127.0.0.1-1527647465315 blk_1073755286_14462{blockUCState=UNDER_CONSTRUCTION, primaryNodeIndex=-1, replicas=[ReplicaUnderConstruction[[DISK]DS-45a1fee3-50a1-44cc-8230-916d08f9b968:NORMAL:127.0.0.1:31001|RBW]]}
2018-05-29 22:32:09,679 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Rescanning after 3000 milliseconds
2018-05-29 22:32:09,986 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Scanned 15 directive(s) and 7 block(s) in 307 millisecond(s).
2018-05-29 22:32:10,069 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* fsync: /hbase/WALs/localhost,16202,1527647515279/localhost%2C16202%2C1527647515279.meta.1527658329232.meta for DFSClient_NONMAPREDUCE_-1203988066_1
2018-05-29 22:32:10,500 INFO BlockStateChange: BLOCK* addStoredBlock: blockMap updated: 127.0.0.1:31000 is added to blk_1073755266_14442{blockUCState=UNDER_CONSTRUCTION, primaryNodeIndex=-1, replicas=[ReplicaUnderConstruction[[DISK]DS-32944451-dc57-41dc-8020-5c9c1e27be16:NORMAL:127.0.0.1:31000|RBW]]} size 83
2018-05-29 22:32:10,500 INFO org.apache.hadoop.hdfs.StateChange: DIR* completeFile: /hbase/WALs/localhost,16202,1527647515279/localhost%2C16202%2C1527647515279.meta.1527654729154.meta is closed by DFSClient_NONMAPREDUCE_-1203988066_1
2018-05-29 22:32:12,680 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Rescanning after 3001 milliseconds
2018-05-29 22:32:12,680 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Scanned 15 directive(s) and 7 block(s) in 0 millisecond(s).
2018-05-29 22:32:15,681 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Rescanning after 3001 milliseconds
2018-05-29 22:32:15,681 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Scanned 15 directive(s) and 7 block(s) in 0 millisecond(s).
2018-05-29 22:32:18,682 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Rescanning after 3001 milliseconds
2018-05-29 22:32:18,682 INFO org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor: Scanned 15 directive(s) and 7 block(s) in 0 millisecond(s).
2018-05-29 22:32:19,469 INFO org.apache.hadoop.hdfs.server.namenode.CacheManager: removeDirective of 17 successful.
2018-05-29 22:32:19,471 INFO org.apache.hadoop.hdfs.server.namenode.CacheManager: removeDirective of 19 successful.
2018-05-29 22:32:19,472 INFO org.apache.hadoop.hdfs.server.namenode.CacheManager: removeDirective of 18 successful.
2018-05-29 22:32:19,473 INFO org.apache.hadoop.hdfs.server.namenode.CacheManager: removeDirective of 23 successful.
2018-05-29 22:32:19,476 INFO org.apache.hadoop.hdfs.server.namenode.CacheManager: removeDirective of 22 successful.
2018-05-29 22:32:19,478 INFO org.apache.hadoop.hdfs.server.namenode.CacheManager: removeDirective of 21 successful.
2018-05-29 22:32:19,479 INFO org.apache.hadoop.hdfs.server.namenode.CacheManager: removeDirective of 25 successful.
2018-05-29 22:32:19,480 INFO org.apache.hadoop.hdfs.server.namenode.CacheManager: removeDirective of 24 successful.
{noformat}
The cache directive ID is assigned by an ever-increasing counter, so we can infer that /test-warehouse/cachedb.db/cached_tbl_local/j=1 was assigned id=26 and was never removed. The catalogd log from this period doesn't contain anything interesting, so I added some logging in https://gerrit.cloudera.org/c/10640/ to see whether catalogd called removeDirective() at all.
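
To confirm from the shell which directives are still registered with the namenode, one can query the standard HDFS CLI directly. A minimal sketch in Python (hdfs cacheadmin -listDirectives is the stock HDFS command; the -path/-pool values are taken from the log above):
{code}
import subprocess

# Ask the namenode for any cache directive covering the leftover partition
# directory. A non-empty listing confirms the directive was never removed.
out = subprocess.check_output(
    ["hdfs", "cacheadmin", "-listDirectives",
     "-path", "/test-warehouse/cachedb.db/cached_tbl_local/j=1",
     "-pool", "testPool"])
print(out.decode())
{code}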


was (Author: tianyiwang):
In the case I looked into, the problem is that the cache directive /test-warehouse/cachedb.db/cached_tbl_local/j=1 never got removed.

> test_caching_ddl failed with unexpected get_num_cache_requests
> --------------------------------------------------------------
>
>                 Key: IMPALA-3040
>                 URL: https://issues.apache.org/jira/browse/IMPALA-3040
>             Project: IMPALA
>          Issue Type: Bug
>          Components: Catalog
>    Affects Versions: Impala 2.5.0
>            Reporter: Tim Armstrong
>            Assignee: Tianyi Wang
>            Priority: Major
>              Labels: flaky
>
> This test failed on the integration job.
> http://sandbox.jenkins.cloudera.com/view/Impala/view/Builds%20-%202.5.0%20release/job/impala-cdh5.7.0-integration/3/testReport/junit/query_test.test_hdfs_caching/TestHdfsCachingDdl/test_caching_ddl_exec_option____disable_codegen___False___abort_on_error___1___exec_single_node_rows_threshold___0___batch_size___0___num_nodes___0____table_format__text_none_/
> {code}
> query_test.test_hdfs_caching.TestHdfsCachingDdl.test_caching_ddl[exec_option: {'disable_codegen': False, 'abort_on_error': 1, 'exec_single_node_rows_threshold': 0, 'batch_size': 0, 'num_nodes': 0} | table_format: text/none] (from pytest)
> Failing for the past 1 build (Since Failed#3 )
> Took 47 sec.
> Error Message
> assert 9 == 10  +  where 10 = get_num_cache_requests()
> Stacktrace
> query_test/test_hdfs_caching.py:132: in test_caching_ddl
>     assert num_entries_pre == get_num_cache_requests()
> E   assert 9 == 10
> E    +  where 10 = get_num_cache_requests()
> Standard Error
> -- connecting to: localhost:21000
> -- executing against localhost:21000
> use default;
> SET sync_ddl=1;
> -- executing against localhost:21000
> drop database if exists `cachedb` cascade;
> -- executing against localhost:21000
> create database cachedb;
> -- executing against localhost:21000
> use functional;
> SET disable_codegen=False;
> SET abort_on_error=1;
> SET exec_single_node_rows_threshold=0;
> SET batch_size=0;
> SET num_nodes=0;
> -- executing against localhost:21000
> use cachedb;
> -- executing against localhost:21000
> create table cached_tbl_nopart (i int) cached in 'testPool';
> -- executing against localhost:21000
> insert into cached_tbl_nopart select 1;
> -- executing against localhost:21000
> select * from cached_tbl_nopart;
> -- executing against localhost:21000
> create table like_cached_tbl_nopart like cached_tbl_nopart;
> -- executing against localhost:21000
> show table stats like_cached_tbl_nopart;
> -- executing against localhost:21000
> show table stats cached_tbl_nopart;
> -- executing against localhost:21000
> alter table cached_tbl_nopart set uncached;
> -- executing against localhost:21000
> show table stats cached_tbl_nopart;
> -- executing against localhost:21000
> drop table if exists cached_tbl_part;
> -- executing against localhost:21000
> create table cached_tbl_part (i int) partitioned by (j int) cached in 'testPool' with replication = 9;
> -- executing against localhost:21000
> alter table cached_tbl_part add partition (j=0);
> -- executing against localhost:21000
> alter table cached_tbl_part add partition (j=1) uncached;
> -- executing against localhost:21000
> alter table cached_tbl_part add partition (j=2) cached in 'testPool';
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> drop table if exists cached_tbl_part;
> -- executing against localhost:21000
> create table cached_tbl_part (i int) partitioned by (j int) cached in 'testPool';
> -- executing against localhost:21000
> alter table cached_tbl_part add partition (j=0);
> -- executing against localhost:21000
> alter table cached_tbl_part add partition (j=1) uncached;
> -- executing against localhost:21000
> alter table cached_tbl_part add partition (j=2) cached in 'testPool';
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part partition (j=2) set uncached;
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part partition (j=2) set uncached;
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part partition (j=1) set cached in 'testPool';
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> insert into cached_tbl_part partition(j) values(3, 3), (4, 4);
> -- executing against localhost:21000
> alter table cached_tbl_part partition (j=3) set cached in 'testPool' with replication = 4;
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part set uncached;
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part set uncached;
> -- executing against localhost:21000
> alter table cached_tbl_part set cached in 'testPool';
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part set cached in 'testPool';
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part partition (j=3) set cached in 'testPool' with replication = 4;
> -- executing against localhost:21000
> alter table cached_tbl_part partition (j=1) set uncached;
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part set cached in 'testPool' with replication = 8;
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part add partition(j=5) cached in 'testPool' with replication = 3;
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part partition(j=2) set cached in 'testPool' with replication = 3;
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part add partition(j=6) cached in 'testPool';
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> alter table cached_tbl_part add partition(j=7)
> location 'file:///data/2/jenkins/workspace/impala-cdh5.7.0-integration/repos/Impala/testdata/data/cached_tbl_part';
> -- executing against localhost:21000
> alter table cached_tbl_part add partition(j=7)
> location 'file:///data/2/jenkins/workspace/impala-cdh5.7.0-integration/repos/Impala/testdata/data/cache_tbl_part' uncached;
> -- executing against localhost:21000
> show partitions cached_tbl_part;
> -- executing against localhost:21000
> create table uncached_tbl_local(i int)
> location 'file:///data/2/jenkins/workspace/impala-cdh5.7.0-integration/repos/Impala/testdata/data/uncached_tbl_local';
> -- executing against localhost:21000
> alter table uncached_tbl_local set cached in 'testPool';
> -- executing against localhost:21000
> create table cached_tbl_local(i int) partitioned by (j int);
> -- executing against localhost:21000
> alter table cached_tbl_local add partition(j=0)
> location 'file:///data/2/jenkins/workspace/impala-cdh5.7.0-integration/repos/Impala/testdata/data/cached_local_part';
> -- executing against localhost:21000
> alter table cached_tbl_local partition(j=0) set cached in 'testPool';
> -- executing against localhost:21000
> alter table cached_tbl_local set cached in 'testPool';
> -- executing against localhost:21000
> alter table cached_tbl_local add partition(j=1) cached in 'testPool';
> -- executing against localhost:21000
> alter table cached_tbl_local set cached in 'testPool';
> -- executing against localhost:21000
> show partitions cached_tbl_local;
> -- executing against localhost:21000
> drop table cachedb.cached_tbl_part;
> -- executing against localhost:21000
> drop table cachedb.cached_tbl_nopart;
> -- executing against localhost:21000
> drop table cachedb.cached_tbl_local;
> {code}
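
For context on the failing assertion: the test takes a directive count before running and asserts it matches the count after the final drops, so one leftover directive shows up as exactly the 9 == 10 mismatch above. A rough sketch of what get_num_cache_requests() presumably boils down to (the output parsing here is an assumption, not the actual helper in test_hdfs_caching.py):
{code}
import subprocess

def get_num_cache_requests():
    # Count the cache directives the namenode currently reports.
    # This relies on the "Found N entries" summary line printed by
    # 'hdfs cacheadmin -listDirectives -stats'; the real test helper
    # may parse the output differently.
    out = subprocess.check_output(
        ["hdfs", "cacheadmin", "-listDirectives", "-stats"]).decode()
    for line in out.splitlines():
        if line.startswith("Found"):
            return int(line.split()[1])
    return 0

num_entries_pre = get_num_cache_requests()
# ... run the DDL above, ending with the drop of cachedb.cached_tbl_local ...
assert num_entries_pre == get_num_cache_requests()
{code}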


