You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by "Taewoo Kim (JIRA)" <ji...@apache.org> on 2018/11/27 19:36:00 UTC

[jira] [Closed] (ASTERIXDB-2487) Cluster becomes UNUSABLE with "java.lang.IllegalStateException: Couldn't find any checkpoints for resource"

     [ https://issues.apache.org/jira/browse/ASTERIXDB-2487?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Taewoo Kim closed ASTERIXDB-2487.
---------------------------------
    Resolution: Invalid

> Cluster becomes UNUSABLE with "java.lang.IllegalStateException: Couldn't find any checkpoints for resource"
> ------------------------------------------------------------------------------------------------------------
>
>                 Key: ASTERIXDB-2487
>                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-2487
>             Project: Apache AsterixDB
>          Issue Type: Bug
>            Reporter: Taewoo Kim
>            Priority: Major
>         Attachments: nc-1.log
>
>
> The Cloudberry cluster became UNUSABLE after nc-1 (one of five NCs) generated the following exception.
>  
> {code:java}
> 21:32:10.659 [Executor-10173:1] WARN org.apache.asterix.app.nc.IndexCheckpointManager - Couldn't find any checkpoint file for index io1/storage/partition_0/twitter/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382/0/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382. Content of dir are null.
> 21:32:10.659 [Executor-10172:1] WARN org.apache.asterix.app.nc.IndexCheckpointManager - Couldn't find any checkpoint file for index io2/storage/partition_1/twitter/ds_tweet_9460370bb0ca1c98a779b1bcc6861c2c/0/ds_tweet_9460370bb0ca1c98a779b1bcc6861c2c. Content of dir are null.
> 21:32:10.660 [Executor-10173:1] ERROR org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness - FLUSH operation.afterFinalize failed on {"class" : "LSMBTree", "dir" : "/home/waans11/asterixdb/io1/storage/partition_0/twitter/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382/0/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382", "memory" : [{"class":"LSMBTreeMemoryComponent", "state":"READABLE_UNWRITABLE_FLUSHING", "writers":0, "readers":1, "pendingFlushes":0, "id":"[9,9]"}, {"class":"LSMBTreeMemoryComponent", "state":"INACTIVE", "writers":0, "readers":0, "pendingFlushes":0, "id":"[8,8]"}], "disk" : 3, "num-scheduled-flushes":1, "current-memory-component":1}
> 21:32:10.660 [Executor-10173:1] ERROR org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness - FLUSH operation.afterFinalize failed on {"class" : "LSMBTree", "dir" : "/home/waans11/asterixdb/io1/storage/partition_0/twitter/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382/0/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382", "memory" : [{"class":"LSMBTreeMemoryComponent", "state":"READABLE_UNWRITABLE_FLUSHING", "writers":0, "readers":1, "pendingFlushes":0, "id":"[9,9]"}, {"class":"LSMBTreeMemoryComponent", "state":"INACTIVE", "writers":0, "readers":0, "pendingFlushes":0, "id":"[8,8]"}], "disk" : 3, "num-scheduled-flushes":1, "current-memory-component":1}
> java.lang.IllegalStateException: Couldn't find any checkpoints for resource: io1/storage/partition_0/twitter/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382/0/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382
> at org.apache.asterix.app.nc.IndexCheckpointManager.getLatest(IndexCheckpointManager.java:145) ~[asterix-app-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.app.nc.IndexCheckpointManager.flushed(IndexCheckpointManager.java:86) ~[asterix-app-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.addComponentToCheckpoint(LSMIOOperationCallback.java:136) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.afterFinalize(LSMIOOperationCallback.java:123) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.doIo(LSMHarness.java:544) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.flush(LSMHarness.java:513) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMTreeIndexAccessor.flush(LSMTreeIndexAccessor.java:122) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.FlushOperation.call(FlushOperation.java:38) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.FlushOperation.call(FlushOperation.java:29) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_161]
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_161]
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_161]
> at java.lang.Thread.run(Thread.java:748) [?:1.8.0_161]
> 21:32:10.663 [Executor-10172:1] ERROR org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness - FLUSH operation.afterFinalize failed on {"class" : "LSMBTree", "dir" : "/home/waans11/asterixdb/io2/storage/partition_1/twitter/ds_tweet_9460370bb0ca1c98a779b1bcc6861c2c/0/ds_tweet_9460370bb0ca1c98a779b1bcc6861c2c", "memory" : [{"class":"LSMBTreeMemoryComponent", "state":"READABLE_UNWRITABLE_FLUSHING", "writers":0, "readers":1, "pendingFlushes":0, "id":"[24,24]"}, {"class":"LSMBTreeMemoryComponent", "state":"INACTIVE", "writers":0, "readers":0, "pendingFlushes":0, "id":"[23,23]"}], "disk" : 4, "num-scheduled-flushes":1, "current-memory-component":1}
> java.lang.IllegalStateException: Couldn't find any checkpoints for resource: io2/storage/partition_1/twitter/ds_tweet_9460370bb0ca1c98a779b1bcc6861c2c/0/ds_tweet_9460370bb0ca1c98a779b1bcc6861c2c
> at org.apache.asterix.app.nc.IndexCheckpointManager.getLatest(IndexCheckpointManager.java:145) ~[asterix-app-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.app.nc.IndexCheckpointManager.flushed(IndexCheckpointManager.java:86) ~[asterix-app-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.addComponentToCheckpoint(LSMIOOperationCallback.java:136) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.afterFinalize(LSMIOOperationCallback.java:123) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.addComponentToCheckpoint(LSMIOOperationCallback.java:136) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.afterFinalize(LSMIOOperationCallback.java:123) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.doIo(LSMHarness.java:544) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.flush(LSMHarness.java:513) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMTreeIndexAccessor.flush(LSMTreeIndexAccessor.java:122) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.FlushOperation.call(FlushOperation.java:38) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.FlushOperation.call(FlushOperation.java:29) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_161]
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_161]
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_161]
> at java.lang.Thread.run(Thread.java:748) [?:1.8.0_161]
> 21:32:10.690 [Executor-10172:1] ERROR org.apache.asterix.app.nc.HaltCallback - Operation org.apache.hyracks.storage.am.lsm.btree.impls.LSMBTreeFlushOperation@b305d582 has failed
> java.lang.IllegalStateException: Couldn't find any checkpoints for resource: io2/storage/partition_1/twitter/ds_tweet_9460370bb0ca1c98a779b1bcc6861c2c/0/ds_tweet_9460370bb0ca1c98a779b1bcc6861c2c
> at org.apache.asterix.app.nc.IndexCheckpointManager.getLatest(IndexCheckpointManager.java:145) ~[asterix-app-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.app.nc.IndexCheckpointManager.flushed(IndexCheckpointManager.java:86) ~[asterix-app-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.addComponentToCheckpoint(LSMIOOperationCallback.java:136) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.afterFinalize(LSMIOOperationCallback.java:123) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.doIo(LSMHarness.java:544) ~[hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.flush(LSMHarness.java:513) ~[hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMTreeIndexAccessor.flush(LSMTreeIndexAccessor.java:122) ~[hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.FlushOperation.call(FlushOperation.java:38) ~[hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.FlushOperation.call(FlushOperation.java:29) ~[hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_161]
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_161]
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_161]
> at java.lang.Thread.run(Thread.java:748) [?:1.8.0_161]{code}
> Added: Based on [~mhubail]'s advice, I checked the directory and found that the checkpoint files do exist there.
>  
> {code:java}
> [waans11@americium ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382]$ ll -al
> total 43684
> drwxr-xr-x 2 waans11 waans11 151 Nov 26 21:32 .
> drwxr-xr-x 3 waans11 waans11 55 Nov 24 22:36 ..
> -rw-r--r-- 1 waans11 waans11 43387480 Nov 26 08:22 0_5_b
> -rw-r--r-- 1 waans11 waans11 262160 Nov 26 08:22 0_5_f
> -rw-r--r-- 1 waans11 waans11 262160 Nov 26 14:28 6_6_b
> -rw-r--r-- 1 waans11 waans11 262160 Nov 26 14:28 6_6_f
> -rw-r--r-- 1 waans11 waans11 262160 Nov 26 19:40 7_7_b
> -rw-r--r-- 1 waans11 waans11 262160 Nov 26 19:40 7_7_f
> -rw-r--r-- 1 waans11 waans11 107 Nov 26 14:28 .idx_checkpoint_7
> -rw-r--r-- 1 waans11 waans11 107 Nov 26 19:40 .idx_checkpoint_8
> -rw-r--r-- 1 waans11 waans11 2953 Nov 24 22:36 .metadata{code}
> Added: The log file now says "Too many open files".
> {code:java}
> 11:51:39.478 [Executor-2982:1] WARN org.apache.asterix.app.nc.IndexCheckpointManager - Couldn't find any checkpoint file for index io2/storage/partition_1/twitter/ds_tweet_f91463b65f69e395e352ab7701a2a5fc/0/ds_tweet_f91463b65f69e395e352ab7701a2a5fc. Content of dir are null.
> 11:51:39.478 [Executor-2982:1] ERROR org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness - FLUSH operation.afterFinalize failed on {"class" : "LSMBTree", "dir" : "/home/waans11/asterixdb/io2/storage/partition_1/twitter/ds_tweet_f91463b65f69e395e352ab7701a2a5fc/0/ds_tweet_f91463b65f69e395e352ab7701a2a5fc", "memory" : [{"class":"LSMBTreeMemoryComponent", "state":"READABLE_UNWRITABLE_FLUSHING", "writers":0, "readers":1, "pendingFlushes":0, "id":"[5,5]"}, {"class":"LSMBTreeMemoryComponent", "state":"INACTIVE", "writers":0, "readers":0, "pendingFlushes":0, "id":"null"}], "disk" : 4, "num-scheduled-flushes":1, "current-memory-component":1}
> java.lang.IllegalStateException: Couldn't find any checkpoints for resource: io2/storage/partition_1/twitter/ds_tweet_f91463b65f69e395e352ab7701a2a5fc/0/ds_tweet_f91463b65f69e395e352ab7701a2a5fc
> at org.apache.asterix.app.nc.IndexCheckpointManager.getLatest(IndexCheckpointManager.java:145) ~[asterix-app-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.app.nc.IndexCheckpointManager.flushed(IndexCheckpointManager.java:86) ~[asterix-app-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.addComponentToCheckpoint(LSMIOOperationCallback.java:136) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.asterix.common.ioopcallbacks.LSMIOOperationCallback.afterFinalize(LSMIOOperationCallback.java:123) ~[asterix-common-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.doIo(LSMHarness.java:544) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMHarness.flush(LSMHarness.java:513) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.LSMTreeIndexAccessor.flush(LSMTreeIndexAccessor.java:122) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.FlushOperation.call(FlushOperation.java:38) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at org.apache.hyracks.storage.am.lsm.common.impls.FlushOperation.call(FlushOperation.java:29) [hyracks-storage-am-lsm-common-0.3.5-SNAPSHOT.jar:0.3.5-SNAPSHOT]
> at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_161]
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_161]
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_161]
> at java.lang.Thread.run(Thread.java:748) [?:1.8.0_161]
> 11:51:39.729 [Executor-2983:1] WARN org.apache.asterix.app.nc.IndexCheckpointManager - Couldn't read index checkpoint file: io2/storage/partition_1/twitter/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382/0/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382/.idx_checkpoint_10
> java.nio.file.FileSystemException: io2/storage/partition_1/twitter/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382/0/ds_tweet_e9ad9c2394f7dc7b6a69fb43e52a7382/.idx_checkpoint_10: Too many open files
> at sun.nio.fs.UnixException.translateToIOException(UnixException.java:91) ~[?:1.8.0_161]
> at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:102) ~[?:1.8.0_161]
> at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:107) ~[?:1.8.0_161]
> at sun.nio.fs.UnixFileSystemProvider.newByteChannel(UnixFileSystemProvider.java:214) ~[?:1.8.0_161]
> at java.nio.file.Files.newByteChannel(Files.java:361) ~[?:1.8.0_161]
> at java.nio.file.Files.newByteChannel(Files.java:407) ~[?:1.8.0_161]
> at java.nio.file.Files.readAllBytes(Files.java:3152) ~[?:1.8.0_161]
> at org.apache.asterix.app.nc.IndexCheckpointManager.read(IndexCheckpointManager.java:215) ~[asterix-app-0.9.5-SNAPSHOT.jar:0.9.5-SNAPSHOT]{code}
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)