You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@ignite.apache.org by Muhammed Favas <fa...@expeedsoftware.com> on 2019/09/12 10:23:43 UTC

Slowness in initial data load using COPY command

HI,

I was trying to load data using csv file (each file contains 5 million rows of record which is approx.. 4 GB of data) using COPY command.
In initial stage of loading it was quit fast, but later the load process start slowing down and showing hardly 1% of CPU usage. My cluster has 5 nodes, each with 8 core CPU and 32 GB RAM.
When I checked one node's log, I have seen some sever messages which is like what I have give below.

Can some one help me to understand the error details give below and how can I improve my data load speed.

[08:51:00,910][INFO][wal-file-archiver%null-#54][FileWriteAheadLogManager] Copied file [src=/data/apache-ignite-2.7.5-bin/work/db/wal/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000000006.wal, dst=/data/apache-ignite-2.7.5-bin/work/db/wal/archive/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000003066.wal]
[08:51:00,910][INFO][wal-file-archiver%null-#54][FileWriteAheadLogManager] Starting to copy WAL segment [absIdx=3067, segIdx=7, origFile=/data/apache-ignite-2.7.5-bin/work/db/wal/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000000007.wal, dstFile=/data/apache-ignite-2.7.5-bin/work/db/wal/archive/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000003067.wal]
[08:51:08,512][SEVERE][tcp-disco-msg-worker-#2][G] Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [threadName=data-streamer-stripe-2, blockedFor=31s]
[08:51:08,512][WARNING][tcp-disco-msg-worker-#2][G] Thread [name="data-streamer-stripe-2-#11", id=24, state=WAITING, blockCnt=0, waitCnt=81018]
    Lock [object=java.util.concurrent.locks.ReentrantLock$NonfairSync@23aede76, ownerName=data-streamer-stripe-4-#13, ownerId=26]

[08:51:08,512][SEVERE][tcp-disco-msg-worker-#2][] Critical system error detected. Will be handled accordingly to configured handler [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0, super=AbstractFailureHandler [ignoredFailureTypes=[SYSTEM_WORKER_BLOCKED, SYSTEM_CRITICAL_OPERATION_TIMEOUT]]], failureCtx=FailureContext [type=SYSTEM_WORKER_BLOCKED, err=class o.a.i.IgniteException: GridWorker [name=data-streamer-stripe-2, igniteInstanceName=null, finished=false, heartbeatTs=1568278237174]]]
class org.apache.ignite.IgniteException: GridWorker [name=data-streamer-stripe-2, igniteInstanceName=null, finished=false, heartbeatTs=1568278237174]
        at org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance$2.apply(IgnitionEx.java:1831)
        at org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance$2.apply(IgnitionEx.java:1826)
        at org.apache.ignite.internal.worker.WorkersRegistry.onIdle(WorkersRegistry.java:233)
        at org.apache.ignite.internal.util.worker.GridWorker.onIdle(GridWorker.java:297)
        at org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.lambda$new$0(ServerImpl.java:2663)
        at org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorker.body(ServerImpl.java:7181)
        at org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.body(ServerImpl.java:2700)
        at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:120)
        at org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorkerThread.body(ServerImpl.java:7119)
        at org.apache.ignite.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)
[08:51:08,513][WARNING][tcp-disco-msg-worker-#2][FailureProcessor] No deadlocked threads detected.
[08:51:08,853][WARNING][tcp-disco-msg-worker-#2][FailureProcessor] Thread dump at 2019/09/12 08:51:08 UTC
Thread [name="sys-#1257", id=1683, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="sys-#1256", id=1682, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="sys-#1255", id=1681, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="sys-#1254", id=1680, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="query-#1253", id=1679, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5de43f6a, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)


Regards,
Favas


Re: Slowness in initial data load using COPY command

Posted by Andrei Aleksandrov <ae...@gmail.com>.
Hi,

You can try to investigate your cluster state first:

1)First of all try to take a look at your logs and find there long JVM 
pauses. Also, you can collect the GC log and see the memory usage. It 
possible that the provided memory for your node isn't enough because it 
was used for something else.

In case of big memory usage you can take a look in heap dump. It will 
show what takes your memory.

Also in case of big pauses you can investigate the GC JVM configuration. 
Check that use at least G1 and set the required GC pause.

2)In case if you store your WAL into some network storage then please 
check that there were no connectivity issues that possible blocked the 
WAL copying.

3)Check that there are no connectivity issues between different nodes. 
You can see messages about NODE_FAILED in your server node. If you see 
it than it should be investigated.

4)Finally think about disabling of the WAL during initial data loading 
and using IgniteDataStreamer via some java thick client.

If nothing helps then try to provide logs.

According how CSV data can be loaded:

1)Using spark integration

https://www.gridgain.com/resources/blog/apacher-ignitetm-and-apacher-sparktm-integration-using-ignite-rdds

2)Using java code (some logic to read CSV) + IgniteDataStreamer

https://ignite.apache.org/releases/latest/javadoc/org/apache/ignite/IgniteDataStreamer.html

3)Using kafka integration

https://apacheignite-mix.readme.io/docs/kafka-streamer

BR,
Andrei

9/12/2019 3:20 PM, Muhammed Favas пишет:
>
> Thanks Oleg,
>
> I need to have persistence enabled for my system.
>
> I believe due to the locking/wait issue, the data load is slowing down.
>
> Can anyone suggest a solution on how to avoid this situation and make 
> data load faster?
>
> Note: I am using COPY command to lad data into table from csv.
>
> *Regards,*
>
> *Favas ***
>
> *From:* Oleg Popov <o....@livelace.ru>
> *Sent:* Thursday, September 12, 2019 4:09 PM
> *To:* user <us...@ignite.apache.org>
> *Subject:* Re: Slowness in initial data load using COPY command
>
> Hello. The same thing I see when I move data from MariaDB to Apache 
> Ignite (PutAll).
>
> I have tens of databases, but such things happens for a 100k rows 
> database and for 400k+ rows databases as well.
>
> I tried to:
>
> 1. Increase threads
>
> 2. Disable WaL/persistance.
>
> 3. Put WaL/data on a SSD.
>
> Nothing was helped. And now I have to completely disable failureHandler:
>
> <*property **name**="failureHandler"*><*bean **class**="org.apache.ignite.failure.NoOpFailureHandler"*/></*property*>
>
> ------------------------------------------------------------------------
>
> *From: *"Muhammed Favas" <favas.muhammed@expeedsoftware.com 
> <ma...@expeedsoftware.com>>
> *To: *"user" <user@ignite.apache.org <ma...@ignite.apache.org>>
> *Sent: *Thursday, September 12, 2019 1:23:43 PM
> *Subject: *Slowness in initial data load using COPY command
>
> HI,
>
> I was trying to load data using csv file (each file contains 5 million 
> rows of record which is approx.. 4 GB of data) using COPY command.
>
> In initial stage of loading it was quit fast, but later the load 
> process start slowing down and showing hardly 1% of CPU usage. My 
> cluster has 5 nodes, each with 8 core CPU and 32 GB RAM.
>
> When I checked one node’s log, I have seen some sever messages which 
> is like what I have give below.
>
> Can some one help me to understand the error details give below and 
> how can I improve my data load speed.
>
> [08:51:00,910][INFO][wal-file-archiver%null-#54][FileWriteAheadLogManager] 
> Copied file 
> [src=/data/apache-ignite-2.7.5-bin/work/db/wal/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000000006.wal, 
> dst=/data/apache-ignite-2.7.5-bin/work/db/wal/archive/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000003066.wal]
>
> [08:51:00,910][INFO][wal-file-archiver%null-#54][FileWriteAheadLogManager] 
> Starting to copy WAL segment [absIdx=3067, segIdx=7, 
> origFile=/data/apache-ignite-2.7.5-bin/work/db/wal/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000000007.wal, 
> dstFile=/data/apache-ignite-2.7.5-bin/work/db/wal/archive/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000003067.wal]
>
> [08:51:08,512][SEVERE][tcp-disco-msg-worker-#2][G] Blocked 
> system-critical thread has been detected. This can lead to 
> cluster-wide undefined behaviour [threadName=data-streamer-stripe-2, 
> blockedFor=31s]
>
> [08:51:08,512][WARNING][tcp-disco-msg-worker-#2][G] Thread 
> [name="data-streamer-stripe-2-#11", id=24, state=WAITING, blockCnt=0, 
> waitCnt=81018]
>
> Lock 
> [object=java.util.concurrent.locks.ReentrantLock$NonfairSync@23aede76, 
> ownerName=data-streamer-stripe-4-#13, ownerId=26]
>
> [08:51:08,512][SEVERE][tcp-disco-msg-worker-#2][] Critical system 
> error detected. Will be handled accordingly to configured handler 
> [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0, 
> super=AbstractFailureHandler 
> [ignoredFailureTypes=[SYSTEM_WORKER_BLOCKED, 
> SYSTEM_CRITICAL_OPERATION_TIMEOUT]]], failureCtx=FailureContext 
> [type=SYSTEM_WORKER_BLOCKED, err=class o.a.i.IgniteException: 
> GridWorker [name=data-streamer-stripe-2, igniteInstanceName=null, 
> finished=false, heartbeatTs=1568278237174]]]
>
> class org.apache.ignite.IgniteException: GridWorker 
> [name=data-streamer-stripe-2, igniteInstanceName=null, finished=false, 
> heartbeatTs=1568278237174]
>
>         at 
> org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance$2.apply(IgnitionEx.java:1831)
>
>         at 
> org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance$2.apply(IgnitionEx.java:1826)
>
>         at 
> org.apache.ignite.internal.worker.WorkersRegistry.onIdle(WorkersRegistry.java:233)
>
>         at 
> org.apache.ignite.internal.util.worker.GridWorker.onIdle(GridWorker.java:297)
>
>         at 
> org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.lambda$new$0(ServerImpl.java:2663)
>
>         at 
> org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorker.body(ServerImpl.java:7181)
>
>         at 
> org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.body(ServerImpl.java:2700)
>
>         at 
> org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:120)
>
>         at 
> org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorkerThread.body(ServerImpl.java:7119)
>
>         at 
> org.apache.ignite.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)
>
> [08:51:08,513][WARNING][tcp-disco-msg-worker-#2][FailureProcessor] No 
> deadlocked threads detected.
>
> [08:51:08,853][WARNING][tcp-disco-msg-worker-#2][FailureProcessor] 
> Thread dump at 2019/09/12 08:51:08 UTC
>
> Thread [name="sys-#1257", id=1683, state=TIMED_WAITING, blockCnt=0, 
> waitCnt=1]
>
> Lock 
> [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, 
> ownerName=null, ownerId=-1]
>
>         at sun.misc.Unsafe.park(Native Method)
>
>         at 
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>
>         at 
> java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>
>         at java.lang.Thread.run(Thread.java:748)
>
> Thread [name="sys-#1256", id=1682, state=TIMED_WAITING, blockCnt=0, 
> waitCnt=1]
>
> Lock 
> [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, 
> ownerName=null, ownerId=-1]
>
>         at sun.misc.Unsafe.park(Native Method)
>
>         at 
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>
>         at 
> java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>
>         at java.lang.Thread.run(Thread.java:748)
>
> Thread [name="sys-#1255", id=1681, state=TIMED_WAITING, blockCnt=0, 
> waitCnt=1]
>
> Lock 
> [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, 
> ownerName=null, ownerId=-1]
>
>         at sun.misc.Unsafe.park(Native Method)
>
>         at 
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>
>         at 
> java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>
>         at java.lang.Thread.run(Thread.java:748)
>
> Thread [name="sys-#1254", id=1680, state=TIMED_WAITING, blockCnt=0, 
> waitCnt=1]
>
> Lock 
> [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, 
> ownerName=null, ownerId=-1]
>
>         at sun.misc.Unsafe.park(Native Method)
>
>         at 
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>
>         at 
> java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>
>         at java.lang.Thread.run(Thread.java:748)
>
> Thread [name="query-#1253", id=1679, state=TIMED_WAITING, blockCnt=0, 
> waitCnt=1]
>
> Lock 
> [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5de43f6a, 
> ownerName=null, ownerId=-1]
>
>         at sun.misc.Unsafe.park(Native Method)
>
>         at 
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
>
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
>
>         at 
> java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
>
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>
>         at java.lang.Thread.run(Thread.java:748)
>
> *Regards,*
>
> *Favas *
>

RE: Slowness in initial data load using COPY command

Posted by Muhammed Favas <fa...@expeedsoftware.com>.
Thanks Oleg,

I need to have persistence enabled for my system.
I believe due to the locking/wait issue, the data load is slowing down.

Can anyone suggest a solution on how to avoid this situation and make data load faster?

Note: I am using COPY command to lad data into table from csv.

Regards,
Favas

From: Oleg Popov <o....@livelace.ru>
Sent: Thursday, September 12, 2019 4:09 PM
To: user <us...@ignite.apache.org>
Subject: Re: Slowness in initial data load using COPY command

Hello. The same thing I see when I move data from MariaDB to Apache Ignite (PutAll).
I have tens of databases, but such things happens for a 100k rows database and for 400k+ rows databases as well.
I tried to:

1. Increase threads
2. Disable WaL/persistance.
3. Put WaL/data on a SSD.

Nothing was helped. And now I have to completely disable failureHandler:


<property name="failureHandler">
    <bean class="org.apache.ignite.failure.NoOpFailureHandler"/>
</property>



________________________________
From: "Muhammed Favas" <fa...@expeedsoftware.com>>
To: "user" <us...@ignite.apache.org>>
Sent: Thursday, September 12, 2019 1:23:43 PM
Subject: Slowness in initial data load using COPY command

HI,

I was trying to load data using csv file (each file contains 5 million rows of record which is approx.. 4 GB of data) using COPY command.
In initial stage of loading it was quit fast, but later the load process start slowing down and showing hardly 1% of CPU usage. My cluster has 5 nodes, each with 8 core CPU and 32 GB RAM.
When I checked one node’s log, I have seen some sever messages which is like what I have give below.

Can some one help me to understand the error details give below and how can I improve my data load speed.

[08:51:00,910][INFO][wal-file-archiver%null-#54][FileWriteAheadLogManager] Copied file [src=/data/apache-ignite-2.7.5-bin/work/db/wal/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000000006.wal, dst=/data/apache-ignite-2.7.5-bin/work/db/wal/archive/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000003066.wal]
[08:51:00,910][INFO][wal-file-archiver%null-#54][FileWriteAheadLogManager] Starting to copy WAL segment [absIdx=3067, segIdx=7, origFile=/data/apache-ignite-2.7.5-bin/work/db/wal/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000000007.wal, dstFile=/data/apache-ignite-2.7.5-bin/work/db/wal/archive/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000003067.wal]
[08:51:08,512][SEVERE][tcp-disco-msg-worker-#2][G] Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [threadName=data-streamer-stripe-2, blockedFor=31s]
[08:51:08,512][WARNING][tcp-disco-msg-worker-#2][G] Thread [name="data-streamer-stripe-2-#11", id=24, state=WAITING, blockCnt=0, waitCnt=81018]
    Lock [object=java.util.concurrent.locks.ReentrantLock$NonfairSync@23aede76, ownerName=data-streamer-stripe-4-#13, ownerId=26]

[08:51:08,512][SEVERE][tcp-disco-msg-worker-#2][] Critical system error detected. Will be handled accordingly to configured handler [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0, super=AbstractFailureHandler [ignoredFailureTypes=[SYSTEM_WORKER_BLOCKED, SYSTEM_CRITICAL_OPERATION_TIMEOUT]]], failureCtx=FailureContext [type=SYSTEM_WORKER_BLOCKED, err=class o.a.i.IgniteException: GridWorker [name=data-streamer-stripe-2, igniteInstanceName=null, finished=false, heartbeatTs=1568278237174]]]
class org.apache.ignite.IgniteException: GridWorker [name=data-streamer-stripe-2, igniteInstanceName=null, finished=false, heartbeatTs=1568278237174]
        at org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance$2.apply(IgnitionEx.java:1831)
        at org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance$2.apply(IgnitionEx.java:1826)
        at org.apache.ignite.internal.worker.WorkersRegistry.onIdle(WorkersRegistry.java:233)
        at org.apache.ignite.internal.util.worker.GridWorker.onIdle(GridWorker.java:297)
        at org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.lambda$new$0(ServerImpl.java:2663)
        at org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorker.body(ServerImpl.java:7181)
        at org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.body(ServerImpl.java:2700)
        at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:120)
        at org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorkerThread.body(ServerImpl.java:7119)
        at org.apache.ignite.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)
[08:51:08,513][WARNING][tcp-disco-msg-worker-#2][FailureProcessor] No deadlocked threads detected.
[08:51:08,853][WARNING][tcp-disco-msg-worker-#2][FailureProcessor] Thread dump at 2019/09/12 08:51:08 UTC
Thread [name="sys-#1257", id=1683, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="sys-#1256", id=1682, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="sys-#1255", id=1681, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="sys-#1254", id=1680, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

Thread [name="query-#1253", id=1679, state=TIMED_WAITING, blockCnt=0, waitCnt=1]
    Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5de43f6a, ownerName=null, ownerId=-1]
        at sun.misc.Unsafe.park(Native Method)
        at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467)
        at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)


Regards,
Favas



Re: Slowness in initial data load using COPY command

Posted by Oleg Popov <o....@livelace.ru>.
Hello. The same thing I see when I move data from MariaDB to Apache Ignite (PutAll). 
I have tens of databases, but such things happens for a 100k rows database and for 400k+ rows databases as well. 
I tried to: 

1. Increase threads 
2. Disable WaL/persistance. 
3. Put WaL/data on a SSD. 

Nothing was helped. And now I have to completely disable failureHandler: 

< property name ="failureHandler" > 
< bean class ="org.apache.ignite.failure.NoOpFailureHandler" /> 
</ property > 




From: "Muhammed Favas" <fa...@expeedsoftware.com> 
To: "user" <us...@ignite.apache.org> 
Sent: Thursday, September 12, 2019 1:23:43 PM 
Subject: Slowness in initial data load using COPY command 



HI, 



I was trying to load data using csv file (each file contains 5 million rows of record which is approx.. 4 GB of data) using COPY command. 

In initial stage of loading it was quit fast, but later the load process start slowing down and showing hardly 1% of CPU usage. My cluster has 5 nodes, each with 8 core CPU and 32 GB RAM. 

When I checked one node’s log, I have seen some sever messages which is like what I have give below. 



Can some one help me to understand the error details give below and how can I improve my data load speed. 



[08:51:00,910][INFO][wal-file-archiver%null-#54][FileWriteAheadLogManager] Copied file [src=/data/apache-ignite-2.7.5-bin/work/db/wal/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000000006.wal, dst=/data/apache-ignite-2.7.5-bin/work/db/wal/archive/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000003066.wal] 

[08:51:00,910][INFO][wal-file-archiver%null-#54][FileWriteAheadLogManager] Starting to copy WAL segment [absIdx=3067, segIdx=7, origFile=/data/apache-ignite-2.7.5-bin/work/db/wal/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000000007.wal, dstFile=/data/apache-ignite-2.7.5-bin/work/db/wal/archive/node00-18be5852-ed47-40a7-a256-ebbaa3376d39/0000000000003067.wal] 

[08:51:08,512 ][SEVERE][tcp-disco-msg-worker-#2][G] Blocked system-critical thread has been detected. This can lead to cluster-wide undefined behaviour [threadName=data-streamer-stripe-2, blockedFor=31s] 

[08:51:08,512][WARNING][tcp-disco-msg-worker-#2][G] Thread [name="data-streamer-stripe-2-#11", id=24, state=WAITING, blockCnt=0, waitCnt=81018] 

Lock [object=java.util.concurrent.locks.ReentrantLock$NonfairSync@23aede76, ownerName=data-streamer-stripe-4-#13, ownerId=26] 



[08:51:08,512 ][SEVERE][tcp-disco-msg-worker-#2][] Critical system error detected. Will be handled accordingly to configured handler [hnd=StopNodeOrHaltFailureHandler [tryStop=false, timeout=0, super=AbstractFailureHandler [ignoredFailureTypes=[SYSTEM_WORKER_BLOCKED, SYSTEM_CRITICAL_OPERATION_TIMEOUT]]], failureCtx=FailureContext [type=SYSTEM_WORKER_BLOCKED, err=class o.a.i.IgniteException: GridWorker [name=data-streamer-stripe-2, igniteInstanceName=null, finished=false, heartbeatTs=1568278237174]]] 

class org.apache.ignite.IgniteException: GridWorker [name=data-streamer-stripe-2, igniteInstanceName=null, finished=false, heartbeatTs=1568278237174] 

at org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance$2.apply(IgnitionEx.java:1831) 

at org.apache.ignite.internal.IgnitionEx$IgniteNamedInstance$2.apply(IgnitionEx.java:1826) 

at org.apache.ignite.internal.worker.WorkersRegistry.onIdle(WorkersRegistry.java:233) 

at org.apache.ignite.internal.util.worker.GridWorker.onIdle(GridWorker.java:297) 

at org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.lambda$new$0(ServerImpl.java:2663) 

at org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorker.body(ServerImpl.java:7181) 

at org.apache.ignite.spi.discovery.tcp.ServerImpl$RingMessageWorker.body(ServerImpl.java:2700) 

at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:120) 

at org.apache.ignite.spi.discovery.tcp.ServerImpl$MessageWorkerThread.body(ServerImpl.java:7119) 

at org.apache.ignite.spi.IgniteSpiThread.run(IgniteSpiThread.java:62) 

[08:51:08,513][WARNING][tcp-disco-msg-worker-#2][FailureProcessor] No deadlocked threads detected. 

[08:51:08,853][WARNING][tcp-disco-msg-worker-#2][FailureProcessor] Thread dump at 2019/09/12 08:51:08 UTC 

Thread [name="sys-#1257", id=1683, state=TIMED_WAITING, blockCnt=0, waitCnt=1] 

Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1] 

at sun.misc.Unsafe.park(Native Method) 

at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) 

at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) 

at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467) 

at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073) 

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134) 

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 

at java.lang.Thread.run(Thread.java:748) 



Thread [name="sys-#1256", id=1682, state=TIMED_WAITING, blockCnt=0, waitCnt=1] 

Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1] 

at sun.misc.Unsafe.park(Native Method) 

at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) 

at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) 

at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467) 

at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073) 

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134) 

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 

at java.lang.Thread.run(Thread.java:748) 



Thread [name="sys-#1255", id=1681, state=TIMED_WAITING, blockCnt=0, waitCnt=1] 

Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1] 

at sun.misc.Unsafe.park(Native Method) 

at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) 

at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) 

at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467) 

at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073) 

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134) 

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 

at java.lang.Thread.run(Thread.java:748) 



Thread [name="sys-#1254", id=1680, state=TIMED_WAITING, blockCnt=0, waitCnt=1] 

Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5ece7692, ownerName=null, ownerId=-1] 

at sun.misc.Unsafe.park(Native Method) 

at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) 

at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) 

at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467) 

at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073) 

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134) 

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 

at java.lang.Thread.run(Thread.java:748) 



Thread [name="query-#1253", id=1679, state=TIMED_WAITING, blockCnt=0, waitCnt=1] 

Lock [object=java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject@5de43f6a, ownerName=null, ownerId=-1] 

at sun.misc.Unsafe.park(Native Method) 

at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) 

at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) 

at java.util.concurrent.LinkedBlockingQueue.poll(LinkedBlockingQueue.java:467) 

at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1073) 

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1134) 

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 

at java.lang.Thread.run(Thread.java:748) 





Regards, 

Favas