You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@ignite.apache.org by "novacean.alex" <no...@gmail.com> on 2019/11/06 09:20:23 UTC

Data is lost during rebalance

Hello,

I am a new user of Ignite and I'm trying to get a cluster with 3 server
nodes up and running in Kubernetes. Everything works perfectly until one
Ignite node gets restarted. During the rebalance process I noticed that
~20,000 entries are lost. This happens with each restart. If two Ignite
nodes are restarted at the same time, ~40,000 entries are lost by the end
of the rebalance process.

Before the restart:
<http://apache-ignite-users.70518.x6.nabble.com/file/t2660/misskeystest.png> 


After restart and rebalance:
<http://apache-ignite-users.70518.x6.nabble.com/file/t2660/misskeuresults.png> 


After the rebalance process is done @n2 is missing 18705 entries.

This is the config file i am using:

<?xml version="1.0" encoding="UTF-8"?>
    <beans xmlns="http://www.springframework.org/schema/beans"
xmlns:context="http://www.springframework.org/schema/context"
xmlns:util="http://www.springframework.org/schema/util"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.springframework.org/schema/beans        
http://www.springframework.org/schema/beans/spring-beans.xsd        
http://www.springframework.org/schema/util        
http://www.springframework.org/schema/util/spring-util.xsd
http://www.springframework.org/schema/context	       
https://www.springframework.org/schema/context/spring-context.xsd">
        <context:property-placeholder />
        <bean class="org.apache.ignite.configuration.IgniteConfiguration">
            <property name="consistentId"
value="${IGNITE_OVERRIDE_CONSISTENT_ID}"/>
            <property name="systemThreadPoolSize" value="2000"/>
            <property name="publicThreadPoolSize" value="500"/>
            <property name="stripedPoolSize" value="800"/>
            <property name="rebalanceThreadPoolSize" value="16"/>
            <property name="dataStorageConfiguration">
                <bean
class="org.apache.ignite.configuration.DataStorageConfiguration">
                    <property name="pageSize" value="16384"/>
                    <property name="defaultDataRegionConfiguration">
                        <bean
class="org.apache.ignite.configuration.DataRegionConfiguration">
                            <property name="persistenceEnabled"
value="false"/>
                            <property name="maxSize" value="#{12L * 1024L *
1024L * 1024L}"/>
                            <property name="initialSize" value="#{10L *
1024L * 1024L}"/>
                            <property name="pageEvictionMode"
value="RANDOM_2_LRU"/>
                            <property name="metricsEnabled" value="true"/>
                        </bean>
                    </property>
                </bean>
            </property>
            <property name="cacheConfiguration">
              <bean
class="org.apache.ignite.configuration.CacheConfiguration">
                  <property name="copyOnRead" value="false"/>
                  <property name="statisticsEnabled" value="true"/>
                  <property name="name" value="session-cache"/>
                  <property name="onheapCacheEnabled" value="true"/>
                  <property name="rebalanceMode" value="ASYNC"/>
                  <property name="cacheMode" value="REPLICATED"/>
                  <property name="readFromBackup" value="false"/>
                  <property name="rebalanceBatchSize" value="#{80L * 1024L *
1024L}"/>
                  <property name="rebalanceThrottle" value="2500"/>
                  <property name="writeSynchronizationMode"
value="PRIMARY_SYNC"/>
                  <property name="evictionPolicy">
                      <bean
class="org.apache.ignite.cache.eviction.lru.LruEvictionPolicy">
                         <property name="maxSize" value="1000"/>
                      </bean>
                  </property>
              </bean>
            </property>
             <property name="communicationSpi">
                  <bean
class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi">
                      <property name="slowClientQueueLimit" value="999"/>
                      <property name="messageQueueLimit" value="5000"/>
                      <property name="directBuffer" value="true"/>
                      <property name="directSendBuffer" value="true"/>
                        <property name="selectorsCount" value="64"/>
                      <property name="connectionsPerNode" value="24"/>
                      <property name="usePairedConnections" value="true"/>
                      <property name="ackSendThreshold" value="4"/>
                  </bean>
             </property>
            <property name="discoverySpi">
                <bean
class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
                    <property name="ipFinder">
                        <bean
class="org.apache.ignite.spi.discovery.tcp.ipfinder.kubernetes.TcpDiscoveryKubernetesIpFinder">
                            <property name="namespace" value="xxx"/>
                            <property name="serviceName" value="xxx"/>
                            <property name="masterUrl" value="xxx"/>
                        </bean>
                    </property>
                </bean>
            </property>
        </bean>
    </beans>

I haven't found any other issue related to this, so my guess is that it must
be a configuration problem.
Any help would be greatly appreciated.

Thank you,
Alex.



--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/

Re: Data is lost during rebalance

Posted by "novacean.alex" <no...@gmail.com>.
Upon further research it appears that disabling page eviction for the default
data region is not something I should do.
According to the Apache Ignite documentation, if Ignite Native Persistence is
off and page eviction is disabled for the default data region, OutOfMemory
errors will be thrown.
The 'persistenceEnabled' property is set to false in my config, as I am not
able to use it, so I need page eviction enabled in order to avoid
OutOfMemory errors.



--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/

Re: Data is lost during rebalance

Posted by Вячеслав Коптилин <sl...@gmail.com>.
Hi Alex,

I will take a look at log files.

Thanks,
S.

ср, 6 нояб. 2019 г. в 17:51, novacean.alex <no...@gmail.com>:

> This is the test i am performing:
>
> 1. I have an Ignite Cluster of 3 Server Nodes running in Kubernetes. (the
> cluster is created using a StatefulSet)
> 2. Once the cluster is up i use a Deployment to run 5 pods each with a
> Client Node that performs put operations on my cache.
> 3. Once i see in the logs "Page-based evictions started. Consider
> increasing
> 'maxSize' on Data Region configuration: default" on all 3 Server Nodes I
> delete the deployment, thus stopping the put operations.
> 4. Once the deployment is deleted i restart node-2 from the cluster and
> once
> it is up again i wait for it to finish rebalancing.
> 5. The result of the test so far is that entries are missing from node-2
> after the rebalance is complete.
>
> Log Files before the restart:
> node-0.log
> <http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-0.log>
> node-1.log
> <http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-1.log>
> node-2.log
> <http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-2.log>
>
> Log Files after the restart:
>
> node-0.log
> <http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-0.log>
> node-1.log
> <http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-1.log>
> node-2.log
> <http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-2.log>
>
> I want to use the cluster to store web sessions, and I have ~400 requests
> per second of cache read and put operations.
> Would disabling page eviction for the default data region end up causing
> OOM exceptions?
> If not, I will try disabling it as you suggested and perform more tests.
>
> Thanks,
> Alex.
>
>
>
> --
> Sent from: http://apache-ignite-users.70518.x6.nabble.com/
>

Re: Data is lost during rebalance

Posted by "novacean.alex" <no...@gmail.com>.
This is the test i am performing:

1. I have an Ignite Cluster of 3 Server Nodes running in Kubernetes. (the
cluster is created using a StatefulSet)
2. Once the cluster is up i use a Deployment to run 5 pods each with a
Client Node that performs put operations on my cache.
3. Once I see in the logs "Page-based evictions started. Consider increasing
'maxSize' on Data Region configuration: default" on all 3 Server Nodes I
delete the deployment, thus stopping the put operations.
4. Once the deployment is deleted i restart node-2 from the cluster and once
it is up again i wait for it to finish rebalancing.
5. The result of the test so far is that entries are missing from node-2
after the rebalance is complete.

Log Files before the restart:
node-0.log
<http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-0.log>  
node-1.log
<http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-1.log>  
node-2.log
<http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-2.log>  

Log Files after the restart:

node-0.log
<http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-0.log>  
node-1.log
<http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-1.log>  
node-2.log
<http://apache-ignite-users.70518.x6.nabble.com/file/t2660/node-2.log>  

I want to use the cluster to store web sessions, and I have ~400 requests per
second of cache read and put operations.
Would disabling page eviction for the default data region end up causing
OOM exceptions?
If not, I will try disabling it as you suggested and perform more tests.

Thanks,
Alex.



--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/

Re: Data is lost during rebalance

Posted by Вячеслав Коптилин <sl...@gmail.com>.
We are definitely missing something obvious. )))

1. Let's check log files for the following message: "Page-based evictions
started. Consider increasing 'maxSize' on Data Region configuration:"
2. Please try to disable page eviction for the default data region.
3. Could you please describe your test scenario in detail and attach log
files from all nodes.

Thanks,
S.

ср, 6 нояб. 2019 г. в 15:48, novacean.alex <no...@gmail.com>:

> Hello Slava,
>
> Apparently I celebrated too early. The first test I performed after I set
> the 'backups' cache property in the config was indeed a success, but that
> test was performed with a half-full cache. When performing the test with
> the cache full, the results are the same as before: entries are missing
> after the rebalance.
>
> The code snippet you provided returned: REPLICATED. In Visor I also had
> this, which I forgot to share earlier:
> <http://apache-ignite-users.70518.x6.nabble.com/file/t2660/cachemode.png>
>
> Thank you,
> Alex Novacean.
>
>
>
> --
> Sent from: http://apache-ignite-users.70518.x6.nabble.com/
>

Re: Data is lost during rebalance

Posted by "novacean.alex" <no...@gmail.com>.
Hello Slava,

Apparently I celebrated too early. The first test I performed after I set
the 'backups' cache property in the config was indeed a success, but that
test was performed with a half-full cache. When performing the test with the
cache full, the results are the same as before: entries are missing after the
rebalance.

The code snippet you provided returned: REPLICATED. In Visor I also had
this, which I forgot to share earlier:
<http://apache-ignite-users.70518.x6.nabble.com/file/t2660/cachemode.png> 

Thank you,
Alex Novacean.



--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/

Re: Data is lost during rebalance

Posted by Вячеслав Коптилин <sl...@gmail.com>.
Hi Alex,

Oh... I missed the fact that your cache is replicated. In that case, you
don't need to specify the number of backups.
Could you please check that the cache mode is replicated? You can try the
following code snippet:

System.out.println(ignite.cache("session-cache").getConfiguration(CacheConfiguration.class).getCacheMode());

Thanks,

S.


ср, 6 нояб. 2019 г. в 14:53, novacean.alex <no...@gmail.com>:

> Hello Slava,
>
> Thank you very much for the answer. It worked! Now every time 1 ignite node
> gets restarted it re-balances the exact number of keys.
>
> I was aware of the "backups" cache property but I think I misunderstood its
> usage. As my cache is REPLICATED and the documentation says that */"In
> Ignite, replicated caches are implemented in a way similar to partitioned
> caches where every key has a primary copy and is also backed up on all
> other
> nodes in the cluster."/ * I thought that implied that I already have
> the backups.
>
> Thank you again,
> Alex.
>
>
>
> --
> Sent from: http://apache-ignite-users.70518.x6.nabble.com/
>

Re: Data is lost during rebalance

Posted by "novacean.alex" <no...@gmail.com>.
Hello Slava,

Thank you very much for the answer. It worked! Now every time 1 ignite node
gets restarted it re-balances the exact number of keys.

I was aware of the "backups" cache property but I think I misunderstood its
usage. As my cache is REPLICATED and the documentation says that */"In
Ignite, replicated caches are implemented in a way similar to partitioned
caches where every key has a primary copy and is also backed up on all other
nodes in the cluster."/ * I thought that implied that I already have
the backups.

Thank you again,
Alex.



--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/

Re: Data is lost during rebalance

Posted by Вячеслав Коптилин <sl...@gmail.com>.
Hello Alex,

You need to specify the number of backups for your cache. For instance,

    <property name="cacheConfiguration">
        <bean class="org.apache.ignite.configuration.CacheConfiguration">

            ...            <!-- Number of backup nodes. -->
            <property name="backups" value="1"/>
            ...
        </bean>
    </property>


Please take a look at the page for details:
https://apacheignite.readme.io/docs/primary-and-backup-copies

Thanks,
S.

ср, 6 нояб. 2019 г. в 12:20, novacean.alex <no...@gmail.com>:

> Hello,
>
> I am a new user of Ignite and I'm trying to get a cluster with 3 server
> nodes up and running in Kubernetes. Everything works perfectly until one
> Ignite node gets restarted. During the rebalance process I noticed that
> ~20,000 entries are lost. This happens with each restart. If two Ignite
> nodes are restarted at the same time, ~40,000 entries are lost by the end
> of the rebalance process.
>
> Before the restart:
> <
> http://apache-ignite-users.70518.x6.nabble.com/file/t2660/misskeystest.png>
>
>
>
> After restart and rebalance:
> <
> http://apache-ignite-users.70518.x6.nabble.com/file/t2660/misskeuresults.png>
>
>
>
> After the rebalance process is done @n2 is missing 18705 entries.
>
> This is the config file i am using:
>
> <?xml version="1.0" encoding="UTF-8"?>
>     <beans xmlns="http://www.springframework.org/schema/beans"
> xmlns:context="http://www.springframework.org/schema/context"
> xmlns:util="http://www.springframework.org/schema/util"
> xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
> xsi:schemaLocation="http://www.springframework.org/schema/beans
> http://www.springframework.org/schema/beans/spring-beans.xsd
> http://www.springframework.org/schema/util
> http://www.springframework.org/schema/util/spring-util.xsd
> http://www.springframework.org/schema/context
> https://www.springframework.org/schema/context/spring-context.xsd">
>         <context:property-placeholder />
>         <bean class="org.apache.ignite.configuration.IgniteConfiguration">
>             <property name="consistentId"
> value="${IGNITE_OVERRIDE_CONSISTENT_ID}"/>
>             <property name="systemThreadPoolSize" value="2000"/>
>             <property name="publicThreadPoolSize" value="500"/>
>             <property name="stripedPoolSize" value="800"/>
>             <property name="rebalanceThreadPoolSize" value="16"/>
>             <property name="dataStorageConfiguration">
>                 <bean
> class="org.apache.ignite.configuration.DataStorageConfiguration">
>                     <property name="pageSize" value="16384"/>
>                     <property name="defaultDataRegionConfiguration">
>                         <bean
> class="org.apache.ignite.configuration.DataRegionConfiguration">
>                             <property name="persistenceEnabled"
> value="false"/>
>                             <property name="maxSize" value="#{12L * 1024L *
> 1024L * 1024L}"/>
>                             <property name="initialSize" value="#{10L *
> 1024L * 1024L}"/>
>                             <property name="pageEvictionMode"
> value="RANDOM_2_LRU"/>
>                             <property name="metricsEnabled" value="true"/>
>                         </bean>
>                     </property>
>                 </bean>
>             </property>
>             <property name="cacheConfiguration">
>               <bean
> class="org.apache.ignite.configuration.CacheConfiguration">
>                   <property name="copyOnRead" value="false"/>
>                   <property name="statisticsEnabled" value="true"/>
>                   <property name="name" value="session-cache"/>
>                   <property name="onheapCacheEnabled" value="true"/>
>                   <property name="rebalanceMode" value="ASYNC"/>
>                   <property name="cacheMode" value="REPLICATED"/>
>                   <property name="readFromBackup" value="false"/>
>                   <property name="rebalanceBatchSize" value="#{80L * 1024L
> *
> 1024L}"/>
>                   <property name="rebalanceThrottle" value="2500"/>
>                   <property name="writeSynchronizationMode"
> value="PRIMARY_SYNC"/>
>                   <property name="evictionPolicy">
>                       <bean
> class="org.apache.ignite.cache.eviction.lru.LruEvictionPolicy">
>                          <property name="maxSize" value="1000"/>
>                       </bean>
>                   </property>
>               </bean>
>             </property>
>              <property name="communicationSpi">
>                   <bean
> class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi">
>                       <property name="slowClientQueueLimit" value="999"/>
>                       <property name="messageQueueLimit" value="5000"/>
>                       <property name="directBuffer" value="true"/>
>                       <property name="directSendBuffer" value="true"/>
>                         <property name="selectorsCount" value="64"/>
>                       <property name="connectionsPerNode" value="24"/>
>                       <property name="usePairedConnections" value="true"/>
>                       <property name="ackSendThreshold" value="4"/>
>                   </bean>
>              </property>
>             <property name="discoverySpi">
>                 <bean
> class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
>                     <property name="ipFinder">
>                         <bean
>
> class="org.apache.ignite.spi.discovery.tcp.ipfinder.kubernetes.TcpDiscoveryKubernetesIpFinder">
>                             <property name="namespace" value="xxx"/>
>                             <property name="serviceName" value="xxx"/>
>                             <property name="masterUrl" value="xxx"/>
>                         </bean>
>                     </property>
>                 </bean>
>             </property>
>         </bean>
>     </beans>
>
> I haven't found any other issue related to this, so my guess is that it
> must be a configuration problem.
> Any help would be greatly appreciated.
>
> Thank you,
> Alex.
>
>
>
> --
> Sent from: http://apache-ignite-users.70518.x6.nabble.com/
>