You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sling.apache.org by ro...@apache.org on 2017/11/07 09:29:28 UTC

[sling-org-apache-sling-discovery-oak] 03/07: SLING-6924 : avoid harmless error message when new instance joins : a slowly starting new instance might first be visible via discovery-lite before it sets the leaderElectionid property - if that's the case there used to be a NPE - which is harmless but not nice as its an error

This is an automated email from the ASF dual-hosted git repository.

rombert pushed a commit to annotated tag org.apache.sling.discovery.oak-1.2.20
in repository https://gitbox.apache.org/repos/asf/sling-org-apache-sling-discovery-oak.git

commit bf294ac5e21fa979ace5f2dbef2fd78bf0d24b5e
Author: Stefan Egli <st...@apache.org>
AuthorDate: Tue Jul 4 12:09:01 2017 +0000

    SLING-6924 : avoid harmless error message when new instance joins : a slowly starting new instance might first be visible via discovery-lite before it sets the leaderElectionid property - if that's the case there used to be a NPE - which is harmless but not nice as its an error
    
    git-svn-id: https://svn.apache.org/repos/asf/sling/trunk/bundles/extensions/discovery/oak@1800761 13f79535-47bb-0310-9956-ffa450edef68
---
 pom.xml                                            |   4 +-
 .../oak/cluster/OakClusterViewService.java         |  29 +++-
 .../discovery/oak/OakDiscoveryServiceTest.java     | 152 +++++++++++++++++++++
 3 files changed, 181 insertions(+), 4 deletions(-)

diff --git a/pom.xml b/pom.xml
index 1fc8bf7..570fa41 100644
--- a/pom.xml
+++ b/pom.xml
@@ -167,7 +167,7 @@
 		<dependency>
 			<groupId>org.apache.sling</groupId>
 			<artifactId>org.apache.sling.discovery.base</artifactId>
-			<version>2.0.1-SNAPSHOT</version>
+			<version>2.0.3-SNAPSHOT</version>
             <scope>provided</scope>
 		</dependency>
         <!-- besides including discovery.base' normal jar above, 
@@ -176,7 +176,7 @@
         <dependency>
             <groupId>org.apache.sling</groupId>
             <artifactId>org.apache.sling.discovery.base</artifactId>
-            <version>2.0.1-SNAPSHOT</version>
+            <version>2.0.3-SNAPSHOT</version>
             <scope>test</scope>
             <type>test-jar</type>
         </dependency>
diff --git a/src/main/java/org/apache/sling/discovery/oak/cluster/OakClusterViewService.java b/src/main/java/org/apache/sling/discovery/oak/cluster/OakClusterViewService.java
index 48cd976..af30f20 100644
--- a/src/main/java/org/apache/sling/discovery/oak/cluster/OakClusterViewService.java
+++ b/src/main/java/org/apache/sling/discovery/oak/cluster/OakClusterViewService.java
@@ -179,6 +179,20 @@ public class OakClusterViewService implements ClusterViewService {
             }
             String leaderElectionId = getLeaderElectionId(resourceResolver,
                     slingId);
+            // SLING-6924 : leaderElectionId can be null here
+            // this means that another instance is just starting up, has already
+            // created its oak lease, thus is already visible from an oak discover-lite
+            // point of view - but upper level code here in discovery.oak has not yet
+            // set the leaderElectionId. This is rare but valid case
+            if (leaderElectionId == null) {
+                // then at this stage the clusterView is not yet established
+                // in a few moments it will but at this point not.
+                // so falling back to treating this as NO_ESTABLISHED_VIEW
+                // and with the heartbeat interval this situation will
+                // resolve itself upon one of the next pings
+                throw new UndefinedClusterViewException(Reason.NO_ESTABLISHED_VIEW,
+                        "no leaderElectionId available yet for slingId="+slingId);
+            }
             leaderElectionIds.put(id, leaderElectionId);
         }
 
@@ -293,9 +307,20 @@ public class OakClusterViewService implements ClusterViewService {
             throw new IllegalStateException("slingId must not be null");
         }
         final String myClusterNodePath = config.getClusterInstancesPath()+"/"+slingId;
-        ValueMap resourceMap = resourceResolver.getResource(myClusterNodePath)
-                .adaptTo(ValueMap.class);
+        // SLING-6924 case 1 : /var/discovery/oak/clusterInstances/<slingId> can be non existant == null
+        final Resource myClusterNode = resourceResolver.getResource(myClusterNodePath);
+        if (myClusterNode == null) {
+            // SLING-6924 : return null case 1
+            return null;
+        }
+        ValueMap resourceMap = myClusterNode.adaptTo(ValueMap.class);
+        // SLING-6924 case 2 : /var/discovery/oak/clusterInstances/<slingId> can exist BUT leaderElectionId not yet set
+        //    namely the "leaderElectionId" is only written when resetLeaderElectionId() is called - which happens
+        //    on OakViewChecker.activate (or when isolated) - and this activate *can* happen after properties
+        //    or announcements have been written - those end up below /var/discovery/oak/clusterInstances/<slingId>/
         String result = resourceMap.get("leaderElectionId", String.class);
+        
+        // SLING-6924 : return null case 2 (if leaderElectionId is indeed null, that is)
         return result;
     }
 
diff --git a/src/test/java/org/apache/sling/discovery/oak/OakDiscoveryServiceTest.java b/src/test/java/org/apache/sling/discovery/oak/OakDiscoveryServiceTest.java
index f43831e..4a41e7f 100644
--- a/src/test/java/org/apache/sling/discovery/oak/OakDiscoveryServiceTest.java
+++ b/src/test/java/org/apache/sling/discovery/oak/OakDiscoveryServiceTest.java
@@ -20,20 +20,33 @@ package org.apache.sling.discovery.oak;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.UUID;
 
+import org.apache.sling.api.resource.LoginException;
+import org.apache.sling.api.resource.ModifiableValueMap;
+import org.apache.sling.api.resource.Resource;
 import org.apache.sling.api.resource.ResourceResolver;
 import org.apache.sling.api.resource.ResourceResolverFactory;
+import org.apache.sling.discovery.TopologyEvent;
+import org.apache.sling.discovery.base.its.setup.OSGiMock;
 import org.apache.sling.discovery.base.its.setup.VirtualInstance;
+import org.apache.sling.discovery.base.its.setup.mock.DummyResourceResolverFactory;
+import org.apache.sling.discovery.base.its.setup.mock.MockFactory;
 import org.apache.sling.discovery.commons.providers.base.DummyListener;
 import org.apache.sling.discovery.commons.providers.spi.base.DescriptorHelper;
 import org.apache.sling.discovery.commons.providers.spi.base.DiscoveryLiteConfig;
+import org.apache.sling.discovery.commons.providers.spi.base.DiscoveryLiteDescriptor;
 import org.apache.sling.discovery.commons.providers.spi.base.DiscoveryLiteDescriptorBuilder;
 import org.apache.sling.discovery.commons.providers.spi.base.DummySlingSettingsService;
 import org.apache.sling.discovery.commons.providers.spi.base.IdMapService;
+import org.apache.sling.discovery.oak.its.setup.OakTestConfig;
 import org.apache.sling.discovery.oak.its.setup.OakVirtualInstanceBuilder;
+import org.apache.sling.discovery.oak.its.setup.SimulatedLease;
 import org.apache.sling.discovery.oak.its.setup.SimulatedLeaseCollection;
 import org.junit.Test;
 import org.slf4j.Logger;
@@ -111,6 +124,10 @@ public class OakDiscoveryServiceTest {
         discoBuilder.setFinal(true);
         DescriptorHelper.setDiscoveryLiteDescriptor(builder.getResourceResolverFactory(),
                 discoBuilder);
+        // SLING-6924 : need to simulate a OakViewChecker.activate to trigger resetLeaderElectionId
+        // otherwise no TOPOLOGY_INIT will be generated as without a leaderElectionId we now
+        // consider a view as NO_ESTABLISHED_VIEW
+        OSGiMock.activate(builder.getViewChecker());
         discoveryService.checkForTopologyChange();
         assertEquals(0, discoveryService.getViewStateManager().waitForAsyncEvents(2000));
         assertEquals(1, listener.countEvents());
@@ -181,4 +198,139 @@ public class OakDiscoveryServiceTest {
         assertEquals(3, listener.countEvents());
     }
 
+    @Test
+    public void testNotYetInitializedLeaderElectionid() throws Exception {
+        logger.info("testNotYetInitializedLeaderElectionid: start");
+        OakVirtualInstanceBuilder builder1 =
+                (OakVirtualInstanceBuilder) new OakVirtualInstanceBuilder()
+                .setDebugName("instance")
+                .newRepository("/foo/barrx/foo/", true)
+                .setConnectorPingInterval(999)
+                .setConnectorPingTimeout(999);
+        VirtualInstance instance1 = builder1.build();
+        logger.info("testNotYetInitializedLeaderElectionid: created 1 instance, binding listener...");
+
+        DummyListener listener = new DummyListener();
+        OakDiscoveryService discoveryService = (OakDiscoveryService) instance1.getDiscoveryService();
+        discoveryService.bindTopologyEventListener(listener);
+
+        logger.info("testNotYetInitializedLeaderElectionid: waiting 2sec, listener should not get anything yet");
+        assertEquals(0, discoveryService.getViewStateManager().waitForAsyncEvents(2000));
+        assertEquals(0, listener.countEvents());
+
+        logger.info("testNotYetInitializedLeaderElectionid: issuing 2 heartbeats with each instance should let the topology get established");
+        instance1.heartbeatsAndCheckView();
+        instance1.heartbeatsAndCheckView();
+
+        logger.info("testNotYetInitializedLeaderElectionid: listener should get an event within 2sec from now at latest");
+        assertEquals(0, discoveryService.getViewStateManager().waitForAsyncEvents(2000));
+        assertEquals(1, listener.countEvents());
+
+        SimulatedLeaseCollection c = builder1.getSimulatedLeaseCollection();
+        String secondSlingId = UUID.randomUUID().toString();
+        final SimulatedLease newIncomingInstance = new SimulatedLease(instance1.getResourceResolverFactory(), c, secondSlingId);
+        c.hooked(newIncomingInstance);
+        c.incSeqNum(1);
+        newIncomingInstance.updateLeaseAndDescriptor(new OakTestConfig());
+        
+        logger.info("testNotYetInitializedLeaderElectionid: issuing another 2 heartbeats");
+        instance1.heartbeatsAndCheckView();
+        instance1.heartbeatsAndCheckView();
+        
+        // there are different properties that an instance must set in the repository such that it finally becomes visible.
+        // these include:
+        // 1) idmap : it must map the oak id to sling id
+        // 2) node named after its own slingId under /var/discovery/oak/clusterInstances/<slingId>
+        // 3) store the leaderElectionId under /var/discovery/oak/clusterInstances/<slingId>
+        // in all 3 cases the code must work fine if that node/property doesn't exist
+        // and that's exactly what we're testing here.
+
+
+        // initially not even the idmap is updated, so we're stuck with TOPOLOGY_CHANGING
+
+        // due to the nature of the syncService/minEventDelay we now explicitly first sleep 2sec before waiting for async events for another 2sec
+        logger.info("testNotYetInitializedLeaderElectionid: sleeping 2sec for topology change to happen");
+        Thread.sleep(2000);
+        logger.info("testNotYetInitializedLeaderElectionid: ensuring no async events are still in the pipe - for another 2sec");
+        assertEquals(0, discoveryService.getViewStateManager().waitForAsyncEvents(2000));
+        logger.info("testNotYetInitializedLeaderElectionid: now listener should have received 2 events, INIT and CHANGING, it got: "+listener.countEvents());
+        assertEquals(2, listener.countEvents());
+        List<TopologyEvent> events = listener.getEvents();
+        assertEquals(TopologyEvent.Type.TOPOLOGY_INIT, events.get(0).getType());
+        assertEquals(TopologyEvent.Type.TOPOLOGY_CHANGING, events.get(1).getType());
+        
+        // let's update the idmap first then
+        DummyResourceResolverFactory factory1 = (DummyResourceResolverFactory) instance1.getResourceResolverFactory();
+        ResourceResolverFactory factory2 = MockFactory.mockResourceResolverFactory(factory1.getSlingRepository());
+
+        ResourceResolver resourceResolver = getResourceResolver(instance1.getResourceResolverFactory());
+        DiscoveryLiteDescriptor descriptor =
+                DiscoveryLiteDescriptor.getDescriptorFrom(resourceResolver);
+        resourceResolver.close();
+        
+        DiscoveryLiteDescriptorBuilder dlb = prefill(descriptor);
+        dlb.me(2);
+        DescriptorHelper.setDiscoveryLiteDescriptor(factory2, dlb);
+
+        IdMapService secondIdMapService = IdMapService.testConstructor((DiscoveryLiteConfig) builder1.getConnectorConfig(), new DummySlingSettingsService(secondSlingId), factory2);
+        
+        instance1.heartbeatsAndCheckView();
+        instance1.heartbeatsAndCheckView();
+        Thread.sleep(2000);
+        assertEquals(2, listener.countEvents());
+        
+        
+        // now let's add the /var/discovery/oak/clusterInstances/<slingId> node
+        resourceResolver = getResourceResolver(factory2);
+        Resource clusterInstancesRes = resourceResolver.getResource(builder1.getConnectorConfig().getClusterInstancesPath());
+        assertNull(clusterInstancesRes.getChild(secondSlingId));
+        resourceResolver.create(clusterInstancesRes, secondSlingId, null);
+        resourceResolver.commit();
+        assertNotNull(clusterInstancesRes.getChild(secondSlingId));
+        resourceResolver.close();
+
+        instance1.heartbeatsAndCheckView();
+        instance1.heartbeatsAndCheckView();
+        Thread.sleep(2000);
+        assertEquals(2, listener.countEvents());
+
+        // now let's add the leaderElectionId
+        resourceResolver = getResourceResolver(factory2);
+        Resource instanceResource = resourceResolver.getResource(builder1.getConnectorConfig().getClusterInstancesPath() + "/" + secondSlingId);
+        assertNotNull(instanceResource);
+        instanceResource.adaptTo(ModifiableValueMap.class).put("leaderElectionId", "0");
+        resourceResolver.commit();
+        resourceResolver.close();
+        
+        instance1.heartbeatsAndCheckView();
+        instance1.heartbeatsAndCheckView();
+        Thread.sleep(2000);
+        assertEquals(3, listener.countEvents());
+        assertEquals(TopologyEvent.Type.TOPOLOGY_CHANGED, events.get(2).getType());
+    }
+
+    private DiscoveryLiteDescriptorBuilder prefill(DiscoveryLiteDescriptor d) throws Exception {
+        DiscoveryLiteDescriptorBuilder b = new DiscoveryLiteDescriptorBuilder();
+        b.setFinal(true);
+        long seqnum = d.getSeqNum();
+        b.seq((int) seqnum);
+        b.activeIds(box(d.getActiveIds()));
+        b.deactivatingIds(box(d.getDeactivatingIds()));
+        b.me(d.getMyId());
+        b.id(d.getViewId());
+        return b;
+    }
+
+    private Integer[] box(final int[] ids) {
+        //TODO: use Guava
+        List<Integer> list = new ArrayList<Integer>(ids.length);
+        for (Integer i : ids) {
+            list.add(i);
+        }
+        return list.toArray(new Integer[list.size()]);
+    }
+    
+    private ResourceResolver getResourceResolver(ResourceResolverFactory resourceResolverFactory) throws LoginException {
+        return resourceResolverFactory.getServiceResourceResolver(null);
+    }
 }

-- 
To stop receiving notification emails like this one, please contact
"commits@sling.apache.org" <co...@sling.apache.org>.