You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@sling.apache.org by ro...@apache.org on 2017/11/07 09:29:28 UTC
[sling-org-apache-sling-discovery-oak] 03/07: SLING-6924 : avoid
harmless error message when new instance joins : a slowly starting new
instance might first be visible via discovery-lite before it sets the
leaderElectionid property - if that's the case there used to be a NPE -
which is harmless but not nice as its an error
This is an automated email from the ASF dual-hosted git repository.
rombert pushed a commit to annotated tag org.apache.sling.discovery.oak-1.2.20
in repository https://gitbox.apache.org/repos/asf/sling-org-apache-sling-discovery-oak.git
commit bf294ac5e21fa979ace5f2dbef2fd78bf0d24b5e
Author: Stefan Egli <st...@apache.org>
AuthorDate: Tue Jul 4 12:09:01 2017 +0000
SLING-6924 : avoid harmless error message when new instance joins : a slowly starting new instance might first be visible via discovery-lite before it sets the leaderElectionid property - if that's the case there used to be a NPE - which is harmless but not nice as its an error
git-svn-id: https://svn.apache.org/repos/asf/sling/trunk/bundles/extensions/discovery/oak@1800761 13f79535-47bb-0310-9956-ffa450edef68
---
pom.xml | 4 +-
.../oak/cluster/OakClusterViewService.java | 29 +++-
.../discovery/oak/OakDiscoveryServiceTest.java | 152 +++++++++++++++++++++
3 files changed, 181 insertions(+), 4 deletions(-)
diff --git a/pom.xml b/pom.xml
index 1fc8bf7..570fa41 100644
--- a/pom.xml
+++ b/pom.xml
@@ -167,7 +167,7 @@
<dependency>
<groupId>org.apache.sling</groupId>
<artifactId>org.apache.sling.discovery.base</artifactId>
- <version>2.0.1-SNAPSHOT</version>
+ <version>2.0.3-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<!-- besides including discovery.base' normal jar above,
@@ -176,7 +176,7 @@
<dependency>
<groupId>org.apache.sling</groupId>
<artifactId>org.apache.sling.discovery.base</artifactId>
- <version>2.0.1-SNAPSHOT</version>
+ <version>2.0.3-SNAPSHOT</version>
<scope>test</scope>
<type>test-jar</type>
</dependency>
diff --git a/src/main/java/org/apache/sling/discovery/oak/cluster/OakClusterViewService.java b/src/main/java/org/apache/sling/discovery/oak/cluster/OakClusterViewService.java
index 48cd976..af30f20 100644
--- a/src/main/java/org/apache/sling/discovery/oak/cluster/OakClusterViewService.java
+++ b/src/main/java/org/apache/sling/discovery/oak/cluster/OakClusterViewService.java
@@ -179,6 +179,20 @@ public class OakClusterViewService implements ClusterViewService {
}
String leaderElectionId = getLeaderElectionId(resourceResolver,
slingId);
+ // SLING-6924 : leaderElectionId can be null here
+ // this means that another instance is just starting up, has already
+ // created its oak lease, thus is already visible from an oak discover-lite
+ // point of view - but upper level code here in discovery.oak has not yet
+ // set the leaderElectionId. This is rare but valid case
+ if (leaderElectionId == null) {
+ // then at this stage the clusterView is not yet established
+ // in a few moments it will but at this point not.
+ // so falling back to treating this as NO_ESTABLISHED_VIEW
+ // and with the heartbeat interval this situation will
+ // resolve itself upon one of the next pings
+ throw new UndefinedClusterViewException(Reason.NO_ESTABLISHED_VIEW,
+ "no leaderElectionId available yet for slingId="+slingId);
+ }
leaderElectionIds.put(id, leaderElectionId);
}
@@ -293,9 +307,20 @@ public class OakClusterViewService implements ClusterViewService {
throw new IllegalStateException("slingId must not be null");
}
final String myClusterNodePath = config.getClusterInstancesPath()+"/"+slingId;
- ValueMap resourceMap = resourceResolver.getResource(myClusterNodePath)
- .adaptTo(ValueMap.class);
+ // SLING-6924 case 1 : /var/discovery/oak/clusterInstances/<slingId> can be non existant == null
+ final Resource myClusterNode = resourceResolver.getResource(myClusterNodePath);
+ if (myClusterNode == null) {
+ // SLING-6924 : return null case 1
+ return null;
+ }
+ ValueMap resourceMap = myClusterNode.adaptTo(ValueMap.class);
+ // SLING-6924 case 2 : /var/discovery/oak/clusterInstances/<slingId> can exist BUT leaderElectionId not yet set
+ // namely the "leaderElectionId" is only written when resetLeaderElectionId() is called - which happens
+ // on OakViewChecker.activate (or when isolated) - and this activate *can* happen after properties
+ // or announcements have been written - those end up below /var/discovery/oak/clusterInstances/<slingId>/
String result = resourceMap.get("leaderElectionId", String.class);
+
+ // SLING-6924 : return null case 2 (if leaderElectionId is indeed null, that is)
return result;
}
diff --git a/src/test/java/org/apache/sling/discovery/oak/OakDiscoveryServiceTest.java b/src/test/java/org/apache/sling/discovery/oak/OakDiscoveryServiceTest.java
index f43831e..4a41e7f 100644
--- a/src/test/java/org/apache/sling/discovery/oak/OakDiscoveryServiceTest.java
+++ b/src/test/java/org/apache/sling/discovery/oak/OakDiscoveryServiceTest.java
@@ -20,20 +20,33 @@ package org.apache.sling.discovery.oak;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
+import java.util.ArrayList;
+import java.util.List;
import java.util.UUID;
+import org.apache.sling.api.resource.LoginException;
+import org.apache.sling.api.resource.ModifiableValueMap;
+import org.apache.sling.api.resource.Resource;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.api.resource.ResourceResolverFactory;
+import org.apache.sling.discovery.TopologyEvent;
+import org.apache.sling.discovery.base.its.setup.OSGiMock;
import org.apache.sling.discovery.base.its.setup.VirtualInstance;
+import org.apache.sling.discovery.base.its.setup.mock.DummyResourceResolverFactory;
+import org.apache.sling.discovery.base.its.setup.mock.MockFactory;
import org.apache.sling.discovery.commons.providers.base.DummyListener;
import org.apache.sling.discovery.commons.providers.spi.base.DescriptorHelper;
import org.apache.sling.discovery.commons.providers.spi.base.DiscoveryLiteConfig;
+import org.apache.sling.discovery.commons.providers.spi.base.DiscoveryLiteDescriptor;
import org.apache.sling.discovery.commons.providers.spi.base.DiscoveryLiteDescriptorBuilder;
import org.apache.sling.discovery.commons.providers.spi.base.DummySlingSettingsService;
import org.apache.sling.discovery.commons.providers.spi.base.IdMapService;
+import org.apache.sling.discovery.oak.its.setup.OakTestConfig;
import org.apache.sling.discovery.oak.its.setup.OakVirtualInstanceBuilder;
+import org.apache.sling.discovery.oak.its.setup.SimulatedLease;
import org.apache.sling.discovery.oak.its.setup.SimulatedLeaseCollection;
import org.junit.Test;
import org.slf4j.Logger;
@@ -111,6 +124,10 @@ public class OakDiscoveryServiceTest {
discoBuilder.setFinal(true);
DescriptorHelper.setDiscoveryLiteDescriptor(builder.getResourceResolverFactory(),
discoBuilder);
+ // SLING-6924 : need to simulate a OakViewChecker.activate to trigger resetLeaderElectionId
+ // otherwise no TOPOLOGY_INIT will be generated as without a leaderElectionId we now
+ // consider a view as NO_ESTABLISHED_VIEW
+ OSGiMock.activate(builder.getViewChecker());
discoveryService.checkForTopologyChange();
assertEquals(0, discoveryService.getViewStateManager().waitForAsyncEvents(2000));
assertEquals(1, listener.countEvents());
@@ -181,4 +198,139 @@ public class OakDiscoveryServiceTest {
assertEquals(3, listener.countEvents());
}
+ @Test
+ public void testNotYetInitializedLeaderElectionid() throws Exception {
+ logger.info("testNotYetInitializedLeaderElectionid: start");
+ OakVirtualInstanceBuilder builder1 =
+ (OakVirtualInstanceBuilder) new OakVirtualInstanceBuilder()
+ .setDebugName("instance")
+ .newRepository("/foo/barrx/foo/", true)
+ .setConnectorPingInterval(999)
+ .setConnectorPingTimeout(999);
+ VirtualInstance instance1 = builder1.build();
+ logger.info("testNotYetInitializedLeaderElectionid: created 1 instance, binding listener...");
+
+ DummyListener listener = new DummyListener();
+ OakDiscoveryService discoveryService = (OakDiscoveryService) instance1.getDiscoveryService();
+ discoveryService.bindTopologyEventListener(listener);
+
+ logger.info("testNotYetInitializedLeaderElectionid: waiting 2sec, listener should not get anything yet");
+ assertEquals(0, discoveryService.getViewStateManager().waitForAsyncEvents(2000));
+ assertEquals(0, listener.countEvents());
+
+ logger.info("testNotYetInitializedLeaderElectionid: issuing 2 heartbeats with each instance should let the topology get established");
+ instance1.heartbeatsAndCheckView();
+ instance1.heartbeatsAndCheckView();
+
+ logger.info("testNotYetInitializedLeaderElectionid: listener should get an event within 2sec from now at latest");
+ assertEquals(0, discoveryService.getViewStateManager().waitForAsyncEvents(2000));
+ assertEquals(1, listener.countEvents());
+
+ SimulatedLeaseCollection c = builder1.getSimulatedLeaseCollection();
+ String secondSlingId = UUID.randomUUID().toString();
+ final SimulatedLease newIncomingInstance = new SimulatedLease(instance1.getResourceResolverFactory(), c, secondSlingId);
+ c.hooked(newIncomingInstance);
+ c.incSeqNum(1);
+ newIncomingInstance.updateLeaseAndDescriptor(new OakTestConfig());
+
+ logger.info("testNotYetInitializedLeaderElectionid: issuing another 2 heartbeats");
+ instance1.heartbeatsAndCheckView();
+ instance1.heartbeatsAndCheckView();
+
+ // there are different properties that an instance must set in the repository such that it finally becomes visible.
+ // these include:
+ // 1) idmap : it must map the oak id to sling id
+ // 2) node named after its own slingId under /var/discovery/oak/clusterInstances/<slingId>
+ // 3) store the leaderElectionId under /var/discovery/oak/clusterInstances/<slingId>
+ // in all 3 cases the code must work fine if that node/property doesn't exist
+ // and that's exactly what we're testing here.
+
+
+ // initially not even the idmap is updated, so we're stuck with TOPOLOGY_CHANGING
+
+ // due to the nature of the syncService/minEventDelay we now explicitly first sleep 2sec before waiting for async events for another 2sec
+ logger.info("testNotYetInitializedLeaderElectionid: sleeping 2sec for topology change to happen");
+ Thread.sleep(2000);
+ logger.info("testNotYetInitializedLeaderElectionid: ensuring no async events are still in the pipe - for another 2sec");
+ assertEquals(0, discoveryService.getViewStateManager().waitForAsyncEvents(2000));
+ logger.info("testNotYetInitializedLeaderElectionid: now listener should have received 2 events, INIT and CHANGING, it got: "+listener.countEvents());
+ assertEquals(2, listener.countEvents());
+ List<TopologyEvent> events = listener.getEvents();
+ assertEquals(TopologyEvent.Type.TOPOLOGY_INIT, events.get(0).getType());
+ assertEquals(TopologyEvent.Type.TOPOLOGY_CHANGING, events.get(1).getType());
+
+ // let's update the idmap first then
+ DummyResourceResolverFactory factory1 = (DummyResourceResolverFactory) instance1.getResourceResolverFactory();
+ ResourceResolverFactory factory2 = MockFactory.mockResourceResolverFactory(factory1.getSlingRepository());
+
+ ResourceResolver resourceResolver = getResourceResolver(instance1.getResourceResolverFactory());
+ DiscoveryLiteDescriptor descriptor =
+ DiscoveryLiteDescriptor.getDescriptorFrom(resourceResolver);
+ resourceResolver.close();
+
+ DiscoveryLiteDescriptorBuilder dlb = prefill(descriptor);
+ dlb.me(2);
+ DescriptorHelper.setDiscoveryLiteDescriptor(factory2, dlb);
+
+ IdMapService secondIdMapService = IdMapService.testConstructor((DiscoveryLiteConfig) builder1.getConnectorConfig(), new DummySlingSettingsService(secondSlingId), factory2);
+
+ instance1.heartbeatsAndCheckView();
+ instance1.heartbeatsAndCheckView();
+ Thread.sleep(2000);
+ assertEquals(2, listener.countEvents());
+
+
+ // now let's add the /var/discovery/oak/clusterInstances/<slingId> node
+ resourceResolver = getResourceResolver(factory2);
+ Resource clusterInstancesRes = resourceResolver.getResource(builder1.getConnectorConfig().getClusterInstancesPath());
+ assertNull(clusterInstancesRes.getChild(secondSlingId));
+ resourceResolver.create(clusterInstancesRes, secondSlingId, null);
+ resourceResolver.commit();
+ assertNotNull(clusterInstancesRes.getChild(secondSlingId));
+ resourceResolver.close();
+
+ instance1.heartbeatsAndCheckView();
+ instance1.heartbeatsAndCheckView();
+ Thread.sleep(2000);
+ assertEquals(2, listener.countEvents());
+
+ // now let's add the leaderElectionId
+ resourceResolver = getResourceResolver(factory2);
+ Resource instanceResource = resourceResolver.getResource(builder1.getConnectorConfig().getClusterInstancesPath() + "/" + secondSlingId);
+ assertNotNull(instanceResource);
+ instanceResource.adaptTo(ModifiableValueMap.class).put("leaderElectionId", "0");
+ resourceResolver.commit();
+ resourceResolver.close();
+
+ instance1.heartbeatsAndCheckView();
+ instance1.heartbeatsAndCheckView();
+ Thread.sleep(2000);
+ assertEquals(3, listener.countEvents());
+ assertEquals(TopologyEvent.Type.TOPOLOGY_CHANGED, events.get(2).getType());
+ }
+
+ private DiscoveryLiteDescriptorBuilder prefill(DiscoveryLiteDescriptor d) throws Exception {
+ DiscoveryLiteDescriptorBuilder b = new DiscoveryLiteDescriptorBuilder();
+ b.setFinal(true);
+ long seqnum = d.getSeqNum();
+ b.seq((int) seqnum);
+ b.activeIds(box(d.getActiveIds()));
+ b.deactivatingIds(box(d.getDeactivatingIds()));
+ b.me(d.getMyId());
+ b.id(d.getViewId());
+ return b;
+ }
+
+ private Integer[] box(final int[] ids) {
+ //TODO: use Guava
+ List<Integer> list = new ArrayList<Integer>(ids.length);
+ for (Integer i : ids) {
+ list.add(i);
+ }
+ return list.toArray(new Integer[list.size()]);
+ }
+
+ private ResourceResolver getResourceResolver(ResourceResolverFactory resourceResolverFactory) throws LoginException {
+ return resourceResolverFactory.getServiceResourceResolver(null);
+ }
}
--
To stop receiving notification emails like this one, please contact
"commits@sling.apache.org" <co...@sling.apache.org>.