You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2019/03/04 16:59:00 UTC
[lucene-solr] 02/02: SOLR-12923: harden testEventQueue by replacing
the arbitrary sleep call with a countdown latch
This is an automated email from the ASF dual-hosted git repository.
hossman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
commit 7f7357696f9efe63147bacc3e1ed3d800d389d28
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Fri Mar 1 20:20:22 2019 -0700
SOLR-12923: harden testEventQueue by replacing the arbitrary sleep call with a countdown latch
---
.../cloud/autoscaling/TriggerIntegrationTest.java | 53 +++++++++++++-----
.../autoscaling/sim/TestSimTriggerIntegration.java | 65 +++++++++++++++-------
2 files changed, 82 insertions(+), 36 deletions(-)
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java
index 23f24c3..756617e 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java
@@ -75,7 +75,6 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
private static volatile CountDownLatch actionCompleted;
private static AtomicBoolean triggerFired;
private static Set<TriggerEvent> events = ConcurrentHashMap.newKeySet();
- public static volatile long eventQueueActionWait = 5000;
private static SolrCloudManager cloudManager;
static final long WAIT_FOR_DELTA_NANOS = TimeUnit.MILLISECONDS.toNanos(5);
@@ -151,7 +150,6 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
events.clear();
listenerEvents.clear();
lastActionExecutedAt.set(0);
- eventQueueActionWait = 5000;
while (cluster.getJettySolrRunners().size() < 2) {
// perhaps a test stopped a node but didn't start it back
// lets start a node
@@ -394,25 +392,29 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
}
public static class TestEventQueueAction extends TriggerActionBase {
-
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+ public static volatile CountDownLatch stall = new CountDownLatch(0);
public TestEventQueueAction() {
log.info("TestEventQueueAction instantiated");
}
@Override
public void process(TriggerEvent event, ActionContext actionContext) {
- log.info("-- event: " + event);
+ // make a local copy of the latch so we're using it consistently even as test thread changes tings
+ final CountDownLatch stallLatch = stall;
+ log.info("processing: stall={} event={} ", stallLatch, event);
events.add(event);
- long eventQueueActionWaitCopy = eventQueueActionWait;
getActionStarted().countDown();
try {
- log.info("-- Going to sleep for {} ms", eventQueueActionWaitCopy);
- Thread.sleep(eventQueueActionWaitCopy);
- log.info("-- Woke up after sleeping for {} ms", eventQueueActionWaitCopy);
- triggerFired.compareAndSet(false, true);
+ if (stallLatch.await(60, TimeUnit.SECONDS)) {
+ log.info("Firing trigger event after await()ing 'stall' countdown");
+ triggerFired.set(true);
+ } else {
+ log.error("Timed out await()ing 'stall' countdown");
+ }
getActionCompleted().countDown();
} catch (InterruptedException e) {
- log.info("-- Interrupted");
+ log.info("Interrupted");
getActionInterrupted().countDown();
}
}
@@ -454,19 +456,32 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
actionInitCalled.await(60, TimeUnit.SECONDS));
+ // setup the trigger action to stall so we can test interupting it w/overseer change
+ // NOTE: we will never release this latch, instead we expect the interupt on overseer shutdown
+ TestEventQueueAction.stall = new CountDownLatch(1);
+
// add node to generate the event
JettySolrRunner newNode = cluster.startJettySolrRunner();
cluster.waitForAllNodes(30);
assertTrue("Action did not start even after await()ing an excessive amount of time",
actionStarted.await(60, TimeUnit.SECONDS));
- eventQueueActionWait = 1;
+
// event should be there
- NodeAddedTrigger.NodeAddedEvent nodeAddedEvent = (NodeAddedTrigger.NodeAddedEvent) events.iterator().next();
+ final TriggerEvent nodeAddedEvent = events.iterator().next();
assertNotNull(nodeAddedEvent);
+ assertNotNull(nodeAddedEvent.getId());
+ assertNotNull(nodeAddedEvent.getEventType());
+ assertNotNull(nodeAddedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME));
+
// but action did not complete yet so the event is still enqueued
assertFalse(triggerFired.get());
+
+ // we know the event action has started, so we can re-set state for the next instance
+ // that will run after the overseer change
events.clear();
actionStarted = new CountDownLatch(1);
+ TestEventQueueAction.stall = new CountDownLatch(0); // so replay won't wait
+
// kill overseer leader
JettySolrRunner j = cluster.stopJettySolrRunner(overseerLeaderIndex);
cluster.waitForJettyToStop(j);
@@ -477,12 +492,20 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
// it should fire again from enqueued event
assertTrue("Action did not (re-)start even after await()ing an excessive amount of time",
actionStarted.await(60, TimeUnit.SECONDS));
- TriggerEvent replayedEvent = events.iterator().next();
- assertTrue(replayedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME) != null);
- assertTrue(events + "\n" + replayedEvent.toString(), replayedEvent.getProperty(TriggerEventQueue.DEQUEUE_TIME) != null);
+
+ final TriggerEvent replayedEvent = events.iterator().next();
+ assertNotNull(replayedEvent);
+
assertTrue("Action did not complete even after await()ing an excessive amount of time",
actionCompleted.await(60, TimeUnit.SECONDS));
assertTrue(triggerFired.get());
+
+ assertEquals(nodeAddedEvent.getId(), replayedEvent.getId());
+ assertEquals(nodeAddedEvent.getEventTime(), replayedEvent.getEventTime());
+ assertEquals(nodeAddedEvent.getEventType(), replayedEvent.getEventType());
+ assertEquals(nodeAddedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME),
+ replayedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME));
+ assertEquals(Boolean.TRUE, replayedEvent.getProperty(TriggerEvent.REPLAYING));
}
static Map<String, List<CapturedEvent>> listenerEvents = new HashMap<>();
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimTriggerIntegration.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimTriggerIntegration.java
index 871e083..f680781 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimTriggerIntegration.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimTriggerIntegration.java
@@ -677,38 +677,42 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
}
public static class TestEventQueueAction extends TriggerActionBase {
-
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+ public static volatile CountDownLatch stall = new CountDownLatch(0);
public TestEventQueueAction() {
log.info("TestEventQueueAction instantiated");
}
@Override
public void process(TriggerEvent event, ActionContext actionContext) {
- log.info("-- event: " + event);
+ // make a local copy of the latch so we're using it consistently even as test thread changes tings
+ final CountDownLatch stallLatch = stall;
+ log.info("processing: stall={} event={} ", stallLatch, event);
events.add(event);
getActionStarted().countDown();
try {
- Thread.sleep(eventQueueActionWait);
- triggerFired.compareAndSet(false, true);
+ if (stallLatch.await(60, TimeUnit.SECONDS)) {
+ log.info("Firing trigger event after await()ing 'stall' countdown");
+ triggerFired.set(true);
+ } else {
+ log.error("Timed out await()ing 'stall' countdown");
+ }
getActionCompleted().countDown();
} catch (InterruptedException e) {
+ log.info("Interrupted");
getActionInterrupted().countDown();
- return;
}
}
@Override
- public void configure(SolrResourceLoader loader, SolrCloudManager cloudManager, Map<String, Object> args) throws TriggerValidationException {
- log.debug("TestTriggerAction init");
+ public void init() throws Exception {
+ log.info("TestEventQueueAction init");
actionInitCalled.countDown();
- super.configure(loader, cloudManager, args);
+ super.init();
}
}
- public static long eventQueueActionWait = 5000;
-
@Test
- //@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072") // this test fails easily
public void testEventQueue() throws Exception {
waitForSeconds = 1;
SolrClient solrClient = cluster.simGetSolrClient();
@@ -729,21 +733,31 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
actionInitCalled.await(60, TimeUnit.SECONDS));
- // wait for the trigger to run at least once
- cluster.getTimeSource().sleep(2 * waitForSeconds * 1000);
-
+ // setup the trigger action to stall so we can test interupting it w/overseer change
+ // NOTE: we will never release this latch, instead we expect the interupt on overseer shutdown
+ TestEventQueueAction.stall = new CountDownLatch(1);
+
// add node to generate the event
- String newNode = cluster.simAddNode();
+ final String newNode = cluster.simAddNode();
assertTrue("Action did not start even after await()ing an excessive amount of time",
actionStarted.await(60, TimeUnit.SECONDS));
+
// event should be there
- TriggerEvent nodeAddedEvent = events.iterator().next();
+ final TriggerEvent nodeAddedEvent = events.iterator().next();
assertNotNull(nodeAddedEvent);
- // but action did not complete yet so the event is still enqueued
+ assertNotNull(nodeAddedEvent.getId());
+ assertNotNull(nodeAddedEvent.getEventType());
+ assertNotNull(nodeAddedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME));
+
+ // but action did not complete yet (due to stall) so the event is still enqueued
assertFalse(triggerFired.get());
+
+ // we know the event action has started, so we can re-set state for the next instance
+ // that will run after the overseer change
events.clear();
actionStarted = new CountDownLatch(1);
- eventQueueActionWait = 1;
+ TestEventQueueAction.stall = new CountDownLatch(0); // so replay won't wait
+
// kill overseer
cluster.simRestartOverseer(overseerLeader);
cluster.getTimeSource().sleep(5000);
@@ -756,12 +770,21 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
// it should fire again from enqueued event
assertTrue("Action did not (re-)start even after await()ing an excessive amount of time",
actionStarted.await(60, TimeUnit.SECONDS));
- TriggerEvent replayedEvent = events.iterator().next();
- assertTrue(replayedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME) != null);
- assertTrue(events + "\n" + replayedEvent.toString(), replayedEvent.getProperty(TriggerEventQueue.DEQUEUE_TIME) != null);
+
+ final TriggerEvent replayedEvent = events.iterator().next();
+ assertNotNull(replayedEvent);
+
assertTrue("Action did not complete even after await()ing an excessive amount of time",
actionCompleted.await(60, TimeUnit.SECONDS));
assertTrue(triggerFired.get());
+
+ assertEquals(nodeAddedEvent.getId(), replayedEvent.getId());
+ assertEquals(nodeAddedEvent.getEventTime(), replayedEvent.getEventTime());
+ assertEquals(nodeAddedEvent.getEventType(), replayedEvent.getEventType());
+ assertEquals(nodeAddedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME),
+ replayedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME));
+ assertEquals(Boolean.TRUE, replayedEvent.getProperty(TriggerEvent.REPLAYING));
+
}
@Test