You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2019/03/04 16:58:58 UTC

[lucene-solr] branch master updated (f4f3277 -> 7f73576)

This is an automated email from the ASF dual-hosted git repository.

hossman pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git.


    from f4f3277  SOLR-13276: Move change entry to 8.1
     new 235b15a  SOLR-12923: increase all await() times in TriggerIntegrationTest
     new 7f73576  SOLR-12923: harden testEventQueue by replacing the arbitrary sleep call with a countdown latch

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../cloud/autoscaling/TriggerIntegrationTest.java  | 118 ++++++++++++---------
 .../autoscaling/sim/TestSimTriggerIntegration.java |  65 ++++++++----
 2 files changed, 112 insertions(+), 71 deletions(-)


[lucene-solr] 02/02: SOLR-12923: harden testEventQueue by replacing the arbitrary sleep call with a countdown latch

Posted by ho...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

hossman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 7f7357696f9efe63147bacc3e1ed3d800d389d28
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Fri Mar 1 20:20:22 2019 -0700

    SOLR-12923: harden testEventQueue by replacing the arbitrary sleep call with a countdown latch
---
 .../cloud/autoscaling/TriggerIntegrationTest.java  | 53 +++++++++++++-----
 .../autoscaling/sim/TestSimTriggerIntegration.java | 65 +++++++++++++++-------
 2 files changed, 82 insertions(+), 36 deletions(-)

diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java
index 23f24c3..756617e 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java
@@ -75,7 +75,6 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
   private static volatile CountDownLatch actionCompleted;
   private static AtomicBoolean triggerFired;
   private static Set<TriggerEvent> events = ConcurrentHashMap.newKeySet();
-  public static volatile long eventQueueActionWait = 5000;
   private static SolrCloudManager cloudManager;
 
   static final long WAIT_FOR_DELTA_NANOS = TimeUnit.MILLISECONDS.toNanos(5);
@@ -151,7 +150,6 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     events.clear();
     listenerEvents.clear();
     lastActionExecutedAt.set(0);
-    eventQueueActionWait = 5000;
     while (cluster.getJettySolrRunners().size() < 2) {
       // perhaps a test stopped a node but didn't start it back
       // lets start a node
@@ -394,25 +392,29 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
   }
 
   public static class TestEventQueueAction extends TriggerActionBase {
-
+    private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+    public static volatile CountDownLatch stall = new CountDownLatch(0);
     public TestEventQueueAction() {
       log.info("TestEventQueueAction instantiated");
     }
 
     @Override
     public void process(TriggerEvent event, ActionContext actionContext) {
-      log.info("-- event: " + event);
+      // make a local copy of the latch so we're using it consistently even as test thread changes tings
+      final CountDownLatch stallLatch = stall;
+      log.info("processing: stall={} event={} ", stallLatch, event);
       events.add(event);
-      long eventQueueActionWaitCopy = eventQueueActionWait;
       getActionStarted().countDown();
       try {
-        log.info("-- Going to sleep for {} ms", eventQueueActionWaitCopy);
-        Thread.sleep(eventQueueActionWaitCopy);
-        log.info("-- Woke up after sleeping for {} ms", eventQueueActionWaitCopy);
-        triggerFired.compareAndSet(false, true);
+        if (stallLatch.await(60, TimeUnit.SECONDS)) {
+          log.info("Firing trigger event after await()ing 'stall' countdown");
+          triggerFired.set(true);
+        } else {
+          log.error("Timed out await()ing 'stall' countdown");
+        }
         getActionCompleted().countDown();
       } catch (InterruptedException e) {
-        log.info("-- Interrupted");
+        log.info("Interrupted");
         getActionInterrupted().countDown();
       }
     }
@@ -454,19 +456,32 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
                actionInitCalled.await(60, TimeUnit.SECONDS));
 
+    // setup the trigger action to stall so we can test interupting it w/overseer change
+    // NOTE: we will never release this latch, instead we expect the interupt on overseer shutdown
+    TestEventQueueAction.stall = new CountDownLatch(1);
+    
     // add node to generate the event
     JettySolrRunner newNode = cluster.startJettySolrRunner();
     cluster.waitForAllNodes(30);
     assertTrue("Action did not start even after await()ing an excessive amount of time",
                actionStarted.await(60, TimeUnit.SECONDS));
-    eventQueueActionWait = 1;
+    
     // event should be there
-    NodeAddedTrigger.NodeAddedEvent nodeAddedEvent = (NodeAddedTrigger.NodeAddedEvent) events.iterator().next();
+    final TriggerEvent nodeAddedEvent = events.iterator().next();
     assertNotNull(nodeAddedEvent);
+    assertNotNull(nodeAddedEvent.getId());
+    assertNotNull(nodeAddedEvent.getEventType());
+    assertNotNull(nodeAddedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME));
+
     // but action did not complete yet so the event is still enqueued
     assertFalse(triggerFired.get());
+
+    // we know the event action has started, so we can re-set state for the next instance
+    // that will run after the overseer change
     events.clear();
     actionStarted = new CountDownLatch(1);
+    TestEventQueueAction.stall = new CountDownLatch(0); // so replay won't wait
+    
     // kill overseer leader
     JettySolrRunner j = cluster.stopJettySolrRunner(overseerLeaderIndex);
     cluster.waitForJettyToStop(j);
@@ -477,12 +492,20 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     // it should fire again from enqueued event
     assertTrue("Action did not (re-)start even after await()ing an excessive amount of time",
                actionStarted.await(60, TimeUnit.SECONDS));
-    TriggerEvent replayedEvent = events.iterator().next();
-    assertTrue(replayedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME) != null);
-    assertTrue(events + "\n" + replayedEvent.toString(), replayedEvent.getProperty(TriggerEventQueue.DEQUEUE_TIME) != null);
+    
+    final TriggerEvent replayedEvent = events.iterator().next();
+    assertNotNull(replayedEvent);
+
     assertTrue("Action did not complete even after await()ing an excessive amount of time",
                actionCompleted.await(60, TimeUnit.SECONDS));
     assertTrue(triggerFired.get());
+    
+    assertEquals(nodeAddedEvent.getId(), replayedEvent.getId());
+    assertEquals(nodeAddedEvent.getEventTime(), replayedEvent.getEventTime());
+    assertEquals(nodeAddedEvent.getEventType(), replayedEvent.getEventType());
+    assertEquals(nodeAddedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME),
+                 replayedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME));
+    assertEquals(Boolean.TRUE, replayedEvent.getProperty(TriggerEvent.REPLAYING));
   }
 
   static Map<String, List<CapturedEvent>> listenerEvents = new HashMap<>();
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimTriggerIntegration.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimTriggerIntegration.java
index 871e083..f680781 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimTriggerIntegration.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/TestSimTriggerIntegration.java
@@ -677,38 +677,42 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
   }
 
   public static class TestEventQueueAction extends TriggerActionBase {
-
+    private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+    public static volatile CountDownLatch stall = new CountDownLatch(0);
     public TestEventQueueAction() {
       log.info("TestEventQueueAction instantiated");
     }
 
     @Override
     public void process(TriggerEvent event, ActionContext actionContext) {
-      log.info("-- event: " + event);
+      // make a local copy of the latch so we're using it consistently even as test thread changes tings
+      final CountDownLatch stallLatch = stall;
+      log.info("processing: stall={} event={} ", stallLatch, event);
       events.add(event);
       getActionStarted().countDown();
       try {
-        Thread.sleep(eventQueueActionWait);
-        triggerFired.compareAndSet(false, true);
+        if (stallLatch.await(60, TimeUnit.SECONDS)) {
+          log.info("Firing trigger event after await()ing 'stall' countdown");
+          triggerFired.set(true);
+        } else {
+          log.error("Timed out await()ing 'stall' countdown");
+        }
         getActionCompleted().countDown();
       } catch (InterruptedException e) {
+        log.info("Interrupted");
         getActionInterrupted().countDown();
-        return;
       }
     }
 
     @Override
-    public void configure(SolrResourceLoader loader, SolrCloudManager cloudManager, Map<String, Object> args) throws TriggerValidationException {
-      log.debug("TestTriggerAction init");
+    public void init() throws Exception {
+      log.info("TestEventQueueAction init");
       actionInitCalled.countDown();
-      super.configure(loader, cloudManager, args);
+      super.init();
     }
   }
 
-  public static long eventQueueActionWait = 5000;
-
   @Test
-  //@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-13072") // this test fails easily
   public void testEventQueue() throws Exception {
     waitForSeconds = 1;
     SolrClient solrClient = cluster.simGetSolrClient();
@@ -729,21 +733,31 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
     assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
                actionInitCalled.await(60, TimeUnit.SECONDS));
 
-    // wait for the trigger to run at least once
-    cluster.getTimeSource().sleep(2 * waitForSeconds * 1000);
-
+    // setup the trigger action to stall so we can test interupting it w/overseer change
+    // NOTE: we will never release this latch, instead we expect the interupt on overseer shutdown
+    TestEventQueueAction.stall = new CountDownLatch(1);
+    
     // add node to generate the event
-    String newNode = cluster.simAddNode();
+    final String newNode = cluster.simAddNode();
     assertTrue("Action did not start even after await()ing an excessive amount of time",
                actionStarted.await(60, TimeUnit.SECONDS));
+    
     // event should be there
-    TriggerEvent nodeAddedEvent = events.iterator().next();
+    final TriggerEvent nodeAddedEvent = events.iterator().next();
     assertNotNull(nodeAddedEvent);
-    // but action did not complete yet so the event is still enqueued
+    assertNotNull(nodeAddedEvent.getId());
+    assertNotNull(nodeAddedEvent.getEventType());
+    assertNotNull(nodeAddedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME));
+    
+    // but action did not complete yet (due to stall) so the event is still enqueued
     assertFalse(triggerFired.get());
+
+    // we know the event action has started, so we can re-set state for the next instance
+    // that will run after the overseer change
     events.clear();
     actionStarted = new CountDownLatch(1);
-    eventQueueActionWait = 1;
+    TestEventQueueAction.stall = new CountDownLatch(0); // so replay won't wait
+
     // kill overseer
     cluster.simRestartOverseer(overseerLeader);
     cluster.getTimeSource().sleep(5000);
@@ -756,12 +770,21 @@ public class TestSimTriggerIntegration extends SimSolrCloudTestCase {
     // it should fire again from enqueued event
     assertTrue("Action did not (re-)start even after await()ing an excessive amount of time",
                actionStarted.await(60, TimeUnit.SECONDS));
-    TriggerEvent replayedEvent = events.iterator().next();
-    assertTrue(replayedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME) != null);
-    assertTrue(events + "\n" + replayedEvent.toString(), replayedEvent.getProperty(TriggerEventQueue.DEQUEUE_TIME) != null);
+    
+    final TriggerEvent replayedEvent = events.iterator().next();
+    assertNotNull(replayedEvent);
+
     assertTrue("Action did not complete even after await()ing an excessive amount of time",
                actionCompleted.await(60, TimeUnit.SECONDS));
     assertTrue(triggerFired.get());
+
+    assertEquals(nodeAddedEvent.getId(), replayedEvent.getId());
+    assertEquals(nodeAddedEvent.getEventTime(), replayedEvent.getEventTime());
+    assertEquals(nodeAddedEvent.getEventType(), replayedEvent.getEventType());
+    assertEquals(nodeAddedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME),
+                 replayedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME));
+    assertEquals(Boolean.TRUE, replayedEvent.getProperty(TriggerEvent.REPLAYING));
+      
   }
 
   @Test


[lucene-solr] 01/02: SOLR-12923: increase all await() times in TriggerIntegrationTest

Posted by ho...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

hossman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 235b15acfc97a97cdf03ce73939bc5daf052b6cf
Author: Chris Hostetter <ho...@apache.org>
AuthorDate: Fri Mar 1 15:58:25 2019 -0700

    SOLR-12923: increase all await() times in TriggerIntegrationTest
    
    This means that 'real' failures (which should be rare and hopefully reproducile) will be 'slow', but the trade off will be less hard to reproduce 'false failures' due to thread contention on slow or heavily loaded (ie: jenkins) machines
---
 .../cloud/autoscaling/TriggerIntegrationTest.java  | 65 ++++++++++------------
 1 file changed, 30 insertions(+), 35 deletions(-)

diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java
index 2fb86c7..23f24c3 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/TriggerIntegrationTest.java
@@ -208,15 +208,14 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     assertEquals(response.get("result").toString(), "success");
 
     // wait until the two instances of action are created
-    if (!actionInitCalled.await(3, TimeUnit.SECONDS))  {
-      fail("Two TriggerAction instances should have been created by now");
-    }
+    assertTrue("Two TriggerAction instances were not created "+
+               "even after await()ing an excessive amount of time",
+               actionInitCalled.await(60, TimeUnit.SECONDS));
 
     JettySolrRunner newNode = cluster.startJettySolrRunner();
     cluster.waitForAllNodes(30);
-    if (!triggerFiredLatch.await(30, TimeUnit.SECONDS)) {
-      fail("Both triggers should have fired by now");
-    }
+    assertTrue("Both triggers did not fire event after await()ing an excessive amount of time",
+               triggerFiredLatch.await(60, TimeUnit.SECONDS));
 
     // reset shared state
     lastActionExecutedAt.set(0);
@@ -248,9 +247,9 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     assertEquals(response.get("result").toString(), "success");
 
     // wait until the two instances of action are created
-    if (!actionInitCalled.await(3, TimeUnit.SECONDS))  {
-      fail("Two TriggerAction instances should have been created by now");
-    }
+    assertTrue("Two TriggerAction instances were not created "+
+               "even after await()ing an excessive amount of time",
+               actionInitCalled.await(60, TimeUnit.SECONDS));
 
     // stop the node we had started earlier
     List<JettySolrRunner> jettySolrRunners = cluster.getJettySolrRunners();
@@ -263,9 +262,8 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
       }
     }
 
-    if (!triggerFiredLatch.await(30, TimeUnit.SECONDS)) {
-      fail("Both triggers should have fired by now");
-    }
+    assertTrue("Both triggers did not fire event after await()ing an excessive amount of time",
+               triggerFiredLatch.await(60, TimeUnit.SECONDS));
   }
 
   static AtomicLong lastActionExecutedAt = new AtomicLong(0);
@@ -342,9 +340,8 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
     response = solrClient.request(req);
     assertEquals(response.get("result").toString(), "success");
-    if (!actionInitCalled.await(3, TimeUnit.SECONDS))  {
-      fail("The TriggerAction should have been created by now");
-    }
+    assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
+               actionInitCalled.await(60, TimeUnit.SECONDS));
 
     // stop the overseer, somebody else will take over as the overseer
     JettySolrRunner j = cluster.stopJettySolrRunner(index);
@@ -352,8 +349,8 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     Thread.sleep(10000);
     JettySolrRunner newNode = cluster.startJettySolrRunner();
     cluster.waitForAllNodes(30);
-    boolean await = triggerFiredLatch.await(20, TimeUnit.SECONDS);
-    assertTrue("The trigger did not fire at all", await);
+    assertTrue("trigger did not fire even after await()ing an excessive amount of time",
+               triggerFiredLatch.await(60, TimeUnit.SECONDS));
     assertTrue(triggerFired.get());
     NodeAddedTrigger.NodeAddedEvent nodeAddedEvent = (NodeAddedTrigger.NodeAddedEvent) events.iterator().next();
     assertNotNull(nodeAddedEvent);
@@ -454,15 +451,14 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     NamedList<Object> response = solrClient.request(req);
     assertEquals(response.get("result").toString(), "success");
 
-    if (!actionInitCalled.await(3, TimeUnit.SECONDS))  {
-      fail("The TriggerAction should have been created by now");
-    }
+    assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
+               actionInitCalled.await(60, TimeUnit.SECONDS));
 
     // add node to generate the event
     JettySolrRunner newNode = cluster.startJettySolrRunner();
     cluster.waitForAllNodes(30);
-    boolean await = actionStarted.await(60, TimeUnit.SECONDS);
-    assertTrue("action did not start", await);
+    assertTrue("Action did not start even after await()ing an excessive amount of time",
+               actionStarted.await(60, TimeUnit.SECONDS));
     eventQueueActionWait = 1;
     // event should be there
     NodeAddedTrigger.NodeAddedEvent nodeAddedEvent = (NodeAddedTrigger.NodeAddedEvent) events.iterator().next();
@@ -476,16 +472,16 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     cluster.waitForJettyToStop(j);
     Thread.sleep(5000);
     // new overseer leader should be elected and run triggers
-    await = actionInterrupted.await(3, TimeUnit.SECONDS);
-    assertTrue("action wasn't interrupted", await);
+    assertTrue("Action was not interupted even after await()ing an excessive amount of time",
+               actionInterrupted.await(60, TimeUnit.SECONDS));
     // it should fire again from enqueued event
-    await = actionStarted.await(60, TimeUnit.SECONDS);
-    assertTrue("action wasn't started", await);
+    assertTrue("Action did not (re-)start even after await()ing an excessive amount of time",
+               actionStarted.await(60, TimeUnit.SECONDS));
     TriggerEvent replayedEvent = events.iterator().next();
     assertTrue(replayedEvent.getProperty(TriggerEventQueue.ENQUEUE_TIME) != null);
     assertTrue(events + "\n" + replayedEvent.toString(), replayedEvent.getProperty(TriggerEventQueue.DEQUEUE_TIME) != null);
-    await = actionCompleted.await(10, TimeUnit.SECONDS);
-    assertTrue("action wasn't completed", await);
+    assertTrue("Action did not complete even after await()ing an excessive amount of time",
+               actionCompleted.await(60, TimeUnit.SECONDS));
     assertTrue(triggerFired.get());
   }
 
@@ -543,9 +539,8 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     NamedList<Object> response = solrClient.request(req);
     assertEquals(response.get("result").toString(), "success");
 
-    if (!actionInitCalled.await(3, TimeUnit.SECONDS))  {
-      fail("The TriggerAction should have been created by now");
-    }
+    assertTrue("Trigger was not init()ed even after await()ing an excessive amount of time",
+               actionInitCalled.await(60, TimeUnit.SECONDS));
 
     String setListenerCommand = "{" +
         "'set-listener' : " +
@@ -581,8 +576,8 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     failDummyAction = false;
 
     JettySolrRunner newNode = cluster.startJettySolrRunner();
-    boolean await = triggerFiredLatch.await(20, TimeUnit.SECONDS);
-    assertTrue("The trigger did not fire at all", await);
+    assertTrue("trigger did not fire even after await()ing an excessive amount of time",
+               triggerFiredLatch.await(60, TimeUnit.SECONDS));
     assertTrue(triggerFired.get());
 
     assertEquals("both listeners should have fired", 2, listenerEvents.size());
@@ -648,8 +643,8 @@ public class TriggerIntegrationTest extends SolrCloudTestCase {
     failDummyAction = true;
 
     newNode = cluster.startJettySolrRunner();
-    await = triggerFiredLatch.await(20, TimeUnit.SECONDS);
-    assertTrue("The trigger did not fire at all", await);
+    assertTrue("trigger did not fire event after await()ing an excessive amount of time",
+               triggerFiredLatch.await(60, TimeUnit.SECONDS));
 
     Thread.sleep(2000);