You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2016/02/17 02:22:29 UTC
hbase git commit: HBASE-14807 TestWALLockup is flakey Second attempt at stabilizing this test.

Repository: hbase
Updated Branches:
  refs/heads/branch-1.1 a6f53600d -> 0de7e7c7c


HBASE-14807 TestWALLockup is flakey Second attempt at stabilizing this test.

Patch includes loosening of the test so we continue when threads
run in not-expected order. Also includes minor clean ups in
FSHLog -- a formatting change, removal of an unused trace logging,
and a check so we don't create a new exception when not needed --
but it also includes a subtle so we check if we need to get to safe
point EVEN IF an outstanding exception. Previous we could by-pass
the safe point check. This should make us even more robust against
lockup (though this is a change that comes of code reading, not of
any issue seen in test).

Here is some detail on how I loosened the test:

The test can run in an unexpected order. Attempts at dictating the
order in which threads fire only had me deadlocking one latch
against another (the test latch vs the WAL zigzag latch) so I
gave up trying and instead, if we happen to go the unusual route of
rolling WALs and failing flush before the scheduled log roll
latch goes into place, just time out the run after a few seconds
and exit the test (but do not fail it); just log a WARN.

This is less than ideal but allows us keep some coverage of the
tricky scenario that was bringing on deadlock (a broken WAL that
is throwing exceptions getting stuck waiting on a sync to clear
out the ring buffer getting overshadowed by a subsequent append
added in by a concurrent flush).

Signed-off-by: stack <st...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/0de7e7c7
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/0de7e7c7
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/0de7e7c7

Branch: refs/heads/branch-1.1
Commit: 0de7e7c7c89799dd069ae4cbe806210ab43dec97
Parents: a6f5360
Author: stack <st...@apache.org>
Authored: Thu Nov 19 11:38:55 2015 -0800
Committer: stack <st...@apache.org>
Committed: Tue Feb 16 17:22:19 2016 -0800

----------------------------------------------------------------------
 .../hadoop/hbase/regionserver/wal/FSHLog.java   | 65 +++++++++-----------
 .../hbase/regionserver/TestWALLockup.java       | 11 +++-
 2 files changed, 39 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/0de7e7c7/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
index f31a910..9c14313 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
@@ -116,7 +116,7 @@ import com.lmax.disruptor.dsl.ProducerType;
  *
  * <p>To read an WAL, call {@link WALFactory#createReader(org.apache.hadoop.fs.FileSystem,
  * org.apache.hadoop.fs.Path)}.
- * 
+ *
  * <h2>Failure Semantic</h2>
  * If an exception on append or sync, roll the WAL because the current WAL is now a lame duck;
  * any more appends or syncs will fail also with the same original exception. If we have made
@@ -146,7 +146,7 @@ public class FSHLog implements WAL {
   // Calls to append now also wait until the append has been done on the consumer side of the
   // disruptor.  We used to not wait but it makes the implemenation easier to grok if we have
   // the region edit/sequence id after the append returns.
-  // 
+  //
   // TODO: Handlers need to coordinate appending AND syncing.  Can we have the threads contend
   // once only?  Probably hard given syncs take way longer than an append.
   //
@@ -166,7 +166,7 @@ public class FSHLog implements WAL {
   static final Log LOG = LogFactory.getLog(FSHLog.class);
 
   private static final int DEFAULT_SLOW_SYNC_TIME_MS = 100; // in ms
-  
+
   /**
    * The nexus at which all incoming handlers meet.  Does appends and sync with an ordering.
    * Appends and syncs are each put on the ring which means handlers need to
@@ -236,7 +236,7 @@ public class FSHLog implements WAL {
   private final String logFilePrefix;
 
   /**
-   * Suffix included on generated wal file names 
+   * Suffix included on generated wal file names
    */
   private final String logFileSuffix;
 
@@ -258,7 +258,7 @@ public class FSHLog implements WAL {
   public void registerWALActionsListener(final WALActionsListener listener) {
     this.listeners.add(listener);
   }
-  
+
   @Override
   public boolean unregisterWALActionsListener(final WALActionsListener listener) {
     return this.listeners.remove(listener);
@@ -659,7 +659,7 @@ public class FSHLog implements WAL {
 
   /**
    * Tell listeners about pre log roll.
-   * @throws IOException 
+   * @throws IOException
    */
   private void tellListenersAboutPreLogRoll(final Path oldPath, final Path newPath)
   throws IOException {
@@ -672,7 +672,7 @@ public class FSHLog implements WAL {
 
   /**
    * Tell listeners about post log roll.
-   * @throws IOException 
+   * @throws IOException
    */
   private void tellListenersAboutPostLogRoll(final Path oldPath, final Path newPath)
   throws IOException {
@@ -1212,12 +1212,12 @@ public class FSHLog implements WAL {
     // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
     return new HLogKey(encodedRegionName, tableName, seqnum, now, clusterIds, nonceGroup, nonce);
   }
-  
+
   @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION",
       justification="Will never be null")
   @Override
   public long append(final HTableDescriptor htd, final HRegionInfo hri, final WALKey key,
-      final WALEdit edits, final AtomicLong sequenceId, final boolean inMemstore, 
+      final WALEdit edits, final AtomicLong sequenceId, final boolean inMemstore,
       final List<Cell> memstoreCells) throws IOException {
     if (this.closed) throw new IOException("Cannot append; log is closed");
     // Make a trace scope for the append.  It is closed on other side of the ring buffer by the
@@ -1263,9 +1263,9 @@ public class FSHLog implements WAL {
     private volatile long sequence;
     // Keep around last exception thrown. Clear on successful sync.
     private final BlockingQueue<SyncFuture> syncFutures;
- 
+
     /**
-     * UPDATE! 
+     * UPDATE!
      * @param syncs the batch of calls to sync that arrived as this thread was starting; when done,
      * we will put the result of the actual hdfs sync call as the result.
      * @param sequence The sequence number on the ring buffer when this thread was set running.
@@ -1313,7 +1313,7 @@ public class FSHLog implements WAL {
       // This function releases one sync future only.
       return 1;
     }
- 
+
     /**
      * Release all SyncFutures whose sequence is <= <code>currentSequence</code>.
      * @param currentSequence
@@ -1843,7 +1843,7 @@ public class FSHLog implements WAL {
    * 'safe point' while the orchestrating thread does some work that requires the first thread
    * paused: e.g. holding the WAL writer while its WAL is swapped out from under it by another
    * thread.
-   * 
+   *
    * <p>Thread A signals Thread B to hold when it gets to a 'safe point'.  Thread A wait until
    * Thread B gets there. When the 'safe point' has been attained, Thread B signals Thread A.
    * Thread B then holds at the 'safe point'.  Thread A on notification that Thread B is paused,
@@ -1851,7 +1851,7 @@ public class FSHLog implements WAL {
    * it flags B and then Thread A and Thread B continue along on their merry way.  Pause and
    * signalling 'zigzags' between the two participating threads.  We use two latches -- one the
    * inverse of the other -- pausing and signaling when states are achieved.
-   * 
+   *
    * <p>To start up the drama, Thread A creates an instance of this class each time it would do
    * this zigzag dance and passes it to Thread B (these classes use Latches so it is one shot
    * only). Thread B notices the new instance (via reading a volatile reference or how ever) and it
@@ -1873,7 +1873,7 @@ public class FSHLog implements WAL {
      * Latch to wait on.  Will be released when we can proceed.
      */
     private volatile CountDownLatch safePointReleasedLatch = new CountDownLatch(1);
- 
+
     /**
      * For Thread A to call when it is ready to wait on the 'safe point' to be attained.
      * Thread A will be held in here until Thread B calls {@link #safePointAttained()}
@@ -1882,7 +1882,7 @@ public class FSHLog implements WAL {
      * @param syncFuture We need this as barometer on outstanding syncs.  If it comes home with
      * an exception, then something is up w/ our syncing.
      * @return The passed <code>syncFuture</code>
-     * @throws FailedSyncBeforeLogCloseException 
+     * @throws FailedSyncBeforeLogCloseException
      */
     SyncFuture waitSafePoint(final SyncFuture syncFuture)
     throws InterruptedException, FailedSyncBeforeLogCloseException {
@@ -1894,7 +1894,7 @@ public class FSHLog implements WAL {
       }
       return syncFuture;
     }
- 
+
     /**
      * Called by Thread B when it attains the 'safe point'.  In this method, Thread B signals
      * Thread A it can proceed. Thread B will be held in here until {@link #releaseSafePoint()}
@@ -2043,18 +2043,11 @@ public class FSHLog implements WAL {
         }
 
         // TODO: Check size and if big go ahead and call a sync if we have enough data.
-
-        // If not a batch, return to consume more events from the ring buffer before proceeding;
-        // we want to get up a batch of syncs and appends before we go do a filesystem sync.
-        if (!endOfBatch || this.syncFuturesCount <= 0) return;
-
-        // Now we have a batch.
-
-        if (LOG.isTraceEnabled()) {
-          LOG.trace("Sequence=" + sequence + ", syncCount=" + this.syncFuturesCount);
-        }
-
+        // This is a sync. If existing exception, fall through. Else look to see if batch.
         if (this.exception == null) {
+          // If not a batch, return to consume more events from the ring buffer before proceeding;
+          // we want to get up a batch of syncs and appends before we go do a filesystem sync.
+          if (!endOfBatch || this.syncFuturesCount <= 0) return;
           // Below expects that the offer 'transfers' responsibility for the outstanding syncs to
           // the syncRunner. We should never get an exception in here.
           this.syncRunnerIndex = (this.syncRunnerIndex + 1) % this.syncRunners.length;
@@ -2070,7 +2063,9 @@ public class FSHLog implements WAL {
         // We may have picked up an exception above trying to offer sync
         if (this.exception != null) {
           cleanupOutstandingSyncsOnException(sequence,
-            new DamagedWALException("On sync", this.exception));
+            this.exception instanceof DamagedWALException?
+              this.exception:
+              new DamagedWALException("On sync", this.exception));
         }
         attainSafePoint(sequence);
         this.syncFuturesCount = 0;
@@ -2142,14 +2137,14 @@ public class FSHLog implements WAL {
         // here inside this single appending/writing thread.  Events are ordered on the ringbuffer
         // so region sequenceids will also be in order.
         regionSequenceId = entry.stampRegionSequenceId();
-        
-        // Edits are empty, there is nothing to append.  Maybe empty when we are looking for a 
-        // region sequence id only, a region edit/sequence id that is not associated with an actual 
+
+        // Edits are empty, there is nothing to append.  Maybe empty when we are looking for a
+        // region sequence id only, a region edit/sequence id that is not associated with an actual
         // edit. It has to go through all the rigmarole to be sure we have the right ordering.
         if (entry.getEdit().isEmpty()) {
           return;
         }
-        
+
         // Coprocessor hook.
         if (!coprocessorHost.preWALWrite(entry.getHRegionInfo(), entry.getKey(),
             entry.getEdit())) {
@@ -2180,7 +2175,7 @@ public class FSHLog implements WAL {
         // Update metrics.
         postAppend(entry, EnvironmentEdgeManager.currentTime() - start);
       } catch (Exception e) {
-        String msg = "Failed appending " + regionSequenceId + ", requesting roll of WAL";
+        String msg = "Append sequenceId=" + regionSequenceId + ", requesting roll of WAL";
         LOG.warn(msg, e);
         requestLogRoll();
         throw new DamagedWALException(msg, e);
@@ -2259,7 +2254,7 @@ public class FSHLog implements WAL {
       System.exit(-1);
     }
   }
-  
+
   /**
    * Find the 'getPipeline' on the passed <code>os</code> stream.
    * @return Method or null.

http://git-wip-us.apache.org/repos/asf/hbase/blob/0de7e7c7/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java
index ce70682..41b7ff9 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java
@@ -25,6 +25,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -130,7 +131,13 @@ public class TestWALLockup {
         if (throwException) {
           try {
             LOG.info("LATCHED");
-            this.latch.await();
+            // So, timing can have it that the test can run and the bad flush below happens
+            // before we get here. In this case, we'll be stuck waiting on this latch but there
+            // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
+            // because all WALs have rolled. In this case, just give up on test.
+            if (!this.latch.await(5, TimeUnit.SECONDS)) {
+              LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
+            }
           } catch (InterruptedException e) {
             // TODO Auto-generated catch block
             e.printStackTrace();
@@ -276,4 +283,4 @@ public class TestWALLockup {
       getName(), CONF, false, Durability.SYNC_WAL,
       wal, COLUMN_FAMILY_BYTES);
   }
-}
\ No newline at end of file
+}