You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by mh...@apache.org on 2017/10/17 10:06:00 UTC

asterixdb git commit: [NO ISSUE][CLUS] Prevent Possible Deadlock in GlobalRecoveryManager

Repository: asterixdb
Updated Branches:
  refs/heads/master 31aacc7be -> bc918f7ff


[NO ISSUE][CLUS] Prevent Possible Deadlock in GlobalRecoveryManager

- user model changes: no
- storage format changes: no
- interface changes: no

Details:
- Perform global recovery on a different thread to prevent
  possible deadlocks in ClusterStateManager.
- Shutdown the system in case of global recovery failure
  since the system will be in inconsistent state.

Change-Id: I3b0491a84fb24b428d5ce98f392adafc6dfacff9
Reviewed-on: https://asterix-gerrit.ics.uci.edu/2072
Sonar-Qube: Jenkins <je...@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
Contrib: Jenkins <je...@fulliautomatix.ics.uci.edu>
Reviewed-by: Ian Maxon <im...@apache.org>
Reviewed-by: Michael Blow <mb...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo
Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/bc918f7f
Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/bc918f7f
Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/bc918f7f

Branch: refs/heads/master
Commit: bc918f7ff2ced1af32bf595689406bad7e950721
Parents: 31aacc7
Author: Murtadha Hubail <mh...@apache.org>
Authored: Sat Oct 14 02:45:21 2017 +0300
Committer: Murtadha Hubail <mh...@apache.org>
Committed: Tue Oct 17 03:05:23 2017 -0700

----------------------------------------------------------------------
 .../bootstrap/GlobalRecoveryManager.java        | 52 +++++++++++---------
 .../common/cluster/IGlobalRecoveryManager.java  |  8 +--
 .../hyracks/control/nc/NCShutdownHook.java      |  1 +
 3 files changed, 31 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/asterixdb/blob/bc918f7f/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java b/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java
index 13f3afa..d6854b1 100644
--- a/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java
+++ b/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java
@@ -48,6 +48,8 @@ import org.apache.hyracks.api.client.IHyracksClientConnection;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 import org.apache.hyracks.api.job.JobId;
 import org.apache.hyracks.api.job.JobSpecification;
+import org.apache.hyracks.control.nc.NCShutdownHook;
+import org.apache.hyracks.util.ExitUtil;
 
 public class GlobalRecoveryManager implements IGlobalRecoveryManager {
 
@@ -56,6 +58,7 @@ public class GlobalRecoveryManager implements IGlobalRecoveryManager {
     protected final ICCServiceContext serviceCtx;
     protected IHyracksClientConnection hcc;
     protected volatile boolean recoveryCompleted;
+    protected volatile boolean recovering;
 
     public GlobalRecoveryManager(ICCServiceContext serviceCtx, IHyracksClientConnection hcc,
             IStorageComponentProvider componentProvider) {
@@ -81,41 +84,42 @@ public class GlobalRecoveryManager implements IGlobalRecoveryManager {
     }
 
     @Override
-    public void startGlobalRecovery(ICcApplicationContext appCtx) throws HyracksDataException {
-        if (!recoveryCompleted) {
-            recover(appCtx);
+    public void startGlobalRecovery(ICcApplicationContext appCtx) {
+        if (!recoveryCompleted && !recovering) {
+            synchronized (this) {
+                if (!recovering) {
+                    recovering = true;
+                    /**
+                     * Perform recovery on a different thread to avoid deadlocks in
+                     * {@link org.apache.asterix.common.cluster.IClusterStateManager}
+                     */
+                    serviceCtx.getControllerService().getExecutor().submit(() -> {
+                        try {
+                            recover(appCtx);
+                        } catch (HyracksDataException e) {
+                            LOGGER.log(Level.SEVERE, "Global recovery failed. Shutting down...", e);
+                            ExitUtil.exit(NCShutdownHook.FAILED_TO_RECOVER_EXIT_CODE);
+                        }
+                    });
+                }
+            }
         }
     }
 
     protected void recover(ICcApplicationContext appCtx) throws HyracksDataException {
-        LOGGER.info("Starting Global Recovery");
-        MetadataTransactionContext mdTxnCtx = null;
         try {
+            LOGGER.info("Starting Global Recovery");
             MetadataManager.INSTANCE.init();
-            mdTxnCtx = MetadataManager.INSTANCE.beginTransaction();
+            MetadataTransactionContext mdTxnCtx = MetadataManager.INSTANCE.beginTransaction();
             mdTxnCtx = doRecovery(appCtx, mdTxnCtx);
             MetadataManager.INSTANCE.commitTransaction(mdTxnCtx);
+            recoveryCompleted = true;
+            recovering = false;
+            LOGGER.info("Global Recovery Completed. Refreshing cluster state...");
+            appCtx.getClusterStateManager().refreshState();
         } catch (Exception e) {
-            // This needs to be fixed <-- Needs to shutdown the system -->
-            /*
-             * Note: Throwing this illegal state exception will terminate this thread
-             * and feeds listeners will not be notified.
-             */
-            LOGGER.log(Level.SEVERE, "Global recovery was not completed successfully: ", e);
-            if (mdTxnCtx != null) {
-                try {
-                    MetadataManager.INSTANCE.abortTransaction(mdTxnCtx);
-                } catch (Exception e1) {
-                    LOGGER.log(Level.SEVERE, "Exception aborting metadata transaction", e1);
-                    e.addSuppressed(e1);
-                    throw new IllegalStateException(e);
-                }
-            }
             throw HyracksDataException.create(e);
         }
-        recoveryCompleted = true;
-        LOGGER.info("Global Recovery Completed");
-        appCtx.getClusterStateManager().refreshState();
     }
 
     protected MetadataTransactionContext doRecovery(ICcApplicationContext appCtx, MetadataTransactionContext mdTxnCtx)

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/bc918f7f/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java
index b4559c8..a3add90 100644
--- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java
+++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/cluster/IGlobalRecoveryManager.java
@@ -20,19 +20,15 @@ package org.apache.asterix.common.cluster;
 
 import org.apache.asterix.common.api.IClusterEventsSubscriber;
 import org.apache.asterix.common.dataflow.ICcApplicationContext;
-import org.apache.hyracks.api.exceptions.HyracksDataException;
 
 public interface IGlobalRecoveryManager extends IClusterEventsSubscriber {
 
     /**
      * Starts the global recovery process after the cluster state has changed to ACTIVE.
      *
-     * @param appCtx
-     *            the application context
-     * @throws HyracksDataException
-     *             if the global recovery fails
+     * @param appCtx the application context
      */
-    void startGlobalRecovery(ICcApplicationContext appCtx) throws HyracksDataException;
+    void startGlobalRecovery(ICcApplicationContext appCtx);
 
     /**
      * @return true, if global recovery has been completed successfully

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/bc918f7f/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java
----------------------------------------------------------------------
diff --git a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java
index 0a02635..6308373 100644
--- a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java
+++ b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/NCShutdownHook.java
@@ -31,6 +31,7 @@ import org.apache.hyracks.util.ThreadDumpUtil;
 public class NCShutdownHook extends Thread {
 
     public static final int FAILED_TO_STARTUP_EXIT_CODE = 2;
+    public static final int FAILED_TO_RECOVER_EXIT_CODE = 3;
     private static final Logger LOGGER = Logger.getLogger(NCShutdownHook.class.getName());
     private static final long SHUTDOWN_WAIT_TIME = 10 * 60 * 1000L;
     private final Thread watchDog;