Posted to commits@ozone.apache.org by so...@apache.org on 2021/06/14 10:29:01 UTC

[ozone] branch master updated: HDDS-5329. Remove lockmanager and synchronize on ContainerInfo in Replication Manager (#2325)

This is an automated email from the ASF dual-hosted git repository.

sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new d45819f  HDDS-5329. Remove lockmanager and synchronize on ContainerInfo in Replication Manager (#2325)
d45819f is described below

commit d45819f7eb79590ddd445cadb87bd21b8eae336c
Author: Jackson Yao <ja...@tencent.com>
AuthorDate: Mon Jun 14 18:28:37 2021 +0800

    HDDS-5329. Remove lockmanager and synchronize on ContainerInfo in Replication Manager (#2325)
---
 .../hdds/scm/container/ReplicationManager.java     | 217 ++++++++++-----------
 .../hdds/scm/server/StorageContainerManager.java   |   2 -
 .../hdds/scm/container/TestReplicationManager.java |   3 -
 3 files changed, 104 insertions(+), 118 deletions(-)
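
The gist of the change, reduced to a sketch: instead of guarding each
container with an external LockManager keyed by ContainerID, callers now
synchronize on the ContainerInfo instance itself, so the replication
monitor and the incremental/full container report (ICR/FCR) handlers
serialize on the same monitor. The class below is an illustrative,
self-contained reduction, not code from this commit; the Container type
and the worker roles are assumed for the example.

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;

    public final class MonitorLockSketch {

      /** Stand-in for ContainerInfo: mutable state shared across threads. */
      static final class Container {
        private String state = "CLOSED";
        String getState() { return state; }
        void setState(String state) { this.state = state; }
      }

      public static void main(String[] args) throws InterruptedException {
        final Container container = new Container();
        ExecutorService pool = Executors.newFixedThreadPool(2);

        // Replication-monitor side: check state and act on it atomically.
        pool.submit(() -> {
          synchronized (container) {
            if ("CLOSED".equals(container.getState())) {
              // Safe: the report handler cannot flip the state mid-check.
            }
          }
        });

        // Report-handler side: blocks until the monitor releases the lock.
        pool.submit(() -> {
          synchronized (container) {
            container.setState("DELETING");
          }
        });

        pool.shutdown();
        pool.awaitTermination(5, TimeUnit.SECONDS);
      }
    }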

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
index 8accae6..13f19f1 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
@@ -65,7 +65,6 @@ import org.apache.hadoop.metrics2.MetricsInfo;
 import org.apache.hadoop.metrics2.MetricsSource;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
-import org.apache.hadoop.ozone.lock.LockManager;
 import org.apache.hadoop.ozone.protocol.commands.CloseContainerCommand;
 import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode;
 import org.apache.hadoop.ozone.protocol.commands.DeleteContainerCommand;
@@ -119,11 +118,6 @@ public class ReplicationManager implements MetricsSource, SCMService {
   private final SCMContext scmContext;
 
   /**
-   * Used for locking a container using its ID while processing it.
-   */
-  private final LockManager<ContainerID> lockManager;
-
-  /**
   * Used to look up the health of a node or the node's operational state.
    */
   private final NodeManager nodeManager;
@@ -187,13 +181,11 @@ public class ReplicationManager implements MetricsSource, SCMService {
                             final EventPublisher eventPublisher,
                             final SCMContext scmContext,
                             final SCMServiceManager serviceManager,
-                            final LockManager<ContainerID> lockManager,
                             final NodeManager nodeManager) {
     this.containerManager = containerManager;
     this.containerPlacement = containerPlacement;
     this.eventPublisher = eventPublisher;
     this.scmContext = scmContext;
-    this.lockManager = lockManager;
     this.nodeManager = nodeManager;
     this.rmConf = conf.getObject(ReplicationManagerConfiguration.class);
     this.running = false;
@@ -310,134 +302,135 @@ public class ReplicationManager implements MetricsSource, SCMService {
     if (!shouldRun()) {
       return;
     }
-
     final ContainerID id = container.containerID();
-    lockManager.lock(id);
     try {
-      final Set<ContainerReplica> replicas = containerManager
-          .getContainerReplicas(container.containerID());
-      final LifeCycleState state = container.getState();
+      // Synchronize on the ContainerInfo object to avoid race conditions
+      // with the ICR/FCR handlers.
+      synchronized (container) {
+        final Set<ContainerReplica> replicas = containerManager
+            .getContainerReplicas(id);
+        final LifeCycleState state = container.getState();
 
-      /*
-       * We don't take any action if the container is in OPEN state and
-       * the container is healthy. If the container is not healthy, i.e.
-       * the replicas are not in OPEN state, send a CLOSE_CONTAINER command.
-       */
-      if (state == LifeCycleState.OPEN) {
-        if (!isOpenContainerHealthy(container, replicas)) {
-          eventPublisher.fireEvent(SCMEvents.CLOSE_CONTAINER, id);
+        /*
+         * We don't take any action if the container is in OPEN state and
+         * the container is healthy. If the container is not healthy, i.e.
+         * the replicas are not in OPEN state, send a CLOSE_CONTAINER command.
+         */
+        if (state == LifeCycleState.OPEN) {
+          if (!isOpenContainerHealthy(container, replicas)) {
+            eventPublisher.fireEvent(SCMEvents.CLOSE_CONTAINER, id);
+          }
+          return;
         }
-        return;
-      }
 
-      /*
-       * If the container is in CLOSING state, the replicas can either
-       * be in OPEN or in CLOSING state. In both of these cases
-       * we have to resend the close container command to the datanodes.
-       */
-      if (state == LifeCycleState.CLOSING) {
-        replicas.forEach(replica -> sendCloseCommand(
-            container, replica.getDatanodeDetails(), false));
-        return;
-      }
+        /*
+         * If the container is in CLOSING state, the replicas can either
+         * be in OPEN or in CLOSING state. In both of these cases
+         * we have to resend the close container command to the datanodes.
+         */
+        if (state == LifeCycleState.CLOSING) {
+          replicas.forEach(replica -> sendCloseCommand(
+              container, replica.getDatanodeDetails(), false));
+          return;
+        }
 
-      /*
-       * If the container is in QUASI_CLOSED state, check and close the
-       * container if possible.
-       */
-      if (state == LifeCycleState.QUASI_CLOSED &&
-          canForceCloseContainer(container, replicas)) {
-        forceCloseContainer(container, replicas);
-        return;
-      }
+        /*
+         * If the container is in QUASI_CLOSED state, check and close the
+         * container if possible.
+         */
+        if (state == LifeCycleState.QUASI_CLOSED &&
+            canForceCloseContainer(container, replicas)) {
+          forceCloseContainer(container, replicas);
+          return;
+        }
 
-      /*
-       * Before processing the container we have to reconcile the
-       * inflightReplication and inflightDeletion actions.
-       *
-       * We remove the entry from inflightReplication and inflightDeletion
-       * list, if the operation is completed or if it has timed out.
-       */
-      updateInflightAction(container, inflightReplication,
-          action -> replicas.stream()
-              .anyMatch(r -> r.getDatanodeDetails().equals(action.datanode)));
+        /*
+         * Before processing the container we have to reconcile the
+         * inflightReplication and inflightDeletion actions.
+         *
+         * We remove the entry from inflightReplication and inflightDeletion
+         * list, if the operation is completed or if it has timed out.
+         */
+        updateInflightAction(container, inflightReplication,
+            action -> replicas.stream()
+                .anyMatch(r -> r.getDatanodeDetails().equals(action.datanode)));
 
-      updateInflightAction(container, inflightDeletion,
-          action -> replicas.stream()
-              .noneMatch(r -> r.getDatanodeDetails().equals(action.datanode)));
+        updateInflightAction(container, inflightDeletion,
+            action -> replicas.stream()
+                .noneMatch(r ->
+                    r.getDatanodeDetails().equals(action.datanode)));
 
-      /*
-       * If the container is in DELETING state and all of its replicas have
-       * been deleted, mark the container as CLEANED, or resend the delete
-       * replica command if needed.
-       */
-      if (state == LifeCycleState.DELETING) {
-        handleContainerUnderDelete(container, replicas);
-        return;
-      }
+        /*
+         * If the container is in DELETING state and all of its replicas have
+         * been deleted, mark the container as CLEANED, or resend the delete
+         * replica command if needed.
+         */
+        if (state == LifeCycleState.DELETING) {
+          handleContainerUnderDelete(container, replicas);
+          return;
+        }
 
-      /*
-       * We don't need to take any action for a DELETED container - eventually
-       * it will be removed from SCM.
-       */
-      if (state == LifeCycleState.DELETED) {
-        return;
-      }
+        /*
+         * We don't need to take any action for a DELETED container - eventually
+         * it will be removed from SCM.
+         */
+        if (state == LifeCycleState.DELETED) {
+          return;
+        }
 
-      ContainerReplicaCount replicaSet =
-          getContainerReplicaCount(container, replicas);
-      ContainerPlacementStatus placementStatus = getPlacementStatus(
-          replicas, container.getReplicationConfig().getRequiredNodes());
+        ContainerReplicaCount replicaSet =
+            getContainerReplicaCount(container, replicas);
+        ContainerPlacementStatus placementStatus = getPlacementStatus(
+            replicas, container.getReplicationConfig().getRequiredNodes());
 
-      /*
-       * We don't have to take any action if the container is healthy.
-       *
-       * According to the ReplicationMonitor, a container is considered
-       * healthy if it is either in QUASI_CLOSED or CLOSED state and has
-       * the exact number of replicas in the same state.
-       */
-      if (isContainerEmpty(container, replicas)) {
         /*
-         *  If the container is empty, schedule a task to delete the container.
+         * We don't have to take any action if the container is healthy.
+         *
+         * According to the ReplicationMonitor, a container is considered
+         * healthy if it is either in QUASI_CLOSED or CLOSED state and has
+         * the exact number of replicas in the same state.
          */
-        deleteContainerReplicas(container, replicas);
-        return;
-      }
+        if (isContainerEmpty(container, replicas)) {
+          /*
+           *  If the container is empty, schedule a task to delete the container.
+           */
+          deleteContainerReplicas(container, replicas);
+          return;
+        }
 
-      /*
-       * Check if the container is under replicated and take appropriate
-       * action.
-       */
-      if (!replicaSet.isSufficientlyReplicated()
-          || !placementStatus.isPolicySatisfied()) {
-        handleUnderReplicatedContainer(container, replicaSet, placementStatus);
-        return;
-      }
+        /*
+         * Check if the container is under replicated and take appropriate
+         * action.
+         */
+        if (!replicaSet.isSufficientlyReplicated()
+            || !placementStatus.isPolicySatisfied()) {
+          handleUnderReplicatedContainer(container,
+              replicaSet, placementStatus);
+          return;
+        }
 
-      /*
-       * Check if the container is over replicated and take appropriate
-       * action.
-       */
-      if (replicaSet.isOverReplicated()) {
-        handleOverReplicatedContainer(container, replicaSet);
-        return;
-      }
+        /*
+         * Check if the container is over replicated and take appropriate
+         * action.
+         */
+        if (replicaSet.isOverReplicated()) {
+          handleOverReplicatedContainer(container, replicaSet);
+          return;
+        }
 
       /*
        If we get here, the container is not over replicated or under replicated
       but it may be "unhealthy", which means it has one or more replicas
       which are not in the same state as the container itself.
        */
-      if (!replicaSet.isHealthy()) {
-        handleUnstableContainer(container, replicas);
+        if (!replicaSet.isHealthy()) {
+          handleUnstableContainer(container, replicas);
+        }
       }
-
     } catch (ContainerNotFoundException ex) {
       LOG.warn("Missing container {}.", id);
     } catch (Exception ex) {
       LOG.warn("Process container {} error: ", id, ex);
-    } finally {
-      lockManager.writeUnlock(id);
     }
   }
 
@@ -540,13 +533,11 @@ public class ReplicationManager implements MetricsSource, SCMService {
    */
   public ContainerReplicaCount getContainerReplicaCount(ContainerInfo container)
       throws ContainerNotFoundException {
-    lockManager.readLock(container.containerID());
-    try {
+    // TODO: consider a read/write lock here, since this path only reads.
+    synchronized (container) {
       final Set<ContainerReplica> replica = containerManager
           .getContainerReplicas(container.containerID());
       return getContainerReplicaCount(container, replica);
-    } finally {
-      lockManager.readUnlock(container.containerID());
     }
   }
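
The TODO above flags that this path only reads, yet the intrinsic monitor
it now takes is always exclusive; the removed LockManager distinguished
read locks from write locks. A minimal sketch of the alternative the TODO
hints at, using the JDK's ReentrantReadWriteLock (class and field names
are illustrative, not from this commit):

    import java.util.concurrent.locks.ReadWriteLock;
    import java.util.concurrent.locks.ReentrantReadWriteLock;

    public final class ReplicaCountSketch {
      private final ReadWriteLock lock = new ReentrantReadWriteLock();
      private int replicaCount;

      /** Readers may hold the read lock concurrently. */
      public int getReplicaCount() {
        lock.readLock().lock();
        try {
          return replicaCount;
        } finally {
          lock.readLock().unlock();
        }
      }

      /** Writers take the exclusive lock, blocking readers and writers. */
      public void setReplicaCount(int count) {
        lock.writeLock().lock();
        try {
          replicaCount = count;
        } finally {
          lock.writeLock().unlock();
        }
      }
    }

The synchronized variant keeps the code simpler at the cost of needless
contention between concurrent readers, which is presumably why the TODO
was left in.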
 
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
index 097de16..f0d388b 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
@@ -122,7 +122,6 @@ import org.apache.hadoop.ozone.OzoneConfigKeys;
 import org.apache.hadoop.ozone.OzoneSecurityUtil;
 import org.apache.hadoop.ozone.common.Storage.StorageState;
 import org.apache.hadoop.ozone.lease.LeaseManager;
-import org.apache.hadoop.ozone.lock.LockManager;
 import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.security.SecurityUtil;
 import org.apache.hadoop.security.UserGroupInformation;
@@ -572,7 +571,6 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl
           eventQueue,
           scmContext,
           serviceManager,
-          new LockManager<>(conf),
           scmNodeManager);
     }
     if(configurator.getScmSafeModeManager() != null) {
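
One caveat behind this change: synchronized (container) only serializes
work if every component sees the same ContainerInfo instance for a given
ID, which holds when the container manager hands out one canonical object
per container. A hedged sketch of that invariant (the map and types are
assumed for illustration, not taken from this commit):

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    public final class CanonicalContainerMap {

      /** Stand-in container record; exactly one instance exists per ID. */
      static final class Container {
        final long id;
        Container(long id) { this.id = id; }
      }

      private final Map<Long, Container> containers = new ConcurrentHashMap<>();

      /**
       * Always returns the same instance for the same ID, so the returned
       * object's monitor is a valid per-container lock.
       */
      public Container get(long id) {
        return containers.computeIfAbsent(id, Container::new);
      }
    }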
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java
index bd47806..7d3d457 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java
@@ -41,7 +41,6 @@ import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
 import org.apache.hadoop.hdds.server.events.EventHandler;
 import org.apache.hadoop.hdds.server.events.EventPublisher;
 import org.apache.hadoop.hdds.server.events.EventQueue;
-import org.apache.hadoop.ozone.lock.LockManager;
 import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode;
 import org.apache.ozone.test.GenericTestUtils;
 import org.junit.After;
@@ -162,7 +161,6 @@ public class TestReplicationManager {
         eventQueue,
         SCMContext.emptyContext(),
         serviceManager,
-        new LockManager<>(conf),
         nodeManager);
 
     serviceManager.notifyStatusChanged();
@@ -186,7 +184,6 @@ public class TestReplicationManager {
         eventQueue,
         SCMContext.emptyContext(),
         serviceManager,
-        new LockManager<ContainerID>(config),
         nodeManager);
 
     serviceManager.notifyStatusChanged();
