You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@ozone.apache.org by GitBox <gi...@apache.org> on 2022/05/04 02:32:02 UTC

[GitHub] [ozone] JacksonYao287 commented on a diff in pull request #3352: HDDS-6589. Add a new replication manager and change the existing one to legacy

JacksonYao287 commented on code in PR #3352:
URL: https://github.com/apache/ozone/pull/3352#discussion_r864420829


##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java:
##########
@@ -0,0 +1,448 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.container.replication;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.hdds.HddsConfigKeys;
+import org.apache.hadoop.hdds.conf.Config;
+import org.apache.hadoop.hdds.conf.ConfigGroup;
+import org.apache.hadoop.hdds.conf.ConfigType;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.scm.PlacementPolicy;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.container.ContainerReplicaCount;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
+import org.apache.hadoop.hdds.scm.container.common.helpers.MoveDataNodePair;
+import org.apache.hadoop.hdds.scm.ha.SCMContext;
+import org.apache.hadoop.hdds.scm.ha.SCMHAManager;
+import org.apache.hadoop.hdds.scm.ha.SCMService;
+import org.apache.hadoop.hdds.scm.ha.SCMServiceManager;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
+import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
+import org.apache.hadoop.hdds.server.events.EventPublisher;
+import org.apache.hadoop.hdds.utils.db.Table;
+import org.apache.hadoop.util.ExitUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.time.Clock;
+import java.time.Duration;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+import static org.apache.hadoop.hdds.conf.ConfigTag.OZONE;
+import static org.apache.hadoop.hdds.conf.ConfigTag.SCM;
+
+/**
+ * Replication Manager (RM) is the one which is responsible for making sure
+ * that the containers are properly replicated. Replication Manager deals only
+ * with Quasi Closed / Closed container.
+ */
+public class ReplicationManager implements SCMService {
+
+  public static final Logger LOG =
+      LoggerFactory.getLogger(ReplicationManager.class);
+
+  /**
+   * Reference to the ContainerManager.
+   */
+  private final ContainerManager containerManager;
+
+
+  /**
+   * SCMContext from StorageContainerManager.
+   */
+  private final SCMContext scmContext;
+
+
+  /**
+   * ReplicationManager specific configuration.
+   */
+  private final ReplicationManagerConfiguration rmConf;
+
+  /**
+   * ReplicationMonitor thread is the one which wakes up at configured
+   * interval and processes all the containers.
+   */
+  private Thread replicationMonitor;
+
+  /**
+   * Flag used for checking if the ReplicationMonitor thread is running or
+   * not.
+   */
+  private volatile boolean running;
+
+  /**
+   * Report object that is refreshed each time replication Manager runs.
+   */
+  private ReplicationManagerReport containerReport;
+
+  /**
+   * Replication progress related metrics.
+   */
+  private ReplicationManagerMetrics metrics;
+
+
+  /**
+   * Legacy RM will hopefully be removed after completing refactor
+   * for now, it is used to process non-EC container.
+   */
+  private LegacyReplicationManager legacyReplicationManager;
+
+  /**
+   * SCMService related variables.
+   * After leaving safe mode, replicationMonitor needs to wait for a while
+   * before really take effect.
+   */
+  private final Lock serviceLock = new ReentrantLock();
+  private ServiceStatus serviceStatus = ServiceStatus.PAUSING;
+  private final long waitTimeInMillis;
+  private long lastTimeToBeReadyInMillis = 0;
+  private final Clock clock;
+
+  /**
+   * Constructs ReplicationManager instance with the given configuration.
+   *
+   * @param conf OzoneConfiguration
+   * @param containerManager ContainerManager
+   * @param containerPlacement PlacementPolicy
+   * @param eventPublisher EventPublisher
+   */
+  @SuppressWarnings("parameternumber")
+  public ReplicationManager(final ConfigurationSource conf,
+             final ContainerManager containerManager,
+             final PlacementPolicy containerPlacement,
+             final EventPublisher eventPublisher,
+             final SCMContext scmContext,
+             final SCMServiceManager serviceManager,
+             final NodeManager nodeManager,
+             final Clock clock,
+             final SCMHAManager scmhaManager,
+             final Table<ContainerID, MoveDataNodePair> moveTable)
+             throws IOException {
+    this.containerManager = containerManager;
+    this.scmContext = scmContext;
+    this.rmConf = conf.getObject(ReplicationManagerConfiguration.class);
+    this.running = false;
+    this.clock = clock;
+    this.containerReport = new ReplicationManagerReport();
+    this.metrics = null;
+    this.waitTimeInMillis = conf.getTimeDuration(
+        HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT,
+        HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT_DEFAULT,
+        TimeUnit.MILLISECONDS);
+    this.legacyReplicationManager = new LegacyReplicationManager(
+        conf, containerManager, containerPlacement, eventPublisher,
+        scmContext, nodeManager, scmhaManager, clock, moveTable);
+
+    // register ReplicationManager to SCMServiceManager.
+    serviceManager.register(this);
+
+    // start ReplicationManager.
+    start();
+  }
+
+  /**
+   * Starts Replication Monitor thread.
+   */
+  @Override
+  public synchronized void start() {
+    if (!isRunning()) {
+      LOG.info("Starting Replication Monitor Thread.");
+      running = true;
+      metrics = ReplicationManagerMetrics.create(this);
+      legacyReplicationManager.setMetrics(metrics);
+      replicationMonitor = new Thread(this::run);
+      replicationMonitor.setName("ReplicationMonitor");
+      replicationMonitor.setDaemon(true);
+      replicationMonitor.start();
+    } else {
+      LOG.info("Replication Monitor Thread is already running.");
+    }
+  }
+
+  /**
+   * Returns true if the Replication Monitor Thread is running.
+   *
+   * @return true if running, false otherwise
+   */
+  public boolean isRunning() {
+    if (!running) {
+      synchronized (this) {
+        return replicationMonitor != null
+            && replicationMonitor.isAlive();
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Stops Replication Monitor thread.
+   */
+  public synchronized void stop() {
+    if (running) {
+      LOG.info("Stopping Replication Monitor Thread.");
+      running = false;
+      legacyReplicationManager.clearInflightActions();
+      metrics.unRegister();
+      notifyAll();

Review Comment:
   This is a good point. I have updated the patch; please take a look!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@ozone.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@ozone.apache.org
For additional commands, e-mail: issues-help@ozone.apache.org