You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by zh...@apache.org on 2018/01/15 10:34:04 UTC
[38/48] hbase git commit: HBASE-19636 All rs should already start
work with the new peer change when replication peer procedure is finished
http://git-wip-us.apache.org/repos/asf/hbase/blob/1f0bfe01/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
index fc978be..e087127 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceManager.java
@@ -1,5 +1,4 @@
-/*
- *
+/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -16,7 +15,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.hadoop.hbase.replication.regionserver;
import java.io.IOException;
@@ -33,7 +31,7 @@ import java.util.SortedSet;
import java.util.TreeSet;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
@@ -70,27 +68,53 @@ import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesti
import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
- * This class is responsible to manage all the replication
- * sources. There are two classes of sources:
+ * This class is responsible to manage all the replication sources. There are two classes of
+ * sources:
* <ul>
- * <li> Normal sources are persistent and one per peer cluster</li>
- * <li> Old sources are recovered from a failed region server and our
- * only goal is to finish replicating the WAL queue it had up in ZK</li>
+ * <li>Normal sources are persistent and one per peer cluster</li>
+ * <li>Old sources are recovered from a failed region server and our only goal is to finish
+ * replicating the WAL queue it had</li>
+ * </ul>
+ * <p>
+ * When a region server dies, this class uses a watcher to get notified and it tries to grab a lock
+ * in order to transfer all the queues in a local old source.
+ * <p>
+ * Synchronization specification:
+ * <ul>
+ * <li>No need synchronized on {@link #sources}. {@link #sources} is a ConcurrentHashMap and there
+ * is a Lock for peer id in {@link PeerProcedureHandlerImpl}. So there is no race for peer
+ * operations.</li>
+ * <li>Need synchronized on {@link #walsById}. There are four methods which modify it,
+ * {@link #addPeer(String)}, {@link #removePeer(String)},
+ * {@link #cleanOldLogs(SortedSet, String, String)} and {@link #preLogRoll(Path)}. {@link #walsById}
+ * is a ConcurrentHashMap and there is a Lock for peer id in {@link PeerProcedureHandlerImpl}. So
+ * there is no race between {@link #addPeer(String)} and {@link #removePeer(String)}.
+ * {@link #cleanOldLogs(SortedSet, String, String)} is called by {@link ReplicationSourceInterface}.
+ * So no race with {@link #addPeer(String)}. {@link #removePeer(String)} will terminate the
+ * {@link ReplicationSourceInterface} firstly, then remove the wals from {@link #walsById}. So no
+ * race with {@link #removePeer(String)}. The only case need synchronized is
+ * {@link #cleanOldLogs(SortedSet, String, String)} and {@link #preLogRoll(Path)}.</li>
+ * <li>No need synchronized on {@link #walsByIdRecoveredQueues}. There are three methods which
+ * modify it, {@link #removePeer(String)} , {@link #cleanOldLogs(SortedSet, String, String)} and
+ * {@link ReplicationSourceManager.NodeFailoverWorker#run()}.
+ * {@link #cleanOldLogs(SortedSet, String, String)} is called by {@link ReplicationSourceInterface}.
+ * {@link #removePeer(String)} will terminate the {@link ReplicationSourceInterface} firstly, then
+ * remove the wals from {@link #walsByIdRecoveredQueues}. And
+ * {@link ReplicationSourceManager.NodeFailoverWorker#run()} will add the wals to
+ * {@link #walsByIdRecoveredQueues} firstly, then start up a {@link ReplicationSourceInterface}. So
+ * there is no race here. For {@link ReplicationSourceManager.NodeFailoverWorker#run()} and
+ * {@link #removePeer(String)}, there is already synchronized on {@link #oldsources}. So no need
+ * synchronized on {@link #walsByIdRecoveredQueues}.</li>
+ * <li>Need synchronized on {@link #latestPaths} to avoid the new open source miss new log.</li>
+ * <li>Need synchronized on {@link #oldsources} to avoid adding recovered source for the
+ * to-be-removed peer.</li>
* </ul>
- *
- * When a region server dies, this class uses a watcher to get notified and it
- * tries to grab a lock in order to transfer all the queues in a local
- * old source.
- *
- * This class implements the ReplicationListener interface so that it can track changes in
- * replication state.
*/
@InterfaceAudience.Private
public class ReplicationSourceManager implements ReplicationListener {
- private static final Logger LOG =
- LoggerFactory.getLogger(ReplicationSourceManager.class);
- // List of all the sources that read this RS's logs
- private final List<ReplicationSourceInterface> sources;
+ private static final Logger LOG = LoggerFactory.getLogger(ReplicationSourceManager.class);
+ // all the sources that read this RS's logs and every peer only has one replication source
+ private final ConcurrentMap<String, ReplicationSourceInterface> sources;
// List of all the sources we got from died RSs
private final List<ReplicationSourceInterface> oldsources;
private final ReplicationQueueStorage queueStorage;
@@ -100,11 +124,16 @@ public class ReplicationSourceManager implements ReplicationListener {
private final UUID clusterId;
// All about stopping
private final Server server;
+
// All logs we are currently tracking
- // Index structure of the map is: peer_id->logPrefix/logGroup->logs
- private final Map<String, Map<String, SortedSet<String>>> walsById;
+ // Index structure of the map is: queue_id->logPrefix/logGroup->logs
+ // For normal replication source, the peer id is same with the queue id
+ private final ConcurrentMap<String, Map<String, SortedSet<String>>> walsById;
// Logs for recovered sources we are currently tracking
- private final Map<String, Map<String, SortedSet<String>>> walsByIdRecoveredQueues;
+ // the map is: queue_id->logPrefix/logGroup->logs
+ // For recovered source, the queue id's format is peer_id-servername-*
+ private final ConcurrentMap<String, Map<String, SortedSet<String>>> walsByIdRecoveredQueues;
+
private final Configuration conf;
private final FileSystem fs;
// The paths to the latest log of each wal group, for new coming peers
@@ -142,22 +171,22 @@ public class ReplicationSourceManager implements ReplicationListener {
ReplicationPeers replicationPeers, ReplicationTracker replicationTracker, Configuration conf,
Server server, FileSystem fs, Path logDir, Path oldLogDir, UUID clusterId,
WALFileLengthProvider walFileLengthProvider) throws IOException {
- //CopyOnWriteArrayList is thread-safe.
- //Generally, reading is more than modifying.
- this.sources = new CopyOnWriteArrayList<>();
+ // ConcurrentHashMap is thread-safe.
+ // Generally, reading is more than modifying.
+ this.sources = new ConcurrentHashMap<>();
this.queueStorage = queueStorage;
this.replicationPeers = replicationPeers;
this.replicationTracker = replicationTracker;
this.server = server;
- this.walsById = new HashMap<>();
+ this.walsById = new ConcurrentHashMap<>();
this.walsByIdRecoveredQueues = new ConcurrentHashMap<>();
- this.oldsources = new CopyOnWriteArrayList<>();
+ this.oldsources = new ArrayList<>();
this.conf = conf;
this.fs = fs;
this.logDir = logDir;
this.oldLogDir = oldLogDir;
- this.sleepBeforeFailover =
- conf.getLong("replication.sleep.before.failover", 30000); // 30 seconds
+ this.sleepBeforeFailover = conf.getLong("replication.sleep.before.failover", 30000); // 30
+ // seconds
this.clusterId = clusterId;
this.walFileLengthProvider = walFileLengthProvider;
this.replicationTracker.registerListener(this);
@@ -166,89 +195,36 @@ public class ReplicationSourceManager implements ReplicationListener {
int nbWorkers = conf.getInt("replication.executor.workers", 1);
// use a short 100ms sleep since this could be done inline with a RS startup
// even if we fail, other region servers can take care of it
- this.executor = new ThreadPoolExecutor(nbWorkers, nbWorkers,
- 100, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>());
+ this.executor = new ThreadPoolExecutor(nbWorkers, nbWorkers, 100, TimeUnit.MILLISECONDS,
+ new LinkedBlockingQueue<>());
ThreadFactoryBuilder tfb = new ThreadFactoryBuilder();
tfb.setNameFormat("ReplicationExecutor-%d");
tfb.setDaemon(true);
this.executor.setThreadFactory(tfb.build());
this.latestPaths = new HashSet<Path>();
- replicationForBulkLoadDataEnabled =
- conf.getBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY,
- HConstants.REPLICATION_BULKLOAD_ENABLE_DEFAULT);
+ replicationForBulkLoadDataEnabled = conf.getBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY,
+ HConstants.REPLICATION_BULKLOAD_ENABLE_DEFAULT);
this.replicationWaitTime = conf.getLong(HConstants.REPLICATION_SERIALLY_WAITING_KEY,
- HConstants.REPLICATION_SERIALLY_WAITING_DEFAULT);
+ HConstants.REPLICATION_SERIALLY_WAITING_DEFAULT);
connection = ConnectionFactory.createConnection(conf);
}
- @FunctionalInterface
- private interface ReplicationQueueOperation {
- void exec() throws ReplicationException;
- }
-
- private void abortWhenFail(ReplicationQueueOperation op) {
- try {
- op.exec();
- } catch (ReplicationException e) {
- server.abort("Failed to operate on replication queue", e);
- }
- }
-
/**
- * Provide the id of the peer and a log key and this method will figure which
- * wal it belongs to and will log, for this region server, the current
- * position. It will also clean old logs from the queue.
- * @param log Path to the log currently being replicated from
- * replication status in zookeeper. It will also delete older entries.
- * @param id id of the peer cluster
- * @param position current location in the log
- * @param queueRecovered indicates if this queue comes from another region server
- * @param holdLogInZK if true then the log is retained in ZK
- */
- public void logPositionAndCleanOldLogs(Path log, String id, long position, boolean queueRecovered,
- boolean holdLogInZK) {
- String fileName = log.getName();
- abortWhenFail(
- () -> this.queueStorage.setWALPosition(server.getServerName(), id, fileName, position));
- if (holdLogInZK) {
- return;
- }
- cleanOldLogs(fileName, id, queueRecovered);
- }
-
- /**
- * Cleans a log file and all older files from ZK. Called when we are sure that a
- * log file is closed and has no more entries.
- * @param key Path to the log
- * @param id id of the peer cluster
- * @param queueRecovered Whether this is a recovered queue
+ * Adds a normal source per registered peer cluster and tries to process all old region server wal
+ * queues
+ * <p>
+ * The returned future is for adoptAbandonedQueues task.
*/
- public void cleanOldLogs(String key, String id, boolean queueRecovered) {
- String logPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(key);
- if (queueRecovered) {
- SortedSet<String> wals = walsByIdRecoveredQueues.get(id).get(logPrefix);
- if (wals != null && !wals.first().equals(key)) {
- cleanOldLogs(wals, key, id);
- }
- } else {
- synchronized (this.walsById) {
- SortedSet<String> wals = walsById.get(id).get(logPrefix);
- if (wals != null && !wals.first().equals(key)) {
- cleanOldLogs(wals, key, id);
- }
+ Future<?> init() throws IOException {
+ for (String id : this.replicationPeers.getAllPeerIds()) {
+ addSource(id);
+ if (replicationForBulkLoadDataEnabled) {
+ // Check if peer exists in hfile-refs queue, if not add it. This can happen in the case
+ // when a peer was added before replication for bulk loaded data was enabled.
+ throwIOExceptionWhenFail(() -> this.queueStorage.addPeerToHFileRefs(id));
}
}
- }
-
- private void cleanOldLogs(SortedSet<String> wals, String key, String id) {
- SortedSet<String> walSet = wals.headSet(key);
- if (LOG.isDebugEnabled()) {
- LOG.debug("Removing " + walSet.size() + " logs in the list: " + walSet);
- }
- for (String wal : walSet) {
- abortWhenFail(() -> this.queueStorage.removeWAL(server.getServerName(), id, wal));
- }
- walSet.clear();
+ return this.executor.submit(this::adoptAbandonedQueues);
}
private void adoptAbandonedQueues() {
@@ -264,8 +240,8 @@ public class ReplicationSourceManager implements ReplicationListener {
}
List<ServerName> otherRegionServers = replicationTracker.getListOfRegionServers().stream()
.map(ServerName::valueOf).collect(Collectors.toList());
- LOG.info("Current list of replicators: " + currentReplicators + " other RSs: "
- + otherRegionServers);
+ LOG.info(
+ "Current list of replicators: " + currentReplicators + " other RSs: " + otherRegionServers);
// Look if there's anything to process after a restart
for (ServerName rs : currentReplicators) {
@@ -276,56 +252,112 @@ public class ReplicationSourceManager implements ReplicationListener {
}
/**
- * Adds a normal source per registered peer cluster and tries to process all old region server wal
- * queues
- * <p>
- * The returned future is for adoptAbandonedQueues task.
+ * 1. Add peer to replicationPeers 2. Add the normal source and related replication queue 3. Add
+ * HFile Refs
+ * @param peerId the id of replication peer
*/
- Future<?> init() throws IOException, ReplicationException {
- for (String id : this.replicationPeers.getAllPeerIds()) {
- addSource(id);
+ public void addPeer(String peerId) throws IOException {
+ boolean added = false;
+ try {
+ added = this.replicationPeers.addPeer(peerId);
+ } catch (ReplicationException e) {
+ throw new IOException(e);
+ }
+ if (added) {
+ addSource(peerId);
if (replicationForBulkLoadDataEnabled) {
- // Check if peer exists in hfile-refs queue, if not add it. This can happen in the case
- // when a peer was added before replication for bulk loaded data was enabled.
- this.queueStorage.addPeerToHFileRefs(id);
+ throwIOExceptionWhenFail(() -> this.queueStorage.addPeerToHFileRefs(peerId));
}
}
- return this.executor.submit(this::adoptAbandonedQueues);
}
/**
- * Add sources for the given peer cluster on this region server. For the newly added peer, we only
- * need to enqueue the latest log of each wal group and do replication
- * @param id the id of the peer cluster
+ * 1. Remove peer from replicationPeers 2. Remove all the recovered sources for the specified id
+ * and related replication queues 3. Remove the normal source and related replication queue 4.
+ * Remove HFile Refs
+ * @param peerId the id of the replication peer
+ */
+ public void removePeer(String peerId) {
+ replicationPeers.removePeer(peerId);
+ String terminateMessage = "Replication stream was removed by a user";
+ List<ReplicationSourceInterface> oldSourcesToDelete = new ArrayList<>();
+ // synchronized on oldsources to avoid adding recovered source for the to-be-removed peer
+ // see NodeFailoverWorker.run
+ synchronized (this.oldsources) {
+ // First close all the recovered sources for this peer
+ for (ReplicationSourceInterface src : oldsources) {
+ if (peerId.equals(src.getPeerId())) {
+ oldSourcesToDelete.add(src);
+ }
+ }
+ for (ReplicationSourceInterface src : oldSourcesToDelete) {
+ src.terminate(terminateMessage);
+ removeRecoveredSource(src);
+ }
+ }
+ LOG.info(
+ "Number of deleted recovered sources for " + peerId + ": " + oldSourcesToDelete.size());
+ // Now close the normal source for this peer
+ ReplicationSourceInterface srcToRemove = this.sources.get(peerId);
+ if (srcToRemove != null) {
+ srcToRemove.terminate(terminateMessage);
+ removeSource(srcToRemove);
+ } else {
+ // This only happened in unit test TestReplicationSourceManager#testPeerRemovalCleanup
+ // Delete queue from storage and memory and queue id is same with peer id for normal
+ // source
+ deleteQueue(peerId);
+ this.walsById.remove(peerId);
+ }
+
+ // Remove HFile Refs
+ abortWhenFail(() -> this.queueStorage.removePeerFromHFileRefs(peerId));
+ }
+
+ /**
+ * Factory method to create a replication source
+ * @param queueId the id of the replication queue
+ * @return the created source
+ */
+ private ReplicationSourceInterface createSource(String queueId, ReplicationPeer replicationPeer)
+ throws IOException {
+ ReplicationSourceInterface src = ReplicationSourceFactory.create(conf, queueId);
+
+ MetricsSource metrics = new MetricsSource(queueId);
+ // init replication source
+ src.init(conf, fs, this, queueStorage, replicationPeer, server, queueId, clusterId,
+ walFileLengthProvider, metrics);
+ return src;
+ }
+
+ /**
+ * Add a normal source for the given peer on this region server. Meanwhile, add new replication
+ * queue to storage. For the newly added peer, we only need to enqueue the latest log of each wal
+ * group and do replication
+ * @param peerId the id of the replication peer
* @return the source that was created
*/
@VisibleForTesting
- ReplicationSourceInterface addSource(String id) throws IOException, ReplicationException {
- ReplicationPeer peer = replicationPeers.getPeer(id);
- ReplicationSourceInterface src = getReplicationSource(id, peer);
- synchronized (this.walsById) {
- this.sources.add(src);
+ ReplicationSourceInterface addSource(String peerId) throws IOException {
+ ReplicationPeer peer = replicationPeers.getPeer(peerId);
+ ReplicationSourceInterface src = createSource(peerId, peer);
+ // synchronized on latestPaths to avoid missing the new log
+ synchronized (this.latestPaths) {
+ this.sources.put(peerId, src);
Map<String, SortedSet<String>> walsByGroup = new HashMap<>();
- this.walsById.put(id, walsByGroup);
+ this.walsById.put(peerId, walsByGroup);
// Add the latest wal to that source's queue
- synchronized (latestPaths) {
- if (this.latestPaths.size() > 0) {
- for (Path logPath : latestPaths) {
- String name = logPath.getName();
- String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(name);
- SortedSet<String> logs = new TreeSet<>();
- logs.add(name);
- walsByGroup.put(walPrefix, logs);
- try {
- this.queueStorage.addWAL(server.getServerName(), id, name);
- } catch (ReplicationException e) {
- String message = "Cannot add log to queue when creating a new source, queueId=" + id +
- ", filename=" + name;
- server.stop(message);
- throw e;
- }
- src.enqueueLog(logPath);
- }
+ if (this.latestPaths.size() > 0) {
+ for (Path logPath : latestPaths) {
+ String name = logPath.getName();
+ String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(name);
+ SortedSet<String> logs = new TreeSet<>();
+ logs.add(name);
+ walsByGroup.put(walPrefix, logs);
+ // Abort RS and throw exception to make add peer failed
+ abortAndThrowIOExceptionWhenFail(
+ () -> this.queueStorage.addWAL(server.getServerName(), peerId, name));
+ src.enqueueLog(logPath);
}
}
}
@@ -333,87 +365,217 @@ public class ReplicationSourceManager implements ReplicationListener {
return src;
}
- @VisibleForTesting
- int getSizeOfLatestPath() {
- synchronized (latestPaths) {
- return latestPaths.size();
- }
- }
-
/**
- * Delete a complete queue of wals associated with a peer cluster
- * @param peerId Id of the peer cluster queue of wals to delete
+ * Close the previous replication sources of this peer id and open new sources to trigger the new
+ * replication state changes or new replication config changes. Here we don't need to change
+ * replication queue storage and only to enqueue all logs to the new replication source
+ * @param peerId the id of the replication peer
+ * @throws IOException
*/
- public void deleteSource(String peerId, boolean closeConnection) {
- abortWhenFail(() -> this.queueStorage.removeQueue(server.getServerName(), peerId));
- if (closeConnection) {
- this.replicationPeers.removePeer(peerId);
+ public void refreshSources(String peerId) throws IOException {
+ String terminateMessage = "Peer " + peerId +
+ " state or config changed. Will close the previous replication source and open a new one";
+ ReplicationPeer peer = replicationPeers.getPeer(peerId);
+ ReplicationSourceInterface src = createSource(peerId, peer);
+ // synchronized on latestPaths to avoid missing the new log
+ synchronized (this.latestPaths) {
+ ReplicationSourceInterface toRemove = this.sources.put(peerId, src);
+ if (toRemove != null) {
+ LOG.info("Terminate replication source for " + toRemove.getPeerId());
+ toRemove.terminate(terminateMessage);
+ }
+ for (SortedSet<String> walsByGroup : walsById.get(peerId).values()) {
+ walsByGroup.forEach(wal -> src.enqueueLog(new Path(this.logDir, wal)));
+ }
}
- }
+ LOG.info("Startup replication source for " + src.getPeerId());
+ src.startup();
- /**
- * Terminate the replication on this region server
- */
- public void join() {
- this.executor.shutdown();
- for (ReplicationSourceInterface source : this.sources) {
- source.terminate("Region server is closing");
+ List<ReplicationSourceInterface> toStartup = new ArrayList<>();
+ // synchronized on oldsources to avoid race with NodeFailoverWorker
+ synchronized (this.oldsources) {
+ List<String> previousQueueIds = new ArrayList<>();
+ for (ReplicationSourceInterface oldSource : this.oldsources) {
+ if (oldSource.getPeerId().equals(peerId)) {
+ previousQueueIds.add(oldSource.getQueueId());
+ oldSource.terminate(terminateMessage);
+ this.oldsources.remove(oldSource);
+ }
+ }
+ for (String queueId : previousQueueIds) {
+ ReplicationSourceInterface replicationSource = createSource(queueId, peer);
+ this.oldsources.add(replicationSource);
+ for (SortedSet<String> walsByGroup : walsByIdRecoveredQueues.get(queueId).values()) {
+ walsByGroup.forEach(wal -> src.enqueueLog(new Path(wal)));
+ }
+ toStartup.add(replicationSource);
+ }
+ }
+ for (ReplicationSourceInterface replicationSource : oldsources) {
+ replicationSource.startup();
}
}
/**
- * Get a copy of the wals of the first source on this rs
- * @return a sorted set of wal names
+ * Clear the metrics and related replication queue of the specified old source
+ * @param src source to clear
*/
- @VisibleForTesting
- Map<String, Map<String, SortedSet<String>>> getWALs() {
- return Collections.unmodifiableMap(walsById);
+ void removeRecoveredSource(ReplicationSourceInterface src) {
+ LOG.info("Done with the recovered queue " + src.getQueueId());
+ src.getSourceMetrics().clear();
+ this.oldsources.remove(src);
+ // Delete queue from storage and memory
+ deleteQueue(src.getQueueId());
+ this.walsByIdRecoveredQueues.remove(src.getQueueId());
}
/**
- * Get a copy of the wals of the recovered sources on this rs
- * @return a sorted set of wal names
+ * Clear the metrics and related replication queue of the specified normal source
+ * @param src source to clear
*/
- @VisibleForTesting
- Map<String, Map<String, SortedSet<String>>> getWalsByIdRecoveredQueues() {
- return Collections.unmodifiableMap(walsByIdRecoveredQueues);
+ void removeSource(ReplicationSourceInterface src) {
+ LOG.info("Done with the queue " + src.getQueueId());
+ src.getSourceMetrics().clear();
+ this.sources.remove(src.getPeerId());
+ // Delete queue from storage and memory
+ deleteQueue(src.getQueueId());
+ this.walsById.remove(src.getQueueId());
}
/**
- * Get a list of all the normal sources of this rs
- * @return lis of all sources
+ * Delete a complete queue of wals associated with a replication source
+ * @param queueId the id of replication queue to delete
*/
- public List<ReplicationSourceInterface> getSources() {
- return this.sources;
+ private void deleteQueue(String queueId) {
+ abortWhenFail(() -> this.queueStorage.removeQueue(server.getServerName(), queueId));
+ }
+
+ @FunctionalInterface
+ private interface ReplicationQueueOperation {
+ void exec() throws ReplicationException;
+ }
+
+ private void abortWhenFail(ReplicationQueueOperation op) {
+ try {
+ op.exec();
+ } catch (ReplicationException e) {
+ server.abort("Failed to operate on replication queue", e);
+ }
+ }
+
+ private void throwIOExceptionWhenFail(ReplicationQueueOperation op) throws IOException {
+ try {
+ op.exec();
+ } catch (ReplicationException e) {
+ throw new IOException(e);
+ }
+ }
+
+ private void abortAndThrowIOExceptionWhenFail(ReplicationQueueOperation op) throws IOException {
+ try {
+ op.exec();
+ } catch (ReplicationException e) {
+ server.abort("Failed to operate on replication queue", e);
+ throw new IOException(e);
+ }
}
/**
- * Get a list of all the old sources of this rs
- * @return list of all old sources
+ * This method will log the current position to storage. And also clean old logs from the
+ * replication queue.
+ * @param log Path to the log currently being replicated
+ * @param queueId id of the replication queue
+ * @param position current location in the log
+ * @param queueRecovered indicates if this queue comes from another region server
*/
- public List<ReplicationSourceInterface> getOldSources() {
- return this.oldsources;
+ public void logPositionAndCleanOldLogs(Path log, String queueId, long position,
+ boolean queueRecovered) {
+ String fileName = log.getName();
+ abortWhenFail(
+ () -> this.queueStorage.setWALPosition(server.getServerName(), queueId, fileName, position));
+ cleanOldLogs(fileName, queueId, queueRecovered);
}
/**
- * Get the normal source for a given peer
- * @param peerId
- * @return the normal source for the give peer if it exists, otherwise null.
+ * Cleans a log file and all older logs from replication queue. Called when we are sure that a log
+ * file is closed and has no more entries.
+ * @param log Path to the log
+ * @param queueId id of the replication queue
+ * @param queueRecovered Whether this is a recovered queue
*/
- public ReplicationSourceInterface getSource(String peerId) {
- return getSources().stream().filter(s -> s.getPeerId().equals(peerId)).findFirst().orElse(null);
+ @VisibleForTesting
+ void cleanOldLogs(String log, String queueId, boolean queueRecovered) {
+ String logPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(log);
+ if (queueRecovered) {
+ SortedSet<String> wals = walsByIdRecoveredQueues.get(queueId).get(logPrefix);
+ if (wals != null && !wals.first().equals(log)) {
+ cleanOldLogs(wals, log, queueId);
+ }
+ } else {
+ // synchronized on walsById to avoid race with preLogRoll
+ synchronized (this.walsById) {
+ SortedSet<String> wals = walsById.get(queueId).get(logPrefix);
+ if (wals != null && !wals.first().equals(log)) {
+ cleanOldLogs(wals, log, queueId);
+ }
+ }
+ }
}
- @VisibleForTesting
- List<String> getAllQueues() throws ReplicationException {
- return queueStorage.getAllQueues(server.getServerName());
+ private void cleanOldLogs(SortedSet<String> wals, String key, String id) {
+ SortedSet<String> walSet = wals.headSet(key);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Removing " + walSet.size() + " logs in the list: " + walSet);
+ }
+ for (String wal : walSet) {
+ abortWhenFail(() -> this.queueStorage.removeWAL(server.getServerName(), id, wal));
+ }
+ walSet.clear();
}
void preLogRoll(Path newLog) throws IOException {
- recordLog(newLog);
String logName = newLog.getName();
String logPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(logName);
- synchronized (latestPaths) {
+ // synchronized on latestPaths to avoid the new open source miss the new log
+ synchronized (this.latestPaths) {
+ // Add log to queue storage
+ for (ReplicationSourceInterface source : this.sources.values()) {
+ // If record log to queue storage failed, abort RS and throw exception to make log roll
+ // failed
+ abortAndThrowIOExceptionWhenFail(
+ () -> this.queueStorage.addWAL(server.getServerName(), source.getQueueId(), logName));
+ }
+
+ // synchronized on walsById to avoid race with cleanOldLogs
+ synchronized (this.walsById) {
+ // Update walsById map
+ for (Map.Entry<String, Map<String, SortedSet<String>>> entry : this.walsById.entrySet()) {
+ String peerId = entry.getKey();
+ Map<String, SortedSet<String>> walsByPrefix = entry.getValue();
+ boolean existingPrefix = false;
+ for (Map.Entry<String, SortedSet<String>> walsEntry : walsByPrefix.entrySet()) {
+ SortedSet<String> wals = walsEntry.getValue();
+ if (this.sources.isEmpty()) {
+ // If there's no slaves, don't need to keep the old wals since
+ // we only consider the last one when a new slave comes in
+ wals.clear();
+ }
+ if (logPrefix.equals(walsEntry.getKey())) {
+ wals.add(logName);
+ existingPrefix = true;
+ }
+ }
+ if (!existingPrefix) {
+ // The new log belongs to a new group, add it into this peer
+ LOG.debug("Start tracking logs for wal group " + logPrefix + " for peer " + peerId);
+ SortedSet<String> wals = new TreeSet<>();
+ wals.add(logName);
+ walsByPrefix.put(logPrefix, wals);
+ }
+ }
+ }
+
+ // Add to latestPaths
Iterator<Path> iterator = latestPaths.iterator();
while (iterator.hasNext()) {
Path path = iterator.next();
@@ -426,87 +588,21 @@ public class ReplicationSourceManager implements ReplicationListener {
}
}
- /**
- * Check and enqueue the given log to the correct source. If there's still no source for the
- * group to which the given log belongs, create one
- * @param logPath the log path to check and enqueue
- * @throws IOException
- */
- private void recordLog(Path logPath) throws IOException {
- String logName = logPath.getName();
- String logPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(logName);
- // update replication queues on ZK
- // synchronize on replicationPeers to avoid adding source for the to-be-removed peer
- synchronized (replicationPeers) {
- for (String id : replicationPeers.getAllPeerIds()) {
- try {
- this.queueStorage.addWAL(server.getServerName(), id, logName);
- } catch (ReplicationException e) {
- throw new IOException("Cannot add log to replication queue"
- + " when creating a new source, queueId=" + id + ", filename=" + logName, e);
- }
- }
- }
- // update walsById map
- synchronized (walsById) {
- for (Map.Entry<String, Map<String, SortedSet<String>>> entry : this.walsById.entrySet()) {
- String peerId = entry.getKey();
- Map<String, SortedSet<String>> walsByPrefix = entry.getValue();
- boolean existingPrefix = false;
- for (Map.Entry<String, SortedSet<String>> walsEntry : walsByPrefix.entrySet()) {
- SortedSet<String> wals = walsEntry.getValue();
- if (this.sources.isEmpty()) {
- // If there's no slaves, don't need to keep the old wals since
- // we only consider the last one when a new slave comes in
- wals.clear();
- }
- if (logPrefix.equals(walsEntry.getKey())) {
- wals.add(logName);
- existingPrefix = true;
- }
- }
- if (!existingPrefix) {
- // The new log belongs to a new group, add it into this peer
- LOG.debug("Start tracking logs for wal group " + logPrefix + " for peer " + peerId);
- SortedSet<String> wals = new TreeSet<>();
- wals.add(logName);
- walsByPrefix.put(logPrefix, wals);
- }
- }
- }
- }
-
void postLogRoll(Path newLog) throws IOException {
// This only updates the sources we own, not the recovered ones
- for (ReplicationSourceInterface source : this.sources) {
+ for (ReplicationSourceInterface source : this.sources.values()) {
source.enqueueLog(newLog);
}
}
- @VisibleForTesting
- public AtomicLong getTotalBufferUsed() {
- return totalBufferUsed;
- }
-
- /**
- * Factory method to create a replication source
- * @param peerId the id of the peer cluster
- * @return the created source
- */
- private ReplicationSourceInterface getReplicationSource(String peerId,
- ReplicationPeer replicationPeer) throws IOException {
- ReplicationSourceInterface src = ReplicationSourceFactory.create(conf, peerId);
-
- MetricsSource metrics = new MetricsSource(peerId);
- // init replication source
- src.init(conf, fs, this, queueStorage, replicationPeer, server, peerId, clusterId,
- walFileLengthProvider, metrics);
- return src;
+ @Override
+ public void regionServerRemoved(String regionserver) {
+ transferQueues(ServerName.valueOf(regionserver));
}
/**
* Transfer all the queues of the specified to this region server. First it tries to grab a lock
- * and if it works it will move the znodes and finally will delete the old znodes.
+ * and if it works it will move the old queues and finally will delete the old queues.
* <p>
* It creates one old source for any type of source of the old rs.
*/
@@ -524,102 +620,8 @@ public class ReplicationSourceManager implements ReplicationListener {
}
/**
- * Clear the references to the specified old source
- * @param src source to clear
- */
- public void closeRecoveredQueue(ReplicationSourceInterface src) {
- LOG.info("Done with the recovered queue " + src.getPeerClusterZnode());
- if (src instanceof ReplicationSource) {
- ((ReplicationSource) src).getSourceMetrics().clear();
- }
- this.oldsources.remove(src);
- deleteSource(src.getPeerClusterZnode(), false);
- this.walsByIdRecoveredQueues.remove(src.getPeerClusterZnode());
- }
-
- /**
- * Clear the references to the specified old source
- * @param src source to clear
- */
- public void closeQueue(ReplicationSourceInterface src) {
- LOG.info("Done with the queue " + src.getPeerClusterZnode());
- src.getSourceMetrics().clear();
- this.sources.remove(src);
- deleteSource(src.getPeerClusterZnode(), true);
- this.walsById.remove(src.getPeerClusterZnode());
- }
-
- public void addPeer(String id) throws ReplicationException, IOException {
- LOG.info("Trying to add peer, peerId: " + id);
- boolean added = this.replicationPeers.addPeer(id);
- if (added) {
- LOG.info("Peer " + id + " connected success, trying to start the replication source thread.");
- addSource(id);
- if (replicationForBulkLoadDataEnabled) {
- this.queueStorage.addPeerToHFileRefs(id);
- }
- }
- }
-
- /**
- * Thie method first deletes all the recovered sources for the specified
- * id, then deletes the normal source (deleting all related data in ZK).
- * @param id The id of the peer cluster
- */
- public void removePeer(String id) {
- LOG.info("Closing the following queue " + id + ", currently have "
- + sources.size() + " and another "
- + oldsources.size() + " that were recovered");
- String terminateMessage = "Replication stream was removed by a user";
- List<ReplicationSourceInterface> oldSourcesToDelete = new ArrayList<>();
- // synchronized on oldsources to avoid adding recovered source for the to-be-removed peer
- // see NodeFailoverWorker.run
- synchronized (oldsources) {
- // First close all the recovered sources for this peer
- for (ReplicationSourceInterface src : oldsources) {
- if (id.equals(src.getPeerId())) {
- oldSourcesToDelete.add(src);
- }
- }
- for (ReplicationSourceInterface src : oldSourcesToDelete) {
- src.terminate(terminateMessage);
- closeRecoveredQueue(src);
- }
- }
- LOG.info("Number of deleted recovered sources for " + id + ": "
- + oldSourcesToDelete.size());
- // Now look for the one on this cluster
- List<ReplicationSourceInterface> srcToRemove = new ArrayList<>();
- // synchronize on replicationPeers to avoid adding source for the to-be-removed peer
- synchronized (this.replicationPeers) {
- for (ReplicationSourceInterface src : this.sources) {
- if (id.equals(src.getPeerId())) {
- srcToRemove.add(src);
- }
- }
- if (srcToRemove.isEmpty()) {
- LOG.error("The peer we wanted to remove is missing a ReplicationSourceInterface. " +
- "This could mean that ReplicationSourceInterface initialization failed for this peer " +
- "and that replication on this peer may not be caught up. peerId=" + id);
- }
- for (ReplicationSourceInterface toRemove : srcToRemove) {
- toRemove.terminate(terminateMessage);
- closeQueue(toRemove);
- }
- deleteSource(id, true);
- }
- // Remove HFile Refs znode from zookeeper
- abortWhenFail(() -> this.queueStorage.removePeerFromHFileRefs(id));
- }
-
- @Override
- public void regionServerRemoved(String regionserver) {
- transferQueues(ServerName.valueOf(regionserver));
- }
-
- /**
- * Class responsible to setup new ReplicationSources to take care of the
- * queues from dead region servers.
+ * Class responsible to setup new ReplicationSources to take care of the queues from dead region
+ * servers.
*/
class NodeFailoverWorker extends Thread {
@@ -649,10 +651,10 @@ public class ReplicationSourceManager implements ReplicationListener {
}
Map<String, Set<String>> newQueues = new HashMap<>();
try {
- List<String> peers = queueStorage.getAllQueues(deadRS);
- while (!peers.isEmpty()) {
+ List<String> queues = queueStorage.getAllQueues(deadRS);
+ while (!queues.isEmpty()) {
Pair<String, SortedSet<String>> peer = queueStorage.claimQueue(deadRS,
- peers.get(ThreadLocalRandom.current().nextInt(peers.size())), server.getServerName());
+ queues.get(ThreadLocalRandom.current().nextInt(queues.size())), server.getServerName());
long sleep = sleepBeforeFailover / 2;
if (!peer.getSecond().isEmpty()) {
newQueues.put(peer.getFirst(), peer.getSecond());
@@ -664,9 +666,9 @@ public class ReplicationSourceManager implements ReplicationListener {
LOG.warn("Interrupted while waiting before transferring a queue.");
Thread.currentThread().interrupt();
}
- peers = queueStorage.getAllQueues(deadRS);
+ queues = queueStorage.getAllQueues(deadRS);
}
- if (!peers.isEmpty()) {
+ if (queues.isEmpty()) {
queueStorage.removeReplicatorIfQueueIsEmpty(deadRS);
}
} catch (ReplicationException e) {
@@ -681,23 +683,23 @@ public class ReplicationSourceManager implements ReplicationListener {
}
for (Map.Entry<String, Set<String>> entry : newQueues.entrySet()) {
- String peerId = entry.getKey();
+ String queueId = entry.getKey();
Set<String> walsSet = entry.getValue();
try {
// there is not an actual peer defined corresponding to peerId for the failover.
- ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(peerId);
+ ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(queueId);
String actualPeerId = replicationQueueInfo.getPeerId();
ReplicationPeer peer = replicationPeers.getPeer(actualPeerId);
if (peer == null) {
- LOG.warn("Skipping failover for peer:" + actualPeerId + " of node " + deadRS
- + ", peer is null");
- abortWhenFail(() -> queueStorage.removeQueue(server.getServerName(), peerId));
+ LOG.warn("Skipping failover for peer:" + actualPeerId + " of node " + deadRS +
+ ", peer is null");
+ abortWhenFail(() -> queueStorage.removeQueue(server.getServerName(), queueId));
continue;
}
// track sources in walsByIdRecoveredQueues
Map<String, SortedSet<String>> walsByGroup = new HashMap<>();
- walsByIdRecoveredQueues.put(peerId, walsByGroup);
+ walsByIdRecoveredQueues.put(queueId, walsByGroup);
for (String wal : walsSet) {
String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal);
SortedSet<String> wals = walsByGroup.get(walPrefix);
@@ -708,14 +710,12 @@ public class ReplicationSourceManager implements ReplicationListener {
wals.add(wal);
}
- // enqueue sources
- ReplicationSourceInterface src = getReplicationSource(peerId, peer);
+ ReplicationSourceInterface src = createSource(queueId, peer);
// synchronized on oldsources to avoid adding recovered source for the to-be-removed peer
- // see removePeer
synchronized (oldsources) {
if (!replicationPeers.getAllPeerIds().contains(src.getPeerId())) {
src.terminate("Recovered queue doesn't belong to any current peer");
- closeRecoveredQueue(src);
+ removeRecoveredSource(src);
continue;
}
oldsources.add(src);
@@ -733,6 +733,82 @@ public class ReplicationSourceManager implements ReplicationListener {
}
/**
+ * Terminate the replication on this region server
+ */
+ public void join() {
+ this.executor.shutdown();
+ for (ReplicationSourceInterface source : this.sources.values()) {
+ source.terminate("Region server is closing");
+ }
+ }
+
+ /**
+ * Get an unmodifiable view of the wals of the normal sources on this rs
+ * @return a read-only map from id to wal group prefix to the sorted wal names
+ */
+ @VisibleForTesting
+ Map<String, Map<String, SortedSet<String>>> getWALs() {
+ return Collections.unmodifiableMap(walsById);
+ }
+
+ /**
+ * Get an unmodifiable view of the wals of the recovered sources on this rs
+ * @return a read-only map from queue id to wal group prefix to the sorted wal names
+ */
+ @VisibleForTesting
+ Map<String, Map<String, SortedSet<String>>> getWalsByIdRecoveredQueues() {
+ return Collections.unmodifiableMap(walsByIdRecoveredQueues);
+ }
+
+ /**
+ * Get a list of all the normal sources of this rs
+ * @return list of all normal sources
+ */
+ public List<ReplicationSourceInterface> getSources() {
+ return new ArrayList<>(this.sources.values());
+ }
+
+ /**
+ * Get a list of all the recovered sources of this rs
+ * @return list of all recovered sources
+ */
+ public List<ReplicationSourceInterface> getOldSources() {
+ return this.oldsources;
+ }
+
+ /**
+ * Get the normal source for a given peer
+ * @return the normal source for the given peer if it exists, otherwise null.
+ */
+ @VisibleForTesting
+ public ReplicationSourceInterface getSource(String peerId) {
+ return this.sources.get(peerId);
+ }
+
+ @VisibleForTesting
+ List<String> getAllQueues() throws IOException {
+ List<String> allQueues = Collections.emptyList();
+ try {
+ allQueues = queueStorage.getAllQueues(server.getServerName());
+ } catch (ReplicationException e) {
+ throw new IOException(e);
+ }
+ return allQueues;
+ }
+
+ @VisibleForTesting
+ int getSizeOfLatestPath() {
+ synchronized (latestPaths) {
+ return latestPaths.size();
+ }
+ }
+
+ @VisibleForTesting
+ public AtomicLong getTotalBufferUsed() {
+ return totalBufferUsed;
+ }
+
+ /**
* Get the directory where wals are archived
* @return the directory where wals are archived
*/
@@ -764,28 +840,30 @@ public class ReplicationSourceManager implements ReplicationListener {
* Get the ReplicationPeers used by this ReplicationSourceManager
* @return the ReplicationPeers used by this ReplicationSourceManager
*/
- public ReplicationPeers getReplicationPeers() {return this.replicationPeers;}
+ public ReplicationPeers getReplicationPeers() {
+ return this.replicationPeers;
+ }
/**
* Get a string representation of all the sources' metrics
*/
public String getStats() {
StringBuilder stats = new StringBuilder();
- for (ReplicationSourceInterface source : sources) {
+ for (ReplicationSourceInterface source : this.sources.values()) {
stats.append("Normal source for cluster " + source.getPeerId() + ": ");
stats.append(source.getStats() + "\n");
}
for (ReplicationSourceInterface oldSource : oldsources) {
- stats.append("Recovered source for cluster/machine(s) " + oldSource.getPeerId()+": ");
- stats.append(oldSource.getStats()+ "\n");
+ stats.append("Recovered source for cluster/machine(s) " + oldSource.getPeerId() + ": ");
+ stats.append(oldSource.getStats() + "\n");
}
return stats.toString();
}
public void addHFileRefs(TableName tableName, byte[] family, List<Pair<Path, Path>> pairs)
- throws ReplicationException {
- for (ReplicationSourceInterface source : this.sources) {
- source.addHFileRefs(tableName, family, pairs);
+ throws IOException {
+ for (ReplicationSourceInterface source : this.sources.values()) {
+ throwIOExceptionWhenFail(() -> source.addHFileRefs(tableName, family, pairs));
}
}
@@ -798,11 +876,10 @@ public class ReplicationSourceManager implements ReplicationListener {
}
/**
- * Whether an entry can be pushed to the peer or not right now.
- * If we enable serial replication, we can not push the entry until all entries in its region
- * whose sequence numbers are smaller than this entry have been pushed.
- * For each ReplicationSource, we need only check the first entry in each region, as long as it
- * can be pushed, we can push all in this ReplicationSource.
+ * Whether an entry can be pushed to the peer or not right now. If we enable serial replication,
+ * we can not push the entry until all entries in its region whose sequence numbers are smaller
+ * than this entry have been pushed. For each ReplicationSource, we need only check the first
+ * entry in each region, as long as it can be pushed, we can push all in this ReplicationSource.
* This method will be blocked until we can push.
* @return the first barrier of entry's region, or -1 if there is no barrier. It is used to
* prevent saving positions in the region of no barrier.
@@ -813,22 +890,18 @@ public class ReplicationSourceManager implements ReplicationListener {
/**
* There are barriers for this region and position for this peer. N barriers form N intervals,
* (b1,b2) (b2,b3) ... (bn,max). Generally, there is no logs whose seq id is not greater than
- * the first barrier and the last interval is start from the last barrier.
- *
- * There are several conditions that we can push now, otherwise we should block:
- * 1) "Serial replication" is not enabled, we can push all logs just like before. This case
- * should not call this method.
- * 2) There is no barriers for this region, or the seq id is smaller than the first barrier.
- * It is mainly because we alter REPLICATION_SCOPE = 2. We can not guarantee the
- * order of logs that is written before altering.
- * 3) This entry is in the first interval of barriers. We can push them because it is the
- * start of a region. But if the region is created by region split, we should check
- * if the parent regions are fully pushed.
- * 4) If the entry's seq id and the position are in same section, or the pos is the last
- * number of previous section. Because when open a region we put a barrier the number
- * is the last log's id + 1.
- * 5) Log's seq is smaller than pos in meta, we are retrying. It may happen when a RS crashes
- * after save replication meta and before save zk offset.
+ * the first barrier and the last interval starts from the last barrier. There are several
+ * conditions that we can push now, otherwise we should block: 1) "Serial replication" is not
+ * enabled, we can push all logs just like before. This case should not call this method. 2)
+ * There is no barriers for this region, or the seq id is smaller than the first barrier. It is
+ * mainly because we alter REPLICATION_SCOPE = 2. We can not guarantee the order of logs that is
+ * written before altering. 3) This entry is in the first interval of barriers. We can push them
+ * because it is the start of a region. But if the region is created by region split, we should
+ * check if the parent regions are fully pushed. 4) If the entry's seq id and the position are
+ * in same section, or the pos is the last number of previous section. Because when open a
+ * region we put a barrier the number is the last log's id + 1. 5) Log's seq is smaller than pos
+ * in meta, we are retrying. It may happen when a RS crashes after save replication meta and
+ * before save zk offset.
*/
List<Long> barriers = MetaTableAccessor.getReplicationBarriers(connection, encodedName);
if (barriers.isEmpty() || seq <= barriers.get(0)) {
@@ -842,8 +915,8 @@ public class ReplicationSourceManager implements ReplicationListener {
if (interval == 1) {
// Case 3
// Check if there are parent regions
- String parentValue = MetaTableAccessor.getSerialReplicationParentRegion(connection,
- encodedName);
+ String parentValue =
+ MetaTableAccessor.getSerialReplicationParentRegion(connection, encodedName);
if (parentValue == null) {
// This region has no parent or the parent's log entries are fully pushed.
return;
@@ -855,16 +928,17 @@ public class ReplicationSourceManager implements ReplicationListener {
byte[] region = Bytes.toBytes(parent);
long pos = MetaTableAccessor.getReplicationPositionForOnePeer(connection, region, peerId);
List<Long> parentBarriers = MetaTableAccessor.getReplicationBarriers(connection, region);
- if (parentBarriers.size() > 0
- && parentBarriers.get(parentBarriers.size() - 1) - 1 > pos) {
+ if (parentBarriers.size() > 0 &&
+ parentBarriers.get(parentBarriers.size() - 1) - 1 > pos) {
allParentDone = false;
// For a closed region, we will write a close event marker to WAL whose sequence id is
// larger than final barrier but still smaller than next region's openSeqNum.
// So if the pos is larger than last barrier, we can say we have read the event marker
// which means the parent region has been fully pushed.
- LOG.info(Bytes.toString(encodedName) + " can not start pushing because parent region's"
- + " log has not been fully pushed: parent=" + Bytes.toString(region) + " pos=" + pos
- + " barriers=" + Arrays.toString(barriers.toArray()));
+ LOG.info(
+ Bytes.toString(encodedName) + " can not start pushing because parent region's" +
+ " log has not been fully pushed: parent=" + Bytes.toString(region) + " pos=" + pos +
+ " barriers=" + Arrays.toString(barriers.toArray()));
break;
}
}
@@ -878,7 +952,8 @@ public class ReplicationSourceManager implements ReplicationListener {
}
while (true) {
- long pos = MetaTableAccessor.getReplicationPositionForOnePeer(connection, encodedName, peerId);
+ long pos =
+ MetaTableAccessor.getReplicationPositionForOnePeer(connection, encodedName, peerId);
if (seq <= pos) {
// Case 5
}
@@ -893,9 +968,9 @@ public class ReplicationSourceManager implements ReplicationListener {
}
}
- LOG.info(Bytes.toString(encodedName) + " can not start pushing to peer " + peerId
- + " because previous log has not been pushed: sequence=" + seq + " pos=" + pos
- + " barriers=" + Arrays.toString(barriers.toArray()));
+ LOG.info(Bytes.toString(encodedName) + " can not start pushing to peer " + peerId +
+ " because previous log has not been pushed: sequence=" + seq + " pos=" + pos +
+ " barriers=" + Arrays.toString(barriers.toArray()));
Thread.sleep(replicationWaitTime);
}
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/1f0bfe01/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java
index ea98cda..808f738 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceShipper.java
@@ -277,8 +277,8 @@ public class ReplicationSourceShipper extends Thread {
}
protected void updateLogPosition(long lastReadPosition) {
- source.getSourceManager().logPositionAndCleanOldLogs(currentPath, source.getPeerClusterZnode(),
- lastReadPosition, false, false);
+ source.getSourceManager().logPositionAndCleanOldLogs(currentPath, source.getQueueId(),
+ lastReadPosition, false);
lastLoggedPosition = lastReadPosition;
}
@@ -295,7 +295,7 @@ public class ReplicationSourceShipper extends Thread {
public void startup(UncaughtExceptionHandler handler) {
String name = Thread.currentThread().getName();
Threads.setDaemonThreadRunning(this, name + ".replicationSource." + walGroupId + ","
- + source.getPeerClusterZnode(), handler);
+ + source.getQueueId(), handler);
}
public PriorityBlockingQueue<Path> getLogQueue() {
http://git-wip-us.apache.org/repos/asf/hbase/blob/1f0bfe01/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java
index 4643a22..90a421d 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSourceWALReader.java
@@ -115,7 +115,7 @@ public class ReplicationSourceWALReader extends Thread {
this.conf.getInt("replication.source.maxretriesmultiplier", 300); // 5 minutes @ 1 sec per
this.eofAutoRecovery = conf.getBoolean("replication.source.eof.autorecovery", false);
this.entryBatchQueue = new LinkedBlockingQueue<>(batchCount);
- LOG.info("peerClusterZnode=" + source.getPeerClusterZnode()
+ LOG.info("peerClusterZnode=" + source.getQueueId()
+ ", ReplicationSourceWALReaderThread : " + source.getPeerId()
+ " inited, replicationBatchSizeCapacity=" + replicationBatchSizeCapacity
+ ", replicationBatchCountCapacity=" + replicationBatchCountCapacity
http://git-wip-us.apache.org/repos/asf/hbase/blob/1f0bfe01/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/ReplicationSourceDummy.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/ReplicationSourceDummy.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/ReplicationSourceDummy.java
index 38ec598..ff20ddc 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/ReplicationSourceDummy.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/ReplicationSourceDummy.java
@@ -89,7 +89,7 @@ public class ReplicationSourceDummy implements ReplicationSourceInterface {
}
@Override
- public String getPeerClusterZnode() {
+ public String getQueueId() {
return peerClusterId;
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/1f0bfe01/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestNamespaceReplication.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestNamespaceReplication.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestNamespaceReplication.java
index ed71123..40a955e 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestNamespaceReplication.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestNamespaceReplication.java
@@ -30,12 +30,11 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
-import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
+import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
@@ -43,6 +42,8 @@ import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
+import org.apache.hadoop.hbase.client.TableDescriptor;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.AfterClass;
@@ -68,9 +69,6 @@ public class TestNamespaceReplication extends TestReplicationBase {
private static final byte[] val = Bytes.toBytes("myval");
- private static HTableDescriptor tabA;
- private static HTableDescriptor tabB;
-
private static Connection connection1;
private static Connection connection2;
private static Admin admin1;
@@ -90,23 +88,21 @@ public class TestNamespaceReplication extends TestReplicationBase {
admin2.createNamespace(NamespaceDescriptor.create(ns1).build());
admin2.createNamespace(NamespaceDescriptor.create(ns2).build());
- tabA = new HTableDescriptor(tabAName);
- HColumnDescriptor fam = new HColumnDescriptor(f1Name);
- fam.setScope(HConstants.REPLICATION_SCOPE_GLOBAL);
- tabA.addFamily(fam);
- fam = new HColumnDescriptor(f2Name);
- fam.setScope(HConstants.REPLICATION_SCOPE_GLOBAL);
- tabA.addFamily(fam);
+ TableDescriptorBuilder builder = TableDescriptorBuilder.newBuilder(tabAName);
+ builder.addColumnFamily(ColumnFamilyDescriptorBuilder
+ .newBuilder(f1Name).setScope(HConstants.REPLICATION_SCOPE_GLOBAL).build());
+ builder.addColumnFamily(ColumnFamilyDescriptorBuilder
+ .newBuilder(f2Name).setScope(HConstants.REPLICATION_SCOPE_GLOBAL).build());
+ TableDescriptor tabA = builder.build();
admin1.createTable(tabA);
admin2.createTable(tabA);
- tabB = new HTableDescriptor(tabBName);
- fam = new HColumnDescriptor(f1Name);
- fam.setScope(HConstants.REPLICATION_SCOPE_GLOBAL);
- tabB.addFamily(fam);
- fam = new HColumnDescriptor(f2Name);
- fam.setScope(HConstants.REPLICATION_SCOPE_GLOBAL);
- tabB.addFamily(fam);
+ builder = TableDescriptorBuilder.newBuilder(tabBName);
+ builder.addColumnFamily(ColumnFamilyDescriptorBuilder
+ .newBuilder(f1Name).setScope(HConstants.REPLICATION_SCOPE_GLOBAL).build());
+ builder.addColumnFamily(ColumnFamilyDescriptorBuilder
+ .newBuilder(f2Name).setScope(HConstants.REPLICATION_SCOPE_GLOBAL).build());
+ TableDescriptor tabB = builder.build();
admin1.createTable(tabB);
admin2.createTable(tabB);
}
@@ -134,22 +130,24 @@ public class TestNamespaceReplication extends TestReplicationBase {
@Test
public void testNamespaceReplication() throws Exception {
+ String peerId = "2";
+
Table htab1A = connection1.getTable(tabAName);
Table htab2A = connection2.getTable(tabAName);
Table htab1B = connection1.getTable(tabBName);
Table htab2B = connection2.getTable(tabBName);
- ReplicationPeerConfig rpc = admin.getPeerConfig("2");
- rpc.setReplicateAllUserTables(false);
- admin.updatePeerConfig("2", rpc);
+ ReplicationPeerConfig rpc = admin1.getReplicationPeerConfig(peerId);
+ admin1.updateReplicationPeerConfig(peerId,
+ ReplicationPeerConfig.newBuilder(rpc).setReplicateAllUserTables(false).build());
// add ns1 to peer config which replicate to cluster2
- rpc = admin.getPeerConfig("2");
+ rpc = admin1.getReplicationPeerConfig(peerId);
Set<String> namespaces = new HashSet<>();
namespaces.add(ns1);
- rpc.setNamespaces(namespaces);
- admin.updatePeerConfig("2", rpc);
+ admin1.updateReplicationPeerConfig(peerId,
+ ReplicationPeerConfig.newBuilder(rpc).setNamespaces(namespaces).build());
LOG.info("update peer config");
// Table A can be replicated to cluster2
@@ -163,15 +161,14 @@ public class TestNamespaceReplication extends TestReplicationBase {
ensureRowNotExisted(htab2B, row, f1Name, f2Name);
// add ns1:TA => 'f1' and ns2 to peer config which replicate to cluster2
- rpc = admin.getPeerConfig("2");
+ rpc = admin1.getReplicationPeerConfig(peerId);
namespaces = new HashSet<>();
namespaces.add(ns2);
- rpc.setNamespaces(namespaces);
Map<TableName, List<String>> tableCfs = new HashMap<>();
tableCfs.put(tabAName, new ArrayList<>());
tableCfs.get(tabAName).add("f1");
- rpc.setTableCFsMap(tableCfs);
- admin.updatePeerConfig("2", rpc);
+ admin1.updateReplicationPeerConfig(peerId, ReplicationPeerConfig.newBuilder(rpc)
+ .setNamespaces(namespaces).setTableCFsMap(tableCfs).build());
LOG.info("update peer config");
// Only family f1 of Table A can replicated to cluster2
@@ -186,7 +183,7 @@ public class TestNamespaceReplication extends TestReplicationBase {
delete(htab1B, row, f1Name, f2Name);
ensureRowNotExisted(htab2B, row, f1Name, f2Name);
- admin.removePeer("2");
+ admin1.removeReplicationPeer(peerId);
}
private void put(Table source, byte[] row, byte[]... families)
http://git-wip-us.apache.org/repos/asf/hbase/blob/1f0bfe01/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java
index 1001aa5..33216cb 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManager.java
@@ -1,5 +1,4 @@
-/*
- *
+/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -95,12 +94,12 @@ import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
-import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.BulkLoadDescriptor;
-
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations;
+import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.BulkLoadDescriptor;
+
/**
* An abstract class that tests ReplicationSourceManager. Classes that extend this class should
* set up the proper config for this class and initialize the proper cluster using
@@ -313,7 +312,7 @@ public abstract class TestReplicationSourceManager {
wal.rollWriter();
manager.logPositionAndCleanOldLogs(manager.getSources().get(0).getCurrentPath(),
- "1", 0, false, false);
+ "1", 0, false);
wal.append(hri,
new WALKeyImpl(hri.getEncodedNameAsBytes(), test, System.currentTimeMillis(), mvcc, scopes),
http://git-wip-us.apache.org/repos/asf/hbase/blob/1f0bfe01/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManagerZkImpl.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManagerZkImpl.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManagerZkImpl.java
index c6d9eef..490c4b5 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManagerZkImpl.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSourceManagerZkImpl.java
@@ -20,7 +20,6 @@ package org.apache.hadoop.hbase.replication.regionserver;
import static org.junit.Assert.assertTrue;
import java.util.List;
-
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.Server;