You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hbase.apache.org by GitBox <gi...@apache.org> on 2021/02/11 21:20:31 UTC

[GitHub] [hbase] bharathv commented on a change in pull request #2945: [HBASE-25539] Add age of oldest wal metric

bharathv commented on a change in pull request #2945:
URL: https://github.com/apache/hbase/pull/2945#discussion_r574692860



##########
File path: hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/replication/regionserver/MetricsReplicationSourceSource.java
##########
@@ -76,4 +78,6 @@
   long getWALEditsRead();
   long getShippedOps();
   long getEditsFiltered();
+  void setOldestWalAge(long age);

Review comment:
       nit: Add a quick javadoc noting that the age is in milliseconds since the WAL's creation time? (It is not obvious otherwise.)

##########
File path: hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLogQueue.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.replication.regionserver;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.apache.yetus.audience.InterfaceStability;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+  Class that does enqueueing/dequeueing of wal at one place so that we can update the metrics
+  just at one place.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class ReplicationLogQueue {
+  // Queues of logs to process, entry in format of walGroupId->queue,
+  // each presents a queue for one wal group
+  private Map<String, PriorityBlockingQueue<Path>> queues = new HashMap<>();
+  private MetricsSource metrics;
+  private Configuration conf;
+  // per group queue size, keep no more than this number of logs in each wal group
+  private int queueSizePerGroup;
+  // WARN threshold for the number of queued logs, defaults to 2
+  private int logQueueWarnThreshold;
+  private ReplicationSource source;
+  private static final Logger LOG = LoggerFactory.getLogger(ReplicationSource.class);

Review comment:
       nit: move static finals to the top of the class.

##########
File path: hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSource.java
##########
@@ -590,5 +595,60 @@ public void testAbortTrueOnError() throws IOException {
       rss.stop("Done");
     }
   }
-}
 
+  /*
+    Testing age of oldest wal metric.
+  */
+  @Test
+  public void testAgeOfOldestWal() throws Exception {
+    try {
+      ManualEnvironmentEdge manualEdge = new ManualEnvironmentEdge();
+      EnvironmentEdgeManager.injectEdge(manualEdge);
+
+      String id = "1";
+      MetricsSource metrics = new MetricsSource(id);
+      Configuration conf = new Configuration(TEST_UTIL.getConfiguration());
+      conf.setInt("replication.source.maxretriesmultiplier", 1);
+      ReplicationPeer mockPeer = Mockito.mock(ReplicationPeer.class);
+      Mockito.when(mockPeer.getConfiguration()).thenReturn(conf);
+      Mockito.when(mockPeer.getPeerBandwidth()).thenReturn(0L);
+      ReplicationPeerConfig peerConfig = Mockito.mock(ReplicationPeerConfig.class);
+      Mockito.when(peerConfig.getReplicationEndpointImpl()).
+        thenReturn(DoNothingReplicationEndpoint.class.getName());
+      Mockito.when(mockPeer.getPeerConfig()).thenReturn(peerConfig);
+      ReplicationSourceManager manager = Mockito.mock(ReplicationSourceManager.class);
+      Mockito.when(manager.getTotalBufferUsed()).thenReturn(new AtomicLong());
+      Mockito.when(manager.getGlobalMetrics()).
+        thenReturn(mock(MetricsReplicationGlobalSourceSource.class));
+      RegionServerServices rss =
+        TEST_UTIL.createMockRegionServerService(ServerName.parseServerName("a.b.c,1,1"));
+
+      ReplicationSource source = new ReplicationSource();
+      source.init(conf, null, manager, null, mockPeer, rss, id, null,
+        p -> OptionalLong.empty(), metrics);
+
+      final Path log1 = new Path(logDir, "log-walgroup-a.8");
+      manualEdge.setValue(10);
+      // Diff of current time (10) and  log-walgroup-a.8 timestamp will be 2.
+      source.enqueueLog(log1);
+      MetricsReplicationSourceSource metricsSource1 = getSourceMetrics(id);
+      assertEquals(2, metricsSource1.getOldestWalAge());
+
+      final Path log2 = new Path(logDir, "log-walgroup-b.4");
+      // Diff of current time (10) and log-walgroup-b.4 will be 6 so oldestWalAge should be 6
+      source.enqueueLog(log2);
+      assertEquals(6, metricsSource1.getOldestWalAge());
+      // Clear all metrics.

Review comment:
       A couple more scenarios that I think are worth testing:
   
   - dequeue the entry with the existing max age ts, and make sure the new max age is updated
   - RecoveredReplicationSource case.

##########
File path: hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLogQueue.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.replication.regionserver;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.apache.yetus.audience.InterfaceStability;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+  Class that does enqueueing/dequeueing of wal at one place so that we can update the metrics

Review comment:
       nit: typo, "dequeueing" should be "dequeuing".

##########
File path: hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLogQueue.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.replication.regionserver;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.apache.yetus.audience.InterfaceStability;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+  Class that does enqueueing/dequeueing of wal at one place so that we can update the metrics
+  just at one place.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class ReplicationLogQueue {
+  // Queues of logs to process, entry in format of walGroupId->queue,
+  // each presents a queue for one wal group
+  private Map<String, PriorityBlockingQueue<Path>> queues = new HashMap<>();
+  private MetricsSource metrics;
+  private Configuration conf;
+  // per group queue size, keep no more than this number of logs in each wal group
+  private int queueSizePerGroup;
+  // WARN threshold for the number of queued logs, defaults to 2
+  private int logQueueWarnThreshold;
+  private ReplicationSource source;
+  private static final Logger LOG = LoggerFactory.getLogger(ReplicationSource.class);
+
+
+  public ReplicationLogQueue(Configuration conf, MetricsSource metrics, ReplicationSource source) {

Review comment:
       nit: thinking out loud, should we rename it to ReplicationSourceLogQueue to better convey that this is per source across all walGroups? 

##########
File path: hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLogQueue.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.replication.regionserver;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.apache.yetus.audience.InterfaceStability;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+  Class that does enqueueing/dequeueing of wal at one place so that we can update the metrics
+  just at one place.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class ReplicationLogQueue {
+  // Queues of logs to process, entry in format of walGroupId->queue,
+  // each presents a queue for one wal group
+  private Map<String, PriorityBlockingQueue<Path>> queues = new HashMap<>();
+  private MetricsSource metrics;
+  private Configuration conf;
+  // per group queue size, keep no more than this number of logs in each wal group
+  private int queueSizePerGroup;
+  // WARN threshold for the number of queued logs, defaults to 2
+  private int logQueueWarnThreshold;
+  private ReplicationSource source;
+  private static final Logger LOG = LoggerFactory.getLogger(ReplicationSource.class);
+
+
+  public ReplicationLogQueue(Configuration conf, MetricsSource metrics, ReplicationSource source) {
+    this.conf = conf;
+    this.metrics = metrics;
+    this.source = source;
+    this.queueSizePerGroup = this.conf.getInt("hbase.regionserver.maxlogs", 32);
+    this.logQueueWarnThreshold = this.conf.getInt("replication.source.log.queue.warn", 2);
+  }
+
+  /**
+   * Enqueue the wal
+   * @param wal wal to be enqueued
+   * @param walGroupId Key for the wal in @queues map
+   * @return boolean whether this is the first time we are seeing this walGroupId.
+   */
+  public boolean enqueueLog(Path wal, String walGroupId) {
+    boolean exists = false;
+    PriorityBlockingQueue<Path> queue = queues.get(walGroupId);
+    if (queue == null) {
+      queue = new PriorityBlockingQueue<>(queueSizePerGroup,
+        new AbstractFSWALProvider.WALStartTimeComparator());
+      // make sure that we do not use an empty queue when setting up a ReplicationSource, otherwise
+      // the shipper may quit immediately
+      queue.put(wal);
+      queues.put(walGroupId, queue);
+    } else {
+      exists = true;
+      queue.put(wal);
+    }
+    // Increment size of logQueue
+    this.metrics.incrSizeOfLogQueue();
+    // Compute oldest wal age
+    setOldestWalAge();
+    // This will wal a warning for each new wal that gets created above the warn threshold
+    int queueSize = queue.size();
+    if (queueSize > this.logQueueWarnThreshold) {
+      LOG.warn("{} WAL group {} queue size: {} exceeds value of " +
+          "replication.source.log.queue.warn {}", source.logPeerId(), walGroupId, queueSize,
+        logQueueWarnThreshold);
+    }
+    return exists;
+  }
+
+  /**
+   * Get the queue size for the given walGroupId.
+   * @param walGroupId walGroupId
+   */
+  public int getQueueSize(String walGroupId) {
+    Queue queue = queues.get(walGroupId);
+    if (queue == null) {
+      return 0;
+    }
+    return queue.size();
+  }
+
+  /**
+   * Returns number of queues.
+   */
+  public int getNumQueues() {
+    return queues.size();
+  }
+
+  public Map<String, PriorityBlockingQueue<Path>> getQueues() {
+    return queues;
+  }
+
+  /**
+   * Return queue for the given walGroupId
+   * Please don't add or remove elements from the returned queue.
+   * Use @enqueueLog and @remove methods respectively.
+   * @param walGroupId walGroupId
+   */
+  public PriorityBlockingQueue<Path> getQueue(String walGroupId) {
+    return queues.get(walGroupId);
+  }
+
+  /**
+   * Remove head from the queue corresponding to given walGroupId.
+   * @param walGroupId walGroupId
+   */
+  public void remove(String walGroupId) {
+    PriorityBlockingQueue<Path> queue = getQueue(walGroupId);
+    if (queue == null || queue.isEmpty()) {
+      return;
+    }
+    queue.remove();
+    // Decrease size logQueue.
+    metrics.decrSizeOfLogQueue();
+    // Re-compute age of oldest wal metric.
+    setOldestWalAge();
+  }
+
+  /**
+   * Remove all the elements from the queue corresponding to walGroupId
+   * @param walGroupId walGroupId
+   */
+  public void clear(String walGroupId) {
+    PriorityBlockingQueue<Path> queue = getQueue(walGroupId);
+    while (!queue.isEmpty()) {
+      // Need to iterate since metrics#decrSizeOfLogQueue decrements just by 1.
+      queue.remove();
+      metrics.decrSizeOfLogQueue();
+    }
+    setOldestWalAge();
+  }
+
+  private void setOldestWalAge() {
+    long now = EnvironmentEdgeManager.currentTime();
+    long timestamp = getOldestWalTimestamp();
+    // TODO: Should we handle the case where getOldestWalTimestamp returns Long.MAX_VALUE ?
+    long age = now - timestamp;
+    this.metrics.setOldestWalAge(age);
+  }
+
+  /*
+  Get the oldest wal timestamp from all the queues.
+ */

Review comment:
       nit: indent

##########
File path: hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLogQueue.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.replication.regionserver;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.apache.yetus.audience.InterfaceStability;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+  Class that does enqueueing/dequeueing of wal at one place so that we can update the metrics
+  just at one place.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class ReplicationLogQueue {
+  // Queues of logs to process, entry in format of walGroupId->queue,
+  // each presents a queue for one wal group
+  private Map<String, PriorityBlockingQueue<Path>> queues = new HashMap<>();
+  private MetricsSource metrics;
+  private Configuration conf;
+  // per group queue size, keep no more than this number of logs in each wal group
+  private int queueSizePerGroup;
+  // WARN threshold for the number of queued logs, defaults to 2
+  private int logQueueWarnThreshold;
+  private ReplicationSource source;
+  private static final Logger LOG = LoggerFactory.getLogger(ReplicationSource.class);
+
+
+  public ReplicationLogQueue(Configuration conf, MetricsSource metrics, ReplicationSource source) {
+    this.conf = conf;
+    this.metrics = metrics;
+    this.source = source;
+    this.queueSizePerGroup = this.conf.getInt("hbase.regionserver.maxlogs", 32);
+    this.logQueueWarnThreshold = this.conf.getInt("replication.source.log.queue.warn", 2);
+  }
+
+  /**
+   * Enqueue the wal
+   * @param wal wal to be enqueued
+   * @param walGroupId Key for the wal in @queues map
+   * @return boolean whether this is the first time we are seeing this walGroupId.
+   */
+  public boolean enqueueLog(Path wal, String walGroupId) {
+    boolean exists = false;
+    PriorityBlockingQueue<Path> queue = queues.get(walGroupId);
+    if (queue == null) {
+      queue = new PriorityBlockingQueue<>(queueSizePerGroup,
+        new AbstractFSWALProvider.WALStartTimeComparator());
+      // make sure that we do not use an empty queue when setting up a ReplicationSource, otherwise
+      // the shipper may quit immediately
+      queue.put(wal);
+      queues.put(walGroupId, queue);
+    } else {
+      exists = true;
+      queue.put(wal);
+    }
+    // Increment size of logQueue
+    this.metrics.incrSizeOfLogQueue();
+    // Compute oldest wal age
+    setOldestWalAge();
+    // This will wal a warning for each new wal that gets created above the warn threshold
+    int queueSize = queue.size();
+    if (queueSize > this.logQueueWarnThreshold) {
+      LOG.warn("{} WAL group {} queue size: {} exceeds value of " +
+          "replication.source.log.queue.warn {}", source.logPeerId(), walGroupId, queueSize,
+        logQueueWarnThreshold);
+    }
+    return exists;
+  }
+
+  /**
+   * Get the queue size for the given walGroupId.
+   * @param walGroupId walGroupId
+   */
+  public int getQueueSize(String walGroupId) {
+    Queue queue = queues.get(walGroupId);
+    if (queue == null) {
+      return 0;
+    }
+    return queue.size();
+  }
+
+  /**
+   * Returns number of queues.
+   */
+  public int getNumQueues() {
+    return queues.size();
+  }
+
+  public Map<String, PriorityBlockingQueue<Path>> getQueues() {
+    return queues;
+  }
+
+  /**
+   * Return queue for the given walGroupId
+   * Please don't add or remove elements from the returned queue.
+   * Use @enqueueLog and @remove methods respectively.
+   * @param walGroupId walGroupId
+   */
+  public PriorityBlockingQueue<Path> getQueue(String walGroupId) {
+    return queues.get(walGroupId);
+  }
+
+  /**
+   * Remove head from the queue corresponding to given walGroupId.
+   * @param walGroupId walGroupId
+   */
+  public void remove(String walGroupId) {
+    PriorityBlockingQueue<Path> queue = getQueue(walGroupId);
+    if (queue == null || queue.isEmpty()) {
+      return;
+    }
+    queue.remove();
+    // Decrease size logQueue.
+    metrics.decrSizeOfLogQueue();
+    // Re-compute age of oldest wal metric.
+    setOldestWalAge();
+  }
+
+  /**
+   * Remove all the elements from the queue corresponding to walGroupId
+   * @param walGroupId walGroupId
+   */
+  public void clear(String walGroupId) {
+    PriorityBlockingQueue<Path> queue = getQueue(walGroupId);
+    while (!queue.isEmpty()) {
+      // Need to iterate since metrics#decrSizeOfLogQueue decrements just by 1.
+      queue.remove();
+      metrics.decrSizeOfLogQueue();
+    }
+    setOldestWalAge();
+  }
+
+  private void setOldestWalAge() {
+    long now = EnvironmentEdgeManager.currentTime();
+    long timestamp = getOldestWalTimestamp();
+    // TODO: Should we handle the case where getOldestWalTimestamp returns Long.MAX_VALUE ?

Review comment:
       I think we should? Otherwise, on an empty queue, we would get false alarms.

##########
File path: hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java
##########
@@ -250,35 +244,20 @@ public void enqueueLog(Path wal) {
     }
     // Use WAL prefix as the WALGroupId for this peer.
     String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName());
-    PriorityBlockingQueue<Path> queue = queues.get(walPrefix);
-    if (queue == null) {
-      queue = new PriorityBlockingQueue<>(queueSizePerGroup,
-        new AbstractFSWALProvider.WALStartTimeComparator());
-      // make sure that we do not use an empty queue when setting up a ReplicationSource, otherwise
-      // the shipper may quit immediately
-      queue.put(wal);
-      queues.put(walPrefix, queue);
+    boolean queueAlreadyExisted = logQueue.enqueueLog(wal, walPrefix);

Review comment:
       nit: s/queueAlreadyExisted/queueExists/

##########
File path: hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLogQueue.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.replication.regionserver;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.apache.yetus.audience.InterfaceStability;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+  Class that does enqueueing/dequeueing of wal at one place so that we can update the metrics
+  just at one place.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class ReplicationLogQueue {
+  // Queues of logs to process, entry in format of walGroupId->queue,
+  // each presents a queue for one wal group
+  private Map<String, PriorityBlockingQueue<Path>> queues = new HashMap<>();
+  private MetricsSource metrics;
+  private Configuration conf;
+  // per group queue size, keep no more than this number of logs in each wal group
+  private int queueSizePerGroup;
+  // WARN threshold for the number of queued logs, defaults to 2
+  private int logQueueWarnThreshold;
+  private ReplicationSource source;
+  private static final Logger LOG = LoggerFactory.getLogger(ReplicationSource.class);
+
+
+  public ReplicationLogQueue(Configuration conf, MetricsSource metrics, ReplicationSource source) {
+    this.conf = conf;
+    this.metrics = metrics;
+    this.source = source;
+    this.queueSizePerGroup = this.conf.getInt("hbase.regionserver.maxlogs", 32);
+    this.logQueueWarnThreshold = this.conf.getInt("replication.source.log.queue.warn", 2);
+  }
+
+  /**
+   * Enqueue the wal
+   * @param wal wal to be enqueued
+   * @param walGroupId Key for the wal in @queues map
+   * @return boolean whether this is the first time we are seeing this walGroupId.
+   */
+  public boolean enqueueLog(Path wal, String walGroupId) {
+    boolean exists = false;
+    PriorityBlockingQueue<Path> queue = queues.get(walGroupId);
+    if (queue == null) {
+      queue = new PriorityBlockingQueue<>(queueSizePerGroup,
+        new AbstractFSWALProvider.WALStartTimeComparator());
+      // make sure that we do not use an empty queue when setting up a ReplicationSource, otherwise
+      // the shipper may quit immediately
+      queue.put(wal);
+      queues.put(walGroupId, queue);
+    } else {
+      exists = true;
+      queue.put(wal);
+    }
+    // Increment size of logQueue
+    this.metrics.incrSizeOfLogQueue();
+    // Compute oldest wal age
+    setOldestWalAge();
+    // This will wal a warning for each new wal that gets created above the warn threshold
+    int queueSize = queue.size();
+    if (queueSize > this.logQueueWarnThreshold) {
+      LOG.warn("{} WAL group {} queue size: {} exceeds value of " +
+          "replication.source.log.queue.warn {}", source.logPeerId(), walGroupId, queueSize,
+        logQueueWarnThreshold);
+    }
+    return exists;
+  }
+
+  /**
+   * Get the queue size for the given walGroupId.
+   * @param walGroupId walGroupId
+   */
+  public int getQueueSize(String walGroupId) {
+    Queue queue = queues.get(walGroupId);
+    if (queue == null) {
+      return 0;
+    }
+    return queue.size();
+  }
+
+  /**
+   * Returns number of queues.
+   */
+  public int getNumQueues() {
+    return queues.size();
+  }
+
+  public Map<String, PriorityBlockingQueue<Path>> getQueues() {
+    return queues;
+  }
+
+  /**
+   * Return queue for the given walGroupId
+   * Please don't add or remove elements from the returned queue.
+   * Use @enqueueLog and @remove methods respectively.
+   * @param walGroupId walGroupId
+   */
+  public PriorityBlockingQueue<Path> getQueue(String walGroupId) {
+    return queues.get(walGroupId);
+  }
+
+  /**
+   * Remove head from the queue corresponding to given walGroupId.
+   * @param walGroupId walGroupId
+   */
+  public void remove(String walGroupId) {
+    PriorityBlockingQueue<Path> queue = getQueue(walGroupId);
+    if (queue == null || queue.isEmpty()) {
+      return;
+    }
+    queue.remove();
+    // Decrease size logQueue.
+    metrics.decrSizeOfLogQueue();
+    // Re-compute age of oldest wal metric.
+    setOldestWalAge();
+  }
+
+  /**
+   * Remove all the elements from the queue corresponding to walGroupId
+   * @param walGroupId walGroupId
+   */
+  public void clear(String walGroupId) {
+    PriorityBlockingQueue<Path> queue = getQueue(walGroupId);
+    while (!queue.isEmpty()) {
+      // Need to iterate since metrics#decrSizeOfLogQueue decrements just by 1.
+      queue.remove();
+      metrics.decrSizeOfLogQueue();
+    }
+    setOldestWalAge();
+  }
+
+  private void setOldestWalAge() {
+    long now = EnvironmentEdgeManager.currentTime();
+    long timestamp = getOldestWalTimestamp();
+    // TODO: Should we handle the case where getOldestWalTimestamp returns Long.MAX_VALUE ?
+    long age = now - timestamp;
+    this.metrics.setOldestWalAge(age);
+  }
+
+  /*
+  Get the oldest wal timestamp from all the queues.
+ */
+  private long getOldestWalTimestamp() {
+    long oldestWalTimestamp = Long.MAX_VALUE;
+    for (Map.Entry<String, PriorityBlockingQueue<Path>> entry : queues.entrySet()) {

Review comment:
       This is an O(n) loop, but it should be OK because the number of walGroups is typically a few hundred at most?

##########
File path: hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationLogQueue.java
##########
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.replication.regionserver;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.PriorityBlockingQueue;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
+import org.apache.yetus.audience.InterfaceAudience;
+import org.apache.yetus.audience.InterfaceStability;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/*
+  Class that does enqueueing/dequeueing of wal at one place so that we can update the metrics
+  just at one place.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class ReplicationLogQueue {
+  // Queues of logs to process, entry in format of walGroupId->queue,
+  // each presents a queue for one wal group
+  private Map<String, PriorityBlockingQueue<Path>> queues = new HashMap<>();
+  private MetricsSource metrics;
+  private Configuration conf;
+  // per group queue size, keep no more than this number of logs in each wal group
+  private int queueSizePerGroup;
+  // WARN threshold for the number of queued logs, defaults to 2
+  private int logQueueWarnThreshold;
+  private ReplicationSource source;
+  private static final Logger LOG = LoggerFactory.getLogger(ReplicationSource.class);
+
+
+  public ReplicationLogQueue(Configuration conf, MetricsSource metrics, ReplicationSource source) {
+    this.conf = conf;
+    this.metrics = metrics;
+    this.source = source;
+    this.queueSizePerGroup = this.conf.getInt("hbase.regionserver.maxlogs", 32);
+    this.logQueueWarnThreshold = this.conf.getInt("replication.source.log.queue.warn", 2);
+  }
+
+  /**
+   * Enqueue the wal
+   * @param wal wal to be enqueued
+   * @param walGroupId Key for the wal in @queues map
+   * @return boolean whether this is the first time we are seeing this walGroupId.
+   */
+  public boolean enqueueLog(Path wal, String walGroupId) {
+    boolean exists = false;
+    PriorityBlockingQueue<Path> queue = queues.get(walGroupId);

Review comment:
       I'm a bit concerned that this is not thread-safe. It wasn't thread-safe before the patch either, but this seems prone to weird concurrent modification issues. Fix it while we are here?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org