You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bookkeeper.apache.org by si...@apache.org on 2018/12/13 14:50:48 UTC

[bookkeeper] branch master updated: [STATS] [DOC] Add @StatsDoc annotation for bookkeeper autorecovery stats

This is an automated email from the ASF dual-hosted git repository.

sijie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new c0138f3  [STATS] [DOC] Add @StatsDoc annotation for bookkeeper autorecovery stats
c0138f3 is described below

commit c0138f3758333877739f256303722dd892663979
Author: Sijie Guo <gu...@gmail.com>
AuthorDate: Thu Dec 13 22:50:44 2018 +0800

    [STATS] [DOC] Add @StatsDoc annotation for bookkeeper autorecovery stats
    
    Descriptions of the changes in this PR:
    
    *Motivation*
    
    As part of [BP-36](https://github.com/apache/bookkeeper/issues/1785), this PR documents the BookKeeper autorecovery stats.
    
    *Changes*
    
    - annotate the BookKeeper autorecovery stats with `@StatsDoc` so that each metric is documented
    
    Master Issue: #1785
    
    
    
    
    Reviewers: Jia Zhai <None>
    
    This closes #1879 from sijie/replication_stats
---
 .../org/apache/bookkeeper/replication/Auditor.java | 64 +++++++++++++++++++++-
 .../bookkeeper/replication/AuditorElector.java     | 10 ++++
 .../bookkeeper/replication/ReplicationWorker.java  | 26 ++++++++-
 3 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
index acf0c09..89883b0 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/Auditor.java
@@ -20,6 +20,18 @@
  */
 package org.apache.bookkeeper.replication;
 
+import static org.apache.bookkeeper.replication.ReplicationStats.AUDITOR_SCOPE;
+import static org.apache.bookkeeper.replication.ReplicationStats.AUDIT_BOOKIES_TIME;
+import static org.apache.bookkeeper.replication.ReplicationStats.BOOKIE_TO_LEDGERS_MAP_CREATION_TIME;
+import static org.apache.bookkeeper.replication.ReplicationStats.CHECK_ALL_LEDGERS_TIME;
+import static org.apache.bookkeeper.replication.ReplicationStats.NUM_BOOKIES_PER_LEDGER;
+import static org.apache.bookkeeper.replication.ReplicationStats.NUM_BOOKIE_AUDITS_DELAYED;
+import static org.apache.bookkeeper.replication.ReplicationStats.NUM_DELAYED_BOOKIE_AUDITS_DELAYES_CANCELLED;
+import static org.apache.bookkeeper.replication.ReplicationStats.NUM_FRAGMENTS_PER_LEDGER;
+import static org.apache.bookkeeper.replication.ReplicationStats.NUM_LEDGERS_CHECKED;
+import static org.apache.bookkeeper.replication.ReplicationStats.NUM_UNDER_REPLICATED_LEDGERS;
+import static org.apache.bookkeeper.replication.ReplicationStats.URL_PUBLISH_TIME_FOR_LOST_BOOKIE;
+
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Stopwatch;
 import com.google.common.collect.Lists;
@@ -61,6 +73,7 @@ import org.apache.bookkeeper.replication.ReplicationException.UnavailableExcepti
 import org.apache.bookkeeper.stats.Counter;
 import org.apache.bookkeeper.stats.OpStatsLogger;
 import org.apache.bookkeeper.stats.StatsLogger;
+import org.apache.bookkeeper.stats.annotations.StatsDoc;
 import org.apache.commons.collections4.CollectionUtils;
 import org.apache.zookeeper.AsyncCallback;
 import org.apache.zookeeper.KeeperException;
@@ -76,6 +89,10 @@ import org.slf4j.LoggerFactory;
  *
  * <p>TODO: eliminate the direct usage of zookeeper here {@link https://github.com/apache/bookkeeper/issues/1332}
  */
+@StatsDoc(
+    name = AUDITOR_SCOPE,
+    help = "Auditor related stats"
+)
 public class Auditor implements AutoCloseable {
     private static final Logger LOG = LoggerFactory.getLogger(Auditor.class);
     private final ServerConfiguration conf;
@@ -88,20 +105,61 @@ public class Auditor implements AutoCloseable {
     private final ScheduledExecutorService executor;
     private List<String> knownBookies = new ArrayList<String>();
     private final String bookieIdentifier;
+    private volatile Future<?> auditTask;
+    private Set<String> bookiesToBeAudited = Sets.newHashSet();
+    private volatile int lostBookieRecoveryDelayBeforeChange;
+
     private final StatsLogger statsLogger;
+    @StatsDoc(
+        name = NUM_UNDER_REPLICATED_LEDGERS,
+        help = "the distribution of num under_replicated ledgers on each auditor run"
+    )
     private final OpStatsLogger numUnderReplicatedLedger;
+    @StatsDoc(
+        name = URL_PUBLISH_TIME_FOR_LOST_BOOKIE,
+        help = "the latency distribution of publishing under replicated ledgers for lost bookies"
+    )
     private final OpStatsLogger uRLPublishTimeForLostBookies;
+    @StatsDoc(
+        name = BOOKIE_TO_LEDGERS_MAP_CREATION_TIME,
+        help = "the latency distribution of creating bookies-to-ledgers map"
+    )
     private final OpStatsLogger bookieToLedgersMapCreationTime;
+    @StatsDoc(
+        name = CHECK_ALL_LEDGERS_TIME,
+        help = "the latency distribution of checking all ledgers"
+    )
     private final OpStatsLogger checkAllLedgersTime;
+    @StatsDoc(
+        name = AUDIT_BOOKIES_TIME,
+        help = "the latency distribution of auditing all the bookies"
+    )
     private final OpStatsLogger auditBookiesTime;
+    @StatsDoc(
+        name = NUM_LEDGERS_CHECKED,
+        help = "the number of ledgers checked by the auditor"
+    )
     private final Counter numLedgersChecked;
+    @StatsDoc(
+        name = NUM_FRAGMENTS_PER_LEDGER,
+        help = "the distribution of number of fragments per ledger"
+    )
     private final OpStatsLogger numFragmentsPerLedger;
+    @StatsDoc(
+        name = NUM_BOOKIES_PER_LEDGER,
+        help = "the distribution of number of bookies per ledger"
+    )
     private final OpStatsLogger numBookiesPerLedger;
+    @StatsDoc(
+        name = NUM_BOOKIE_AUDITS_DELAYED,
+        help = "the number of bookie-audits delayed"
+    )
     private final Counter numBookieAuditsDelayed;
+    @StatsDoc(
+        name = NUM_DELAYED_BOOKIE_AUDITS_DELAYES_CANCELLED,
+        help = "the number of delayed-bookie-audits cancelled"
+    )
     private final Counter numDelayedBookieAuditsCancelled;
-    private volatile Future<?> auditTask;
-    private Set<String> bookiesToBeAudited = Sets.newHashSet();
-    private volatile int lostBookieRecoveryDelayBeforeChange;
 
     static BookKeeper createBookKeeperClient(ServerConfiguration conf)
             throws InterruptedException, IOException {
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java
index 14c5c53..543aaac 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AuditorElector.java
@@ -21,6 +21,7 @@
 package org.apache.bookkeeper.replication;
 
 import static com.google.common.base.Charsets.UTF_8;
+import static org.apache.bookkeeper.replication.ReplicationStats.AUDITOR_SCOPE;
 import static org.apache.bookkeeper.replication.ReplicationStats.ELECTION_ATTEMPTS;
 
 import com.google.common.annotations.VisibleForTesting;
@@ -48,6 +49,7 @@ import org.apache.bookkeeper.replication.ReplicationException.UnavailableExcepti
 import org.apache.bookkeeper.stats.Counter;
 import org.apache.bookkeeper.stats.NullStatsLogger;
 import org.apache.bookkeeper.stats.StatsLogger;
+import org.apache.bookkeeper.stats.annotations.StatsDoc;
 import org.apache.bookkeeper.util.BookKeeperConstants;
 import org.apache.bookkeeper.util.ZkUtils;
 import org.apache.commons.lang.StringUtils;
@@ -71,6 +73,10 @@ import org.slf4j.LoggerFactory;
  * will be elected as Auditor. All the other bookies will be watching on their
  * predecessor znode according to the ephemeral sequence numbers.
  */
+@StatsDoc(
+    name = AUDITOR_SCOPE,
+    help = "Auditor related stats"
+)
 public class AuditorElector {
     private static final Logger LOG = LoggerFactory
             .getLogger(AuditorElector.class);
@@ -98,6 +104,10 @@ public class AuditorElector {
     private AtomicBoolean running = new AtomicBoolean(false);
 
     // Expose Stats
+    @StatsDoc(
+        name = ELECTION_ATTEMPTS,
+        help = "The number of auditor election attempts"
+    )
     private final Counter electionAttempts;
     private final StatsLogger statsLogger;
 
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
index eeaa96b..74d2081 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
@@ -22,6 +22,7 @@ package org.apache.bookkeeper.replication;
 import static org.apache.bookkeeper.replication.ReplicationStats.NUM_DEFER_LEDGER_LOCK_RELEASE_OF_FAILED_LEDGER;
 import static org.apache.bookkeeper.replication.ReplicationStats.NUM_FULL_OR_PARTIAL_LEDGERS_REPLICATED;
 import static org.apache.bookkeeper.replication.ReplicationStats.REPLICATE_EXCEPTION;
+import static org.apache.bookkeeper.replication.ReplicationStats.REPLICATION_WORKER_SCOPE;
 import static org.apache.bookkeeper.replication.ReplicationStats.REREPLICATE_OP;
 
 import com.google.common.base.Stopwatch;
@@ -64,6 +65,7 @@ import org.apache.bookkeeper.stats.Counter;
 import org.apache.bookkeeper.stats.NullStatsLogger;
 import org.apache.bookkeeper.stats.OpStatsLogger;
 import org.apache.bookkeeper.stats.StatsLogger;
+import org.apache.bookkeeper.stats.annotations.StatsDoc;
 import org.apache.zookeeper.KeeperException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -72,6 +74,10 @@ import org.slf4j.LoggerFactory;
  * ReplicationWorker will take the fragments one by one from
  * ZKLedgerUnderreplicationManager and replicates to it.
  */
+@StatsDoc(
+    name = REPLICATION_WORKER_SCOPE,
+    help = "replication worker related stats"
+)
 public class ReplicationWorker implements Runnable {
     private static final Logger LOG = LoggerFactory
             .getLogger(ReplicationWorker.class);
@@ -93,8 +99,25 @@ public class ReplicationWorker implements Runnable {
 
     // Expose Stats
     private final StatsLogger statsLogger;
+    @StatsDoc(
+        name = REPLICATE_EXCEPTION,
+        help = "replication related exceptions"
+    )
+    private final StatsLogger exceptionLogger;
+    @StatsDoc(
+        name = REREPLICATE_OP,
+        help = "operation stats of re-replicating ledgers"
+    )
     private final OpStatsLogger rereplicateOpStats;
+    @StatsDoc(
+        name = NUM_FULL_OR_PARTIAL_LEDGERS_REPLICATED,
+        help = "the number of ledgers re-replicated"
+    )
     private final Counter numLedgersReplicated;
+    @StatsDoc(
+        name = NUM_DEFER_LEDGER_LOCK_RELEASE_OF_FAILED_LEDGER,
+        help = "the number of defer-ledger-lock-releases of failed ledgers"
+    )
     private final Counter numDeferLedgerLockReleaseOfFailedLedger;
     private final Map<String, Counter> exceptionCounters;
     final LoadingCache<Long, AtomicInteger> replicationFailedLedgers;
@@ -164,6 +187,7 @@ public class ReplicationWorker implements Runnable {
 
         // Expose Stats
         this.statsLogger = statsLogger;
+        this.exceptionLogger = statsLogger.scope(REPLICATE_EXCEPTION);
         this.rereplicateOpStats = this.statsLogger.getOpStatsLogger(REREPLICATE_OP);
         this.numLedgersReplicated = this.statsLogger.getCounter(NUM_FULL_OR_PARTIAL_LEDGERS_REPLICATED);
         this.numDeferLedgerLockReleaseOfFailedLedger = this.statsLogger
@@ -561,7 +585,7 @@ public class ReplicationWorker implements Runnable {
     private Counter getExceptionCounter(String name) {
         Counter counter = this.exceptionCounters.get(name);
         if (counter == null) {
-            counter = this.statsLogger.scope(REPLICATE_EXCEPTION).getCounter(name);
+            counter = this.exceptionLogger.getCounter(name);
             this.exceptionCounters.put(name, counter);
         }
         return counter;