You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2017/01/09 12:31:48 UTC

lucene-solr:jira/solr-9856: SOLR-9856 Fix timer in PeerSync, add skipped sync counter. Add unit tests.

Repository: lucene-solr
Updated Branches:
  refs/heads/jira/solr-9856 482ce1e56 -> ca2e2befa


SOLR-9856 Fix timer in PeerSync, add skipped sync counter. Add unit tests.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ca2e2bef
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ca2e2bef
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ca2e2bef

Branch: refs/heads/jira/solr-9856
Commit: ca2e2befa4e1e40960e1b4ed961423b54c5cd6a4
Parents: 482ce1e
Author: Andrzej Bialecki <ab...@apache.org>
Authored: Mon Jan 9 13:31:14 2017 +0100
Committer: Andrzej Bialecki <ab...@apache.org>
Committed: Mon Jan 9 13:31:14 2017 +0100

----------------------------------------------------------------------
 .../java/org/apache/solr/update/PeerSync.java   | 22 +++++++++++++++--
 .../solr/cloud/PeerSyncReplicationTest.java     | 15 +++++++++++
 .../apache/solr/cloud/TestCloudRecovery.java    | 26 ++++++++++++++++++++
 3 files changed, 61 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca2e2bef/solr/core/src/java/org/apache/solr/update/PeerSync.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSync.java b/solr/core/src/java/org/apache/solr/update/PeerSync.java
index 428af0f..861cbf7 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSync.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSync.java
@@ -95,6 +95,7 @@ public class PeerSync implements SolrMetricProducer {
   // metrics
   private Timer syncTime;
   private Counter syncErrors;
+  private Counter syncSkipped;
 
   // comparator that sorts by absolute value, putting highest first
   public static Comparator<Long> absComparator = (o1, o2) -> {
@@ -163,6 +164,7 @@ public class PeerSync implements SolrMetricProducer {
   public void initializeMetrics(SolrMetricManager manager, String registry, String scope) {
     syncTime = manager.timer(registry, "time", scope);
     syncErrors = manager.counter(registry, "errors", scope);
+    syncSkipped = manager.counter(registry, "skipped", scope);
   }
 
   /** optional list of updates we had before possibly receiving new updates */
@@ -224,9 +226,11 @@ public class PeerSync implements SolrMetricProducer {
    */
   public PeerSyncResult sync() {
     if (ulog == null) {
+      syncErrors.inc();
       return PeerSyncResult.failure();
     }
     MDCLoggingContext.setCore(core);
+    Timer.Context timerContext = null;
     try {
       log.info(msg() + "START replicas=" + replicas + " nUpdates=" + nUpdates);
       
@@ -237,10 +241,13 @@ public class PeerSync implements SolrMetricProducer {
       }
       // check if we already in sync to begin with 
       if(doFingerprint && alreadyInSync()) {
+        syncSkipped.inc();
         return PeerSyncResult.success();
       }
-      
-      
+
+      // measure only when actual sync is performed
+      timerContext = syncTime.time();
+
       // Fire off the requests before getting our own recent updates (for better concurrency)
       // This also allows us to avoid getting updates we don't need... if we got our updates and then got their updates,
       // they would
@@ -258,6 +265,7 @@ public class PeerSync implements SolrMetricProducer {
       if (startingVersions != null) {
         if (startingVersions.size() == 0) {
           log.warn("no frame of reference to tell if we've missed updates");
+          syncErrors.inc();
           return PeerSyncResult.failure();
         }
         Collections.sort(startingVersions, absComparator);
@@ -273,6 +281,7 @@ public class PeerSync implements SolrMetricProducer {
         if (Math.abs(startingVersions.get(0)) < smallestNewUpdate) {
           log.warn(msg()
               + "too many updates received since start - startingUpdates no longer overlaps with our currentUpdates");
+          syncErrors.inc();
           return PeerSyncResult.failure();
         }
         
@@ -301,10 +310,12 @@ public class PeerSync implements SolrMetricProducer {
             if (srsp.getException() == null)  {
               List<Long> otherVersions = (List<Long>)srsp.getSolrResponse().getResponse().get("versions");
               if (otherVersions != null && !otherVersions.isEmpty())  {
+                syncErrors.inc();
                 return PeerSyncResult.failure(true);
               }
             }
           }
+          syncErrors.inc();
           return PeerSyncResult.failure(false);
         }
       }
@@ -320,6 +331,7 @@ public class PeerSync implements SolrMetricProducer {
         if (!success) {
           log.info(msg() + "DONE. sync failed");
           shardHandler.cancelAll();
+          syncErrors.inc();
           return PeerSyncResult.failure();
         }
       }
@@ -334,8 +346,14 @@ public class PeerSync implements SolrMetricProducer {
       }
 
       log.info(msg() + "DONE. sync " + (success ? "succeeded" : "failed"));
+      if (!success) {
+        syncErrors.inc();
+      }
       return success ?  PeerSyncResult.success() : PeerSyncResult.failure();
     } finally {
+      if (timerContext != null) {
+        timerContext.close();
+      }
       MDCLoggingContext.clear();
     }
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca2e2bef/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
index 4084ad7..57784b6 100644
--- a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
@@ -27,9 +27,14 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
 
+import com.codahale.metrics.Counter;
+import com.codahale.metrics.Metric;
+import com.codahale.metrics.MetricRegistry;
+import com.codahale.metrics.Timer;
 import org.apache.commons.lang.RandomStringUtils;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.solr.client.solrj.SolrQuery;
@@ -172,6 +177,16 @@ public class PeerSyncReplicationTest extends AbstractFullDistribZkTestBase {
 
       // make sure leader has not changed after bringing initial leader back
       assertEquals(nodePeerSynced, shardToLeaderJetty.get("shard1"));
+
+      // assert metrics
+      MetricRegistry registry = nodePeerSynced.jetty.getCoreContainer().getMetricManager().registry("solr.core.collection1");
+      Map<String, Metric> metrics = registry.getMetrics();
+      assertTrue("REPLICATION.time present", metrics.containsKey("REPLICATION.time"));
+      assertTrue("REPLICATION.errors present", metrics.containsKey("REPLICATION.errors"));
+      Timer timer = (Timer)metrics.get("REPLICATION.time");
+      assertEquals(1L, timer.getCount());
+      Counter counter = (Counter)metrics.get("REPLICATION.errors");
+      assertEquals(0L, counter.getCount());
       success = true;
     } finally {
       System.clearProperty("solr.disableFingerprint");

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ca2e2bef/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
index e2f3bfd..164eeab 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
@@ -23,9 +23,14 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
 
+import com.codahale.metrics.Counter;
+import com.codahale.metrics.Metric;
+import com.codahale.metrics.Timer;
 import org.apache.commons.io.IOUtils;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.embedded.JettySolrRunner;
@@ -35,6 +40,7 @@ import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.cloud.ClusterStateUtil;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.metrics.SolrMetricManager;
 import org.apache.solr.update.DirectUpdateHandler2;
 import org.apache.solr.update.UpdateLog;
 import org.apache.solr.util.TestInjection;
@@ -102,6 +108,26 @@ public class TestCloudRecovery extends SolrCloudTestCase {
     assertEquals(4, resp.getResults().getNumFound());
     // Make sure all nodes is recover from tlog
     assertEquals(4, countReplayLog.get());
+
+    // check metrics
+    int replicationCount = 0;
+    int errorsCount = 0;
+    int skippedCount = 0;
+    for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
+      SolrMetricManager manager = jetty.getCoreContainer().getMetricManager();
+      List<String> registryNames = manager.registryNames().stream()
+          .filter(s -> s.startsWith("solr.core.")).collect(Collectors.toList());
+      for (String registry : registryNames) {
+        Map<String, Metric> metrics = manager.registry(registry).getMetrics();
+        Timer timer = (Timer)metrics.get("REPLICATION.time");
+        Counter counter = (Counter)metrics.get("REPLICATION.errors");
+        Counter skipped = (Counter)metrics.get("REPLICATION.skipped");
+        replicationCount += timer.getCount();
+        errorsCount += counter.getCount();
+        skippedCount += skipped.getCount();
+      }
+    }
+    assertEquals(2, replicationCount);
   }
 
   @Test