You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2013/12/30 04:25:28 UTC

svn commit: r1554130 - in /lucene/dev/branches/branch_4x: ./ solr/ solr/CHANGES.txt solr/core/ solr/core/src/java/org/apache/solr/update/PeerSync.java solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java

Author: markrmiller
Date: Mon Dec 30 03:25:28 2013
New Revision: 1554130

URL: http://svn.apache.org/r1554130
Log:
SOLR-5588: PeerSync doesn't count all connect failures as success.

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/update/PeerSync.java
    lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java

Modified: lucene/dev/branches/branch_4x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/CHANGES.txt?rev=1554130&r1=1554129&r2=1554130&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/solr/CHANGES.txt Mon Dec 30 03:25:28 2013
@@ -288,6 +288,9 @@ Bug Fixes
 
 * SOLR-5503: Retry 'forward to leader' requests less aggressively - rather 
   than on IOException and status 500, ConnectException. (Mark Miller)
+
+* SOLR-5588: PeerSync doesn't count all connect failures as success.
+  (Mark Miller)
     
 Optimizations
 ----------------------

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/update/PeerSync.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/update/PeerSync.java?rev=1554130&r1=1554129&r2=1554130&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/update/PeerSync.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/update/PeerSync.java Mon Dec 30 03:25:28 2013
@@ -291,7 +291,8 @@ public class PeerSync  {
       if (cantReachIsSuccess && sreq.purpose == 1 && srsp.getException() instanceof SolrServerException) {
         Throwable solrException = ((SolrServerException) srsp.getException())
             .getRootCause();
-        if (solrException instanceof ConnectException || solrException instanceof ConnectTimeoutException
+        boolean connectTimeoutExceptionInChain = connectTimeoutExceptionInChain(srsp.getException());
+        if (connectTimeoutExceptionInChain || solrException instanceof ConnectException || solrException instanceof ConnectTimeoutException
             || solrException instanceof NoHttpResponseException || solrException instanceof SocketException) {
           log.warn(msg() + " couldn't connect to " + srsp.getShardAddress() + ", counting as success");
 
@@ -309,6 +310,10 @@ public class PeerSync  {
             "Perhaps /get is not registered?");
         return true;
       }
+      
+      // TODO: we should return the above information so that when we can request a recovery through zookeeper, we do
+      // that for these nodes
+      
       // TODO: at least log???
       // srsp.getException().printStackTrace(System.out);
      
@@ -324,6 +329,23 @@ public class PeerSync  {
     }
   }
   
+  // sometimes the root exception is a SocketTimeoutException, but ConnectTimeoutException
+  // is in the chain
+  private boolean connectTimeoutExceptionInChain(Throwable exception) {
+    Throwable t = exception;
+    while (true) {
+      if (t instanceof ConnectTimeoutException) {
+        return true;
+      }
+      Throwable cause = t.getCause();
+      if (cause != null) {
+        t = cause;
+      } else {
+        return false;
+      }
+    }
+  }
+
   private boolean handleVersions(ShardResponse srsp) {
     // we retrieved the last N updates from the replica
     List<Long> otherVersions = (List<Long>)srsp.getSolrResponse().getResponse().get("versions");

Modified: lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java?rev=1554130&r1=1554129&r2=1554130&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/cloud/BasicDistributedZk2Test.java Mon Dec 30 03:25:28 2013
@@ -282,14 +282,6 @@ public class BasicDistributedZk2Test ext
     
     // kill a shard
     CloudJettyRunner deadShard = chaosMonkey.stopShard(SHARD1, 0);
-
-
-    // we are careful to make sure the downed node is no longer in the state,
-    // because on some systems (especially freebsd w/ blackhole enabled), trying
-    // to talk to a downed node causes grief
-    Set<CloudJettyRunner> jetties = new HashSet<CloudJettyRunner>();
-    jetties.addAll(shardToJetty.get(SHARD1));
-    jetties.remove(deadShard);
     
     // ensure shard is dead
     try {