You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2014/10/03 05:43:43 UTC

svn commit: r1629108 - in /lucene/dev/branches/branch_5x: ./ solr/ solr/core/ solr/core/src/java/org/apache/solr/update/processor/ solr/core/src/test/org/apache/solr/cloud/

Author: shalin
Date: Fri Oct  3 03:43:42 2014
New Revision: 1629108

URL: http://svn.apache.org/r1629108
Log:
SOLR-6530: Commits under network partitions can put any node in down state

Added:
    lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java
      - copied, changed from r1628945, lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/solr/   (props changed)
    lucene/dev/branches/branch_5x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/solr/core/   (props changed)
    lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java

Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1629108&r1=1629107&r2=1629108&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Fri Oct  3 03:43:42 2014
@@ -173,6 +173,9 @@ Bug Fixes
 
 * SOLR-6511: Fencepost error in LeaderInitiatedRecoveryThread (Timothy Potter)
 
+* SOLR-6530: Commits under network partitions can put any node in down state.
+  (Ramkumar Aiyengar, Alan Woodward, Mark Miller, shalin)
+
 
 Other Changes
 ----------------------

Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java?rev=1629108&r1=1629107&r2=1629108&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java Fri Oct  3 03:43:42 2014
@@ -808,6 +808,11 @@ public class DistributedUpdateProcessor 
       if (phase != DistribPhase.FROMLEADER)
         continue; // don't have non-leaders try to recovery other nodes
 
+      // commits are special -- they can run on any node irrespective of whether it is a leader or not
+      // we don't want to run recovery on a node which missed a commit command
+      if (error.req.uReq.getParams().get(COMMIT_END_POINT) != null)
+        continue;
+
       final String replicaUrl = error.req.node.getUrl();
 
       // if the remote replica failed the request because of leader change (SOLR-6511), then fail the request
@@ -839,7 +844,17 @@ public class DistributedUpdateProcessor 
               " " + shardId + " before putting " + replicaUrl + " into leader-initiated recovery due to: " + exc);
         }
 
-        if (cloudDesc.getCoreNodeName().equals(leaderCoreNodeName)) {
+        List<ZkCoreNodeProps> myReplicas = zkController.getZkStateReader().getReplicaProps(collection,
+            cloudDesc.getShardId(), cloudDesc.getCoreNodeName());
+        boolean foundErrorNodeInReplicaList = false;
+        for (ZkCoreNodeProps replicaProp : myReplicas) {
+          if (((Replica) replicaProp.getNodeProps()).getName().equals(((Replica)stdNode.getNodeProps().getNodeProps()).getName()))  {
+            foundErrorNodeInReplicaList = true;
+            break;
+          }
+        }
+
+        if (cloudDesc.getCoreNodeName().equals(leaderCoreNodeName) && foundErrorNodeInReplicaList) {
           try {
             // if false, then the node is probably not "live" anymore
             sendRecoveryCommand =
@@ -866,10 +881,16 @@ public class DistributedUpdateProcessor 
             // will go ahead and try to send the recovery command once after this error
           }
         } else {
-          // not the leader anymore maybe?
+          // not the leader anymore maybe or the error'd node is not my replica?
           sendRecoveryCommand = false;
-          log.warn("Core "+cloudDesc.getCoreNodeName()+" is no longer the leader for "+collection+" "+
-              shardId+", no request recovery command will be sent!");
+          if (!foundErrorNodeInReplicaList) {
+            log.warn("Core "+cloudDesc.getCoreNodeName()+" belonging to "+collection+" "+
+                shardId+", does not have error'd node " + stdNode.getNodeProps().getCoreUrl() + " as a replica. " +
+                "No request recovery command will be sent!");
+          } else  {
+            log.warn("Core "+cloudDesc.getCoreNodeName()+" is no longer the leader for "+collection+" "+
+                shardId+", no request recovery command will be sent!");
+          }
         }
       } // else not a StdNode, recovery command still gets sent once
             

Copied: lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java (from r1628945, lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java?p2=lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java&p1=lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java&r1=1628945&r2=1629108&rev=1629108&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/cloud/LeaderInitiatedRecoveryOnCommitTest.java Fri Oct  3 03:43:42 2014
@@ -100,6 +100,9 @@ public class LeaderInitiatedRecoveryOnCo
     leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1");
     assertEquals("Leader was not active", "active", leader.getStr("state"));
 
+    leaderProxy.reopen();
+    Thread.sleep(sleepMsBeforeHealPartition);
+
     // try to clean up
     try {
       CollectionAdminRequest req = new CollectionAdminRequest.Delete();
@@ -139,6 +142,9 @@ public class LeaderInitiatedRecoveryOnCo
     leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1");
     assertEquals("Leader was not active", "active", leader.getStr("state"));
 
+    leaderProxy.reopen();
+    Thread.sleep(sleepMsBeforeHealPartition);
+
     // try to clean up
     try {
       CollectionAdminRequest req = new CollectionAdminRequest.Delete();