You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by an...@apache.org on 2015/09/15 22:39:14 UTC

svn commit: r1703289 - in /lucene/dev/trunk/solr: CHANGES.txt core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java core/src/test/org/apache/solr/cloud/HttpPartitionTest.java

Author: anshum
Date: Tue Sep 15 20:39:14 2015
New Revision: 1703289

URL: http://svn.apache.org/r1703289
Log:
SOLR-8034: Leader no longer puts replicas in recovery in case of a failed update, when minRF isn't achieved.

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1703289&r1=1703288&r2=1703289&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Tue Sep 15 20:39:14 2015
@@ -264,6 +264,9 @@ Other Changes
 * SOLR-7999: SolrRequestParser tests no longer depend on external URLs
   that may fail to work.  (Uwe Schindler)
 
+* SOLR-8034: Leader no longer puts replicas in recovery in case of a failed update, when minRF
+  isn't achieved. (Jessica Cheng, Timothy Potter, Anshum Gupta)
+
 ==================  5.3.1 ==================
 
 Bug Fixes

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java?rev=1703289&r1=1703288&r2=1703289&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java Tue Sep 15 20:39:14 2015
@@ -870,6 +870,11 @@ public class DistributedUpdateProcessor
           }
         }
 
+        // If the client specified minRf and we didn't achieve the minRf, don't send recovery and let client retry
+        if (replicationTracker != null && replicationTracker.getAchievedRf() < replicationTracker.minRf) {
+          continue;
+        }
+
         if (cloudDesc.getCoreNodeName().equals(leaderCoreNodeName) && foundErrorNodeInReplicaList) {
           try {
             // if false, then the node is probably not "live" anymore

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java?rev=1703289&r1=1703288&r2=1703289&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/cloud/HttpPartitionTest.java Tue Sep 15 20:39:14 2015
@@ -25,6 +25,7 @@ import org.apache.solr.client.solrj.embe
 import org.apache.solr.client.solrj.impl.HttpSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.QueryRequest;
+import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.CollectionAdminResponse;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
@@ -38,7 +39,6 @@ import org.apache.solr.common.util.Named
 import org.apache.solr.core.CoreContainer;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.servlet.SolrDispatchFilter;
-import org.apache.solr.update.UpdateHandler;
 import org.apache.solr.update.UpdateLog;
 import org.apache.solr.util.RTimer;
 import org.junit.Test;
@@ -50,7 +50,6 @@ import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -99,6 +98,12 @@ public class HttpPartitionTest extends A
 
     testLeaderInitiatedRecoveryCRUD();
 
+    // Tests that if we set a minRf that's not satisfied, no recovery is requested, but if minRf is satisfied,
+    // recovery is requested
+    testMinRf();
+
+    waitForThingsToLevelOut(30000);
+
     // test a 1x2 collection
     testRf2();
 
@@ -164,6 +169,90 @@ public class HttpPartitionTest extends A
     }
   }
 
+  protected void testMinRf() throws Exception {
+    // create a collection that has 1 shard and 3 replicas
+    String testCollectionName = "collMinRf_1x3";
+    createCollection(testCollectionName, 1, 3, 1);
+    cloudClient.setDefaultCollection(testCollectionName);
+
+    sendDoc(1, 2);
+
+    List<Replica> notLeaders =
+        ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, maxWaitSecsToSeeAllActive);
+    assertTrue("Expected 2 non-leader replicas for collection " + testCollectionName
+            + " but found " + notLeaders.size() + "; clusterState: "
+            + printClusterStateInfo(testCollectionName),
+        notLeaders.size() == 2);
+
+    assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 1);
+
+    // Now introduce a network partition between the leader and 1 replica, so a minRf of 2 is still achieved
+    SocketProxy proxy0 = getProxyForReplica(notLeaders.get(0));
+
+    proxy0.close();
+
+    // indexing during a partition
+    int achievedRf = sendDoc(2, 2);
+    assertEquals("Unexpected achieved replication factor", 2, achievedRf);
+
+    Thread.sleep(sleepMsBeforeHealPartition);
+
+    // Verify that the partitioned replica is DOWN
+    ZkStateReader zkr = cloudClient.getZkStateReader();
+    zkr.updateClusterState(); // force the state to be fresh
+    ClusterState cs = zkr.getClusterState();
+    Collection<Slice> slices = cs.getActiveSlices(testCollectionName);
+    Slice slice = slices.iterator().next();
+    Replica partitionedReplica = slice.getReplica(notLeaders.get(0).getName());
+    assertEquals("The partitioned replica did not get marked down",
+        Replica.State.DOWN.toString(), partitionedReplica.getStr(ZkStateReader.STATE_PROP));
+
+    proxy0.reopen();
+
+    notLeaders =
+        ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, maxWaitSecsToSeeAllActive);
+
+    // Since minRf is achieved, we expect recovery, so we expect seeing 2 documents
+    assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 2);
+
+    // Now introduce a network partition between the leader and both of its replicas, so a minRf of 2 is NOT achieved
+    proxy0 = getProxyForReplica(notLeaders.get(0));
+    proxy0.close();
+    SocketProxy proxy1 = getProxyForReplica(notLeaders.get(1));
+    proxy1.close();
+
+    achievedRf = sendDoc(3, 2);
+    assertEquals("Unexpected achieved replication factor", 1, achievedRf);
+
+    Thread.sleep(sleepMsBeforeHealPartition);
+
+    // Verify that the partitioned replicas are NOT DOWN since minRf wasn't achieved
+    ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, 1);
+
+    proxy0.reopen();
+    proxy1.reopen();
+
+    notLeaders =
+        ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, maxWaitSecsToSeeAllActive);
+
+    // Check that doc 3 is on the leader but not on the notLeaders
+    Replica leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1", 10000);
+    HttpSolrClient leaderSolr = getHttpSolrClient(leader, testCollectionName);
+    assertDocExists(leaderSolr, testCollectionName, "3");
+
+    for (Replica notLeader : notLeaders) {
+      HttpSolrClient notLeaderSolr = getHttpSolrClient(notLeader, testCollectionName);
+      assertDocNotExists(notLeaderSolr, testCollectionName, "3");
+    }
+
+    // Retry sending doc 3
+    achievedRf = sendDoc(3, 2);
+    assertEquals("Unexpected achieved replication factor", 3, achievedRf);
+
+    // Now doc 3 should be on all replicas
+    assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 3);
+  }
+
   protected void testRf2() throws Exception {
     // create a collection that has 1 shard but 2 replicas
     String testCollectionName = "c8n_1x2";
@@ -480,19 +569,29 @@ public class HttpPartitionTest extends A
       }
     }
   }
-  
+
   protected HttpSolrClient getHttpSolrClient(Replica replica, String coll) throws Exception {
     ZkCoreNodeProps zkProps = new ZkCoreNodeProps(replica);
     String url = zkProps.getBaseUrl() + "/" + coll;
     return new HttpSolrClient(url);
   }
 
-  protected void sendDoc(int docId) throws Exception {
+  protected int sendDoc(int docId) throws Exception {
+    return sendDoc(docId, null);
+  }
+  
+  protected int sendDoc(int docId, Integer minRf) throws Exception {
     SolrInputDocument doc = new SolrInputDocument();
     doc.addField(id, String.valueOf(docId));
     doc.addField("a_t", "hello" + docId);
 
-    sendDocsWithRetry(Collections.singletonList(doc), 2, 3, 100);
+    UpdateRequest up = new UpdateRequest();
+    if (minRf != null) {
+      up.setParam(UpdateRequest.MIN_REPFACT, String.valueOf(minRf));
+    }
+    up.add(doc);
+
+    return cloudClient.getMinAchievedReplicationFactor(cloudClient.getDefaultCollection(), cloudClient.request(up));
   }
 
   /**
@@ -501,13 +600,24 @@ public class HttpPartitionTest extends A
    */
   @SuppressWarnings("rawtypes")
   protected void assertDocExists(HttpSolrClient solr, String coll, String docId) throws Exception {
-    QueryRequest qr = new QueryRequest(params("qt", "/get", "id", docId, "distrib", "false"));
-    NamedList rsp = solr.request(qr);
+    NamedList rsp = realTimeGetDocId(solr, docId);
     String match = JSONTestUtil.matchObj("/id", rsp.get("doc"), new Integer(docId));
     assertTrue("Doc with id=" + docId + " not found in " + solr.getBaseURL()
         + " due to: " + match + "; rsp="+rsp, match == null);
   }
 
+  protected void assertDocNotExists(HttpSolrClient solr, String coll, String docId) throws Exception {
+    NamedList rsp = realTimeGetDocId(solr, docId);
+    String match = JSONTestUtil.matchObj("/id", rsp.get("doc"), new Integer(docId));
+    assertTrue("Doc with id=" + docId + " is found in " + solr.getBaseURL()
+        + " due to: " + match + "; rsp="+rsp, match != null);
+  }
+
+  private NamedList realTimeGetDocId(HttpSolrClient solr, String docId) throws SolrServerException, IOException {
+    QueryRequest qr = new QueryRequest(params("qt", "/get", "id", docId, "distrib", "false"));
+    return solr.request(qr);
+  }
+
   protected int getReplicaPort(Replica replica) {
     String replicaNode = replica.getNodeName();    
     String tmp = replicaNode.substring(replicaNode.indexOf(':')+1);