You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by va...@apache.org on 2018/04/25 05:39:22 UTC

[1/2] lucene-solr:branch_7_3: SOLR-12065: A successful restore collection should mark the shard state as active and not buffering

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_7_3 f779450d6 -> 8fa768741


SOLR-12065: A successful restore collection should mark the shard state as active and not buffering


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8894db1a
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8894db1a
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8894db1a

Branch: refs/heads/branch_7_3
Commit: 8894db1a727dff5a52444f9b4a5838995a8f7513
Parents: f779450
Author: Varun Thacker <va...@apache.org>
Authored: Thu Apr 12 08:18:35 2018 -0700
Committer: Varun Thacker <va...@apache.org>
Committed: Tue Apr 24 21:48:52 2018 -0700

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  5 ++-
 .../solr/cloud/api/collections/RestoreCmd.java  | 26 +++++++++---
 .../AbstractCloudBackupRestoreTestCase.java     | 42 ++++++++++++++------
 3 files changed, 55 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8894db1a/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 1ad2c87..cabdfcd 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -42,6 +42,9 @@ Bug Fixes
 
 * SOLR-12066: Cleanup deleted core when node start (Cao Manh Dat)
 
+* SOLR-12065: A successful restore collection should mark the shard state as active and not buffering
+  (Rohit, Varun Thacker)
+
 ==================  7.3.0 ==================
 
 Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
@@ -1829,7 +1832,7 @@ Bug Fixes
 
 * SOLR-11024: ParallelStream should set the StreamContext when constructing SolrStreams (Joel Bernstein)
 
-* SOLR-10908: CloudSolrStream.toExpression incorrectly handles fq clauses (Rohit Singh via Erick Erickson)
+* SOLR-10908: CloudSolrStream.toExpression incorrectly handles fq clauses (Rohit via Erick Erickson)
 
 * SOLR-11177: CoreContainer.load needs to send lazily loaded core descriptors to the proper list rather than send
   them all to the transient lists. (Erick Erickson)

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8894db1a/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java
index 1823fe3..ca7c2d6 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java
@@ -258,7 +258,6 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
           propMap.put(ASYNC, asyncId);
         }
         ocmh.addPropertyParams(message, propMap);
-
         ocmh.addReplica(clusterState, new ZkNodeProps(propMap), new NamedList(), null);
       }
 
@@ -272,11 +271,31 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
         params.set(NAME, "snapshot." + slice.getName());
         params.set(CoreAdminParams.BACKUP_LOCATION, backupPath.toASCIIString());
         params.set(CoreAdminParams.BACKUP_REPOSITORY, repo);
-
         ocmh.sliceCmd(clusterState, params, null, slice, shardHandler, asyncId, requestMap);
       }
       ocmh.processResponses(new NamedList(), shardHandler, true, "Could not restore core", asyncId, requestMap);
 
+
+      for (Slice s: restoreCollection.getSlices()) {
+        for (Replica r : s.getReplicas()) {
+          String nodeName = r.getNodeName();
+          String coreNodeName = r.getCoreName();
+          Replica.State stateRep  = r.getState();
+
+          log.debug("Calling REQUESTAPPLYUPDATES on: nodeName={}, coreNodeName={}, state={}"
+              , nodeName, coreNodeName, stateRep.name());
+
+          ModifiableSolrParams params = new ModifiableSolrParams();
+          params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.REQUESTAPPLYUPDATES.toString());
+          params.set(CoreAdminParams.NAME, coreNodeName);
+
+          ocmh.sendShardRequest(nodeName, params, shardHandler, asyncId, requestMap);
+        }
+
+        ocmh.processResponses(new NamedList(), shardHandler, true, "REQUESTAPPLYUPDATES calls did not succeed", asyncId, requestMap);
+
+      }
+
       //Mark all shards in ACTIVE STATE
       {
         HashMap<String, Object> propMap = new HashMap<>();
@@ -288,9 +307,6 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
         inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
       }
 
-      //refresh the location copy of collection state
-      restoreCollection = zkStateReader.getClusterState().getCollection(restoreCollectionName);
-
       if (totalReplicasPerShard > 1) {
         log.info("Adding replicas to restored collection={}", restoreCollection);
         for (Slice slice : restoreCollection.getSlices()) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8894db1a/solr/core/src/test/org/apache/solr/cloud/api/collections/AbstractCloudBackupRestoreTestCase.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/AbstractCloudBackupRestoreTestCase.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/AbstractCloudBackupRestoreTestCase.java
index 058814c..46e6faf 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/AbstractCloudBackupRestoreTestCase.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/AbstractCloudBackupRestoreTestCase.java
@@ -93,12 +93,12 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa
     replFactor = TestUtil.nextInt(random(), 1, 2);
     numTlogReplicas = TestUtil.nextInt(random(), 0, 1);
     numPullReplicas = TestUtil.nextInt(random(), 0, 1);
-    
+
     CollectionAdminRequest.Create create = isImplicit ?
-      // NOTE: use shard list with same # of shards as NUM_SHARDS; we assume this later
-      CollectionAdminRequest.createCollectionWithImplicitRouter(getCollectionName(), "conf1", "shard1,shard2", replFactor, numTlogReplicas, numPullReplicas) :
-      CollectionAdminRequest.createCollection(getCollectionName(), "conf1", NUM_SHARDS, replFactor, numTlogReplicas, numPullReplicas);
-    
+        // NOTE: use shard list with same # of shards as NUM_SHARDS; we assume this later
+        CollectionAdminRequest.createCollectionWithImplicitRouter(getCollectionName(), "conf1", "shard1,shard2", replFactor, numTlogReplicas, numPullReplicas) :
+        CollectionAdminRequest.createCollection(getCollectionName(), "conf1", NUM_SHARDS, replFactor, numTlogReplicas, numPullReplicas);
+
     if (NUM_SHARDS * (replFactor + numTlogReplicas + numPullReplicas) > cluster.getJettySolrRunners().size() || random().nextBoolean()) {
       create.setMaxShardsPerNode((int)Math.ceil(NUM_SHARDS * (replFactor + numTlogReplicas + numPullReplicas) / cluster.getJettySolrRunners().size()));//just to assert it survives the restoration
       if (doSplitShardOperation) {
@@ -122,7 +122,7 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa
     CloudSolrClient solrClient = cluster.getSolrClient();
     create.process(solrClient);
 
-    indexDocs(getCollectionName());
+    indexDocs(getCollectionName(), false);
 
     if (doSplitShardOperation) {
       // shard split the first shard
@@ -197,23 +197,29 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa
     return cluster.getSolrClient().getZkStateReader().getClusterState().getCollection(collectionName).getActiveSlices().size();
   }
 
-  private void indexDocs(String collectionName) throws Exception {
+  private int indexDocs(String collectionName, boolean useUUID) throws Exception {
     Random random = new Random(docsSeed);// use a constant seed for the whole test run so that we can easily re-index.
     int numDocs = random.nextInt(100);
     if (numDocs == 0) {
       log.info("Indexing ZERO test docs");
-      return;
+      return 0;
     }
+
     List<SolrInputDocument> docs = new ArrayList<>(numDocs);
     for (int i=0; i<numDocs; i++) {
       SolrInputDocument doc = new SolrInputDocument();
-      doc.addField("id", i);
+      doc.addField("id", ((useUUID == true) ? java.util.UUID.randomUUID().toString() : i));
       doc.addField("shard_s", "shard" + (1 + random.nextInt(NUM_SHARDS))); // for implicit router
       docs.add(doc);
     }
+
     CloudSolrClient client = cluster.getSolrClient();
-    client.add(collectionName, docs);// batch
+    client.add(collectionName, docs); //batch
     client.commit(collectionName);
+
+    log.info("Indexed {} docs to collection: {}", numDocs, collectionName);
+
+    return numDocs;
   }
 
   private void testBackupAndRestore(String collectionName) throws Exception {
@@ -257,7 +263,7 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa
         // may need to increase maxShardsPerNode (e.g. if it was shard split, then now we need more)
         restore.setMaxShardsPerNode((int)Math.ceil(backupCollection.getReplicas().size()/cluster.getJettySolrRunners().size()));
       }
-      
+
 
       if (rarely()) { // Try with createNodeSet configuration
         int nodeSetSize = cluster.getJettySolrRunners().size() / 2;
@@ -298,7 +304,7 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa
     //Re-index same docs (should be identical docs given same random seed) and test we have the same result.  Helps
     //  test we reconstituted the hash ranges / doc router.
     if (!(restoreCollection.getRouter() instanceof ImplicitDocRouter) && random().nextBoolean()) {
-      indexDocs(restoreCollectionName);
+      indexDocs(restoreCollectionName, false);
       assertEquals(origShardToDocCount, getShardToDocCountMap(client, restoreCollection));
     }
 
@@ -327,6 +333,18 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa
 
     assertEquals("Restore collection should use stateFormat=2", 2, restoreCollection.getStateFormat());
 
+    //SOLR-12605: Add more docs after restore is complete to see if they are getting added fine
+    //explicitly querying the leaders. If we use CloudSolrClient there is no guarantee that we'll hit a nrtReplica
+    {
+      Map<String, Integer> restoredCollectionPerShardCount =  getShardToDocCountMap(client, restoreCollection);
+      long restoredCollectionDocCount = restoredCollectionPerShardCount.values().stream().mapToInt(Number::intValue).sum();
+      int numberNewDocsIndexed = indexDocs(restoreCollectionName, true);
+      Map<String, Integer> restoredCollectionPerShardCountAfterIndexing = getShardToDocCountMap(client, restoreCollection);
+      int restoredCollectionFinalDocCount = restoredCollectionPerShardCountAfterIndexing.values().stream().mapToInt(Number::intValue).sum();
+
+      log.info("Original doc count in restored collection:" + restoredCollectionDocCount + ", number of newly added documents to the restored collection: " + numberNewDocsIndexed + ", after indexing: " + restoredCollectionFinalDocCount);
+      assertEquals((restoredCollectionDocCount + numberNewDocsIndexed), restoredCollectionFinalDocCount);
+    }
 
     // assert added core properties:
     // DWS: did via manual inspection.


[2/2] lucene-solr:branch_7_3: SOLR-11724: Cdcr bootstrapping should ensure that non-leader replicas should sync with the leader

Posted by va...@apache.org.
SOLR-11724: Cdcr bootstrapping should ensure that non-leader replicas should sync with the leader


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8fa76874
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8fa76874
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8fa76874

Branch: refs/heads/branch_7_3
Commit: 8fa7687413558b3bc65cbbbeb722a21314187e6a
Parents: 8894db1
Author: Varun Thacker <va...@apache.org>
Authored: Fri Apr 13 12:07:42 2018 -0700
Committer: Varun Thacker <va...@apache.org>
Committed: Tue Apr 24 21:51:11 2018 -0700

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  3 ++
 .../solr/handler/CdcrReplicatorManager.java     | 29 +++++++++++++++
 .../solr/cloud/cdcr/CdcrBootstrapTest.java      |  5 +--
 .../apache/solr/cloud/cdcr/CdcrTestsUtil.java   | 38 ++++++++++++++++++++
 4 files changed, 73 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fa76874/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index cabdfcd..2976a6e 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -45,6 +45,9 @@ Bug Fixes
 * SOLR-12065: A successful restore collection should mark the shard state as active and not buffering
   (Rohit, Varun Thacker)
 
+* SOLR-11724: Cdcr bootstrapping should ensure that non-leader replicas should sync with the leader
+  (Amrit Sarkar, Varun Thacker)
+
 ==================  7.3.0 ==================
 
 Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fa76874/solr/core/src/java/org/apache/solr/handler/CdcrReplicatorManager.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/CdcrReplicatorManager.java b/solr/core/src/java/org/apache/solr/handler/CdcrReplicatorManager.java
index abd6ed7..8ec3c8b 100644
--- a/solr/core/src/java/org/apache/solr/handler/CdcrReplicatorManager.java
+++ b/solr/core/src/java/org/apache/solr/handler/CdcrReplicatorManager.java
@@ -20,6 +20,7 @@ import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
 import java.util.Locale;
@@ -36,11 +37,14 @@ import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.impl.CloudSolrClient.Builder;
 import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.apache.solr.client.solrj.request.CoreAdminRequest;
 import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkCoreNodeProps;
 import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.common.params.CoreAdminParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.ExecutorUtil;
@@ -298,6 +302,8 @@ class CdcrReplicatorManager implements CdcrStateManager.CdcrStateObserver {
                 checkpoint, collectionName, shard);
             CdcrUpdateLog.CdcrLogReader reader1 = ulog.newLogReader();
             reader1.seek(checkpoint);
+            // issue asynchronous request_recovery to the follower nodes of the shards of target collection
+            sendRequestRecoveryToFollowers(state);
             success = true;
             break;
           } else if (status == BootstrapStatus.FAILED) {
@@ -411,6 +417,29 @@ class CdcrReplicatorManager implements CdcrStateManager.CdcrStateObserver {
     return client.request(request);
   }
 
+  private void sendRequestRecoveryToFollowers(CdcrReplicatorState state) throws SolrServerException, IOException {
+    Collection<Slice> slices = state.getClient().getZkStateReader().getClusterState().getCollection(state.getTargetCollection()).getActiveSlices();
+    for (Slice slice : slices) {
+      Collection<Replica> replicas = slice.getReplicas();
+      for (Replica replica : replicas) {
+        if (slice.getLeader().getCoreName().equals(replica.getCoreName())) {
+          continue; // no need to request recovery for leader
+        }
+        sendRequestRecoveryToFollower(state.getClient(), replica.getCoreName());
+        log.info("RequestRecovery cmd is issued by core: " + replica.getCoreName() + " of shard: " + slice.getName() +
+            "for target: " + state.getTargetCollection());
+      }
+    }
+  }
+
+  private NamedList sendRequestRecoveryToFollower(SolrClient client, String coreName) throws SolrServerException, IOException {
+    CoreAdminRequest.RequestRecovery recoverRequestCmd = new CoreAdminRequest.RequestRecovery();
+    recoverRequestCmd.setAction(CoreAdminParams.CoreAdminAction.REQUESTRECOVERY);
+    recoverRequestCmd.setCoreName(coreName);
+    return client.request(recoverRequestCmd);
+  }
+
+
   private enum BootstrapStatus  {
     SUBMITTED,
     RUNNING,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fa76874/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrBootstrapTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrBootstrapTest.java b/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrBootstrapTest.java
index cae9855..543bd5c 100644
--- a/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrBootstrapTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrBootstrapTest.java
@@ -105,7 +105,8 @@ public class CdcrBootstrapTest extends SolrTestCaseJ4 {
 
         // setup the target cluster
         target.uploadConfigSet(configset("cdcr-target"), "cdcr-target");
-        CollectionAdminRequest.createCollection("cdcr-target", "cdcr-target", 1, 1)
+        CollectionAdminRequest.createCollection("cdcr-target", "cdcr-target", 1, 2)
+            .setMaxShardsPerNode(2)
             .process(target.getSolrClient());
         CloudSolrClient targetSolrClient = target.getSolrClient();
         targetSolrClient.setDefaultCollection("cdcr-target");
@@ -118,6 +119,7 @@ public class CdcrBootstrapTest extends SolrTestCaseJ4 {
         log.info("Cdcr queue response: " + response.getResponse());
         long foundDocs = CdcrTestsUtil.waitForClusterToSync(numDocs, targetSolrClient);
         assertEquals("Document mismatch on target after sync", numDocs, foundDocs);
+        assertTrue(CdcrTestsUtil.assertShardInSync("cdcr-target", "shard1", targetSolrClient)); // with more than 1 replica
 
         params = new ModifiableSolrParams();
         params.set(CommonParams.ACTION, CdcrParams.CdcrAction.COLLECTIONCHECKPOINT.toString());
@@ -300,5 +302,4 @@ public class CdcrBootstrapTest extends SolrTestCaseJ4 {
       target.shutdown();
     }
   }
-
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8fa76874/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrTestsUtil.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrTestsUtil.java b/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrTestsUtil.java
index 99aa471..6a186fd 100644
--- a/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrTestsUtil.java
+++ b/solr/core/src/test/org/apache/solr/cloud/cdcr/CdcrTestsUtil.java
@@ -26,11 +26,17 @@ import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.impl.HttpSolrClient;
 import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.handler.CdcrParams;
+import org.apache.solr.util.TimeOut;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -108,4 +114,36 @@ public class CdcrTestsUtil extends SolrTestCaseJ4{
     }
     return response != null ? response.getResults().getNumFound() : 0;
   }
+
+  protected static boolean assertShardInSync(String collection, String shard, CloudSolrClient client) throws IOException, SolrServerException {
+    TimeOut waitTimeOut = new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+    DocCollection docCollection = client.getZkStateReader().getClusterState().getCollection(collection);
+    Slice correctSlice = null;
+    for (Slice slice : docCollection.getSlices()) {
+      if (shard.equals(slice.getName())) {
+        correctSlice = slice;
+        break;
+      }
+    }
+    assertNotNull(correctSlice);
+
+    long leaderDocCount;
+    try (HttpSolrClient leaderClient = new HttpSolrClient.Builder(correctSlice.getLeader().getCoreUrl()).withHttpClient(client.getHttpClient()).build()) {
+      leaderDocCount = leaderClient.query(new SolrQuery("*:*").setParam("distrib", "false")).getResults().getNumFound();
+    }
+
+    while (!waitTimeOut.hasTimedOut()) {
+      int replicasInSync = 0;
+      for (Replica replica : correctSlice.getReplicas()) {
+        try (HttpSolrClient leaderClient = new HttpSolrClient.Builder(replica.getCoreUrl()).withHttpClient(client.getHttpClient()).build()) {
+          long replicaDocCount = leaderClient.query(new SolrQuery("*:*").setParam("distrib", "false")).getResults().getNumFound();
+          if (replicaDocCount == leaderDocCount) replicasInSync++;
+        }
+      }
+      if (replicasInSync == correctSlice.getReplicas().size()) {
+        return true;
+      }
+    }
+    return false;
+  }
 }