Posted to commits@lucene.apache.org by ma...@apache.org on 2020/11/16 11:24:45 UTC

[lucene-solr] 01/02: @1221 Enable basic move replica test.

This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 6c5f9a27bca5f2e8c6dc4bc6e085bbe696ed0f9c
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Sun Nov 15 22:36:46 2020 -0600

    @1221 Enable basic move replica test.
---
 .../org/apache/solr/cloud/RecoveryStrategy.java    |  2 +-
 .../solr/cloud/api/collections/MoveReplicaCmd.java | 99 ++++++++++------------
 .../java/org/apache/solr/core/CoreContainer.java   | 21 +----
 .../java/org/apache/solr/handler/IndexFetcher.java |  7 +-
 .../java/org/apache/solr/servlet/HttpSolrCall.java | 32 +++----
 .../org/apache/solr/cloud/MoveReplicaTest.java     | 26 ++----
 6 files changed, 77 insertions(+), 110 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index e203010..6d3ea41 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -635,7 +635,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
                                                                                             // though
       try {
         CloudDescriptor cloudDesc = this.coreDescriptor.getCloudDescriptor();
-        final Replica leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 15000);
+        final Replica leader = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId(), 3000);
 
         log.info("Begin buffering updates. core=[{}]", coreName);
         // recalling buffer updates will drop the old buffer tlog
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/MoveReplicaCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/MoveReplicaCmd.java
index a9c5a0b..5455221 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/MoveReplicaCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/MoveReplicaCmd.java
@@ -121,7 +121,8 @@ public class MoveReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
       Slice slice = coll.getSlice(shardId);
       List<Replica> sliceReplicas = new ArrayList<>(slice.getReplicas(r -> sourceNode.equals(r.getNodeName())));
       if (sliceReplicas.isEmpty()) {
-        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection: " + collection + " node: " + sourceNode + " does not have any replica belonging to shard: " + shardId);
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection: " + collection +
+            " node: " + sourceNode + " does not have any replica belonging to shard: " + shardId + " collection=" + coll);
       }
       Collections.shuffle(sliceReplicas, OverseerCollectionMessageHandler.RANDOM);
       replica = sliceReplicas.iterator().next();
@@ -156,24 +157,26 @@ public class MoveReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
     assert slice != null;
     Object dataDir = replica.get("dataDir");
     boolean isSharedFS = replica.getBool(ZkStateReader.SHARED_STORAGE_PROP, false) && dataDir != null;
-    OverseerCollectionMessageHandler.Finalize finalizer = null;
+
+    AddReplicaCmd.Response resp = null;
     if (isSharedFS && inPlaceMove) {
       log.debug("-- moveHdfsReplica");
+      // nocommit TODO
       moveHdfsReplica(clusterState, results, dataDir.toString(), targetNode, async, coll, replica, slice, timeout, waitForFinalState);
     } else {
       log.debug("-- moveNormalReplica (inPlaceMove={}, isSharedFS={}", inPlaceMove, isSharedFS);
-      finalizer = moveNormalReplica(clusterState, results, targetNode, async, coll, replica, slice, timeout, waitForFinalState);
+      resp = moveNormalReplica(clusterState, results, targetNode, async, coll, replica, slice, timeout, waitForFinalState);
     }
 
     AddReplicaCmd.Response response = new AddReplicaCmd.Response();
 
-    OverseerCollectionMessageHandler.Finalize finalIzer = finalizer;
+    OverseerCollectionMessageHandler.Finalize finalizer = resp.asyncFinalRunner;
     response.asyncFinalRunner = new OverseerCollectionMessageHandler.Finalize() {
       @Override
       public AddReplicaCmd.Response call() {
-        if (finalIzer != null) {
+        if (finalizer != null) {
           try {
-            finalIzer.call();
+            finalizer.call();
           } catch (Exception e) {
             log.error("Exception during MoveReplica", e);
           }
@@ -183,7 +186,7 @@ public class MoveReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
       }
     };
 
-    response.clusterState = clusterState;
+    response.clusterState = null;
 
     return response;
   }
@@ -281,7 +284,7 @@ public class MoveReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
   }
 
   @SuppressWarnings({"unchecked"})
-  private OverseerCollectionMessageHandler.Finalize moveNormalReplica(ClusterState clusterState, @SuppressWarnings({"rawtypes"}) NamedList results, String targetNode, String async, DocCollection coll,
+  private AddReplicaCmd.Response moveNormalReplica(ClusterState clusterState, @SuppressWarnings({"rawtypes"}) NamedList results, String targetNode, String async, DocCollection coll,
       Replica replica, Slice slice, int timeout, boolean waitForFinalState) throws Exception {
     String newCoreName = Assign.buildSolrCoreName(coll, coll.getName(), slice.getName(), replica.getType());
     ZkNodeProps addReplicasProps = new ZkNodeProps(COLLECTION_PROP, coll.getName(), SHARD_ID_PROP, slice.getName(), CoreAdminParams.NODE, targetNode, CoreAdminParams.NAME, newCoreName,
@@ -293,64 +296,56 @@ public class MoveReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
 
     AddReplicaCmd.Response response = ocmh.addReplicaWithResp(clusterState, addReplicasProps, addResult);
 
-    DocCollection docColl = response.clusterState.getCollectionOrNull(coll.getName());
-    Map<String, DocCollection> collectionStates;
-    if (docColl != null) {
-      collectionStates = new HashMap<>();
-      collectionStates.put(docColl.getName(), docColl);
-    } else {
-      collectionStates = new HashMap<>();
-    }
-    ClusterState cs = new ClusterState(response.clusterState.getLiveNodes(), collectionStates);
-    ocmh.overseer.getZkStateWriter().enqueueUpdate(cs, null,false);
+    ocmh.overseer.getZkStateWriter().enqueueUpdate(response.clusterState, null,false);
     ocmh.overseer.writePendingUpdates();
 
 
     // wait for the other replica to be active if the source replica was a leader
 
+    AddReplicaCmd.Response finalResponse = new AddReplicaCmd.Response();
+    
+    finalResponse.clusterState = response.clusterState;
 
+    finalResponse.asyncFinalRunner = () -> {
+      log.debug("Waiting for leader's replica to recover.");
 
-    return new OverseerCollectionMessageHandler.Finalize() {
-      @Override
-      public AddReplicaCmd.Response call() throws Exception {
-        log.debug("Waiting for leader's replica to recover.");
+      response.asyncFinalRunner.call();
+
+      if (addResult.get("failure") != null) {
+        String errorString = String
+            .format(Locale.ROOT, "Failed to create replica for collection=%s shard=%s" + " on node=%s, failure=%s", coll.getName(), slice.getName(), targetNode, addResult.get("failure"));
+        log.warn(errorString);
+        results.add("failure", errorString);
+
+        AddReplicaCmd.Response response1 = new AddReplicaCmd.Response();
+        return response1;
+      } else {
 
-        response.asyncFinalRunner.call();
+        AddReplicaCmd.Response response1 = new AddReplicaCmd.Response();
 
-        if (addResult.get("failure") != null) {
-          String errorString = String
-              .format(Locale.ROOT, "Failed to create replica for collection=%s shard=%s" + " on node=%s, failure=%s", coll.getName(), slice.getName(), targetNode, addResult.get("failure"));
+        ZkNodeProps removeReplicasProps = new ZkNodeProps(COLLECTION_PROP, coll.getName(), SHARD_ID_PROP, slice.getName(), REPLICA_PROP, replica.getName());
+        if (async != null) removeReplicasProps.getProperties().put(ASYNC, async);
+        @SuppressWarnings({"rawtypes"}) NamedList deleteResult = new NamedList();
+        try {
+          response1.clusterState = ocmh.deleteReplica(clusterState, removeReplicasProps, deleteResult).clusterState;
+        } catch (SolrException e) {
+          deleteResult.add("failure", e.toString());
+        }
+        if (deleteResult.get("failure") != null) {
+          String errorString = String.format(Locale.ROOT, "Failed to cleanup replica collection=%s shard=%s name=%s, failure=%s", coll.getName(), slice.getName(), replica.getName(), deleteResult.get("failure"));
           log.warn(errorString);
           results.add("failure", errorString);
-
-          AddReplicaCmd.Response response = new AddReplicaCmd.Response();
-          return response;
         } else {
-
-          AddReplicaCmd.Response response = new AddReplicaCmd.Response();
-
-          ZkNodeProps removeReplicasProps = new ZkNodeProps(COLLECTION_PROP, coll.getName(), SHARD_ID_PROP, slice.getName(), REPLICA_PROP, replica.getName());
-          if (async != null) removeReplicasProps.getProperties().put(ASYNC, async);
-          @SuppressWarnings({"rawtypes"}) NamedList deleteResult = new NamedList();
-          try {
-            response.clusterState = ocmh.deleteReplica(clusterState, removeReplicasProps, deleteResult).clusterState;
-          } catch (SolrException e) {
-            deleteResult.add("failure", e.toString());
-          }
-          if (deleteResult.get("failure") != null) {
-            String errorString = String.format(Locale.ROOT, "Failed to cleanup replica collection=%s shard=%s name=%s, failure=%s", coll.getName(), slice.getName(), replica.getName(), deleteResult.get("failure"));
-            log.warn(errorString);
-            results.add("failure", errorString);
-          } else {
-            String successString = String
-                .format(Locale.ROOT, "MOVEREPLICA action completed successfully, moved replica=%s at node=%s " + "to replica=%s at node=%s", replica.getName(), replica.getNodeName(), newCoreName,
-                    targetNode);
-            results.add("success", successString);
-          }
-
-          return response;
+          String successString = String
+              .format(Locale.ROOT, "MOVEREPLICA action completed successfully, moved replica=%s at node=%s " + "to replica=%s at node=%s", replica.getName(), replica.getNodeName(), newCoreName,
+                  targetNode);
+          results.add("success", successString);
         }
+
+        return response1;
       }
     };
+    
+    return finalResponse;
   }
 }
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index 7d74696..4d4c36f 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -1346,11 +1346,11 @@ public class CoreContainer implements Closeable {
         core = processCoreCreateException(e, dcore, coreConfig);
       }
 
+      core.start();
 
       registerCore(dcore, core, isZooKeeperAware(), false);
       registered = true;
 
-      core.start();
 
       // always kick off recovery if we are in non-Cloud mode
       if (!isZooKeeperAware() && core.getUpdateHandler().getUpdateLog() != null) {
@@ -1782,16 +1782,7 @@ public class CoreContainer implements Closeable {
       if (cd == null) {
         throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
       }
-      if (cd != null) {
-        SolrCore.deleteUnloadedCore(cd, deleteDataDir, deleteInstanceDir);
-        solrCores.removeCoreDescriptor(cd);
-        coresLocator.delete(this, cd);
-        if (core == null) {
-          // transient core
-          SolrCore.deleteUnloadedCore(cd, deleteDataDir, deleteInstanceDir);
-          return;
-        }
-      }
+
     } finally {
       if (isZooKeeperAware()) {
         // cancel recovery in cloud mode
@@ -1838,14 +1829,6 @@ public class CoreContainer implements Closeable {
     } catch (TimeoutException e) {
       log.error("Timeout waiting for SolrCore close on unload", e);
       throw new SolrException(ErrorCode.SERVER_ERROR, "Timeout waiting for SolrCore close on unload", e);
-    } finally {
-      if (deleteInstanceDir && cd != null) {
-        try {
-          FileUtils.deleteDirectory(cd.getInstanceDir().toFile());
-        } catch (IOException e) {
-          SolrException.log(log, "Failed to delete instance dir for core:" + cd.getName() + " dir:" + cd.getInstanceDir());
-        }
-      }
     }
   }
 
diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
index c2fa9f6..bbe8dff 100644
--- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
+++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
@@ -731,7 +731,12 @@ public class IndexFetcher {
         }
         try {
           if (indexDir != null) {
-            core.getDirectoryFactory().release(indexDir);
+            try {
+              core.getDirectoryFactory().release(indexDir);
+            } catch (IllegalArgumentException e) {
+              if (log.isDebugEnabled()) log.debug("Error releasing directory in IndexFetcher", e);
+              // could already be removed
+            }
           }
         } catch (Exception e) {
           SolrException.log(log, e);
diff --git a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java
index cfa4d7b..589695a 100644
--- a/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java
+++ b/solr/core/src/java/org/apache/solr/servlet/HttpSolrCall.java
@@ -296,7 +296,7 @@ public class HttpSolrCall {
         } else {
           // if we couldn't find it locally, look on other nodes
           if (idx > 0) {
-            extractRemotePath(collectionName, origCorename);
+            extractRemotePath(null, origCorename);
             if (action == REMOTEQUERY) {
               path = path.substring(idx);
               return;
@@ -1040,7 +1040,9 @@ public class HttpSolrCall {
     return result;
   }
 
-  protected SolrCore getCoreByCollection(String collectionName, boolean isPreferLeader) {
+  protected SolrCore getCoreByCollection(String collectionName, boolean isPreferLeader) throws TimeoutException, InterruptedException {
+    ensureStatesAreAtLeastAtClient();
+
     ZkStateReader zkStateReader = cores.getZkController().getZkStateReader();
 
     ClusterState clusterState = zkStateReader.getClusterState();
@@ -1107,12 +1109,11 @@ public class HttpSolrCall {
     final DocCollection docCollection = clusterState.getCollectionOrNull(collectionName, false);
     Collection<Slice> slices = (docCollection != null) ? docCollection.getActiveSlices() : null;
     List<Slice> activeSlices = new ArrayList<>();
-    boolean byCoreName = false;
 
     int totalReplicas = 0;
 
     if (slices == null) {
-      byCoreName = true;
+
       activeSlices = new ArrayList<>();
       getSlicesForCollections(clusterState, activeSlices, true);
       if (activeSlices.isEmpty()) {
@@ -1135,8 +1136,8 @@ public class HttpSolrCall {
       collectionsList = new ArrayList<>(collectionsList);
       collectionsList.add(collectionName);
     }
-    String coreUrl = getCoreUrl(collectionName, origCorename, clusterState,
-        activeSlices, byCoreName, true);
+    String coreUrl = getCoreUrl(origCorename,
+        activeSlices, true);
 
     // Avoid getting into a recursive loop of requests being forwarded by
     // stopping forwarding and erroring out after (totalReplicas) forwards
@@ -1145,16 +1146,14 @@ public class HttpSolrCall {
         throw new SolrException(SolrException.ErrorCode.INVALID_STATE,
             "No active replicas found for collection: " + collectionName);
       }
-      coreUrl = getCoreUrl(collectionName, origCorename, clusterState,
-          activeSlices, byCoreName, false);
+      coreUrl = getCoreUrl(origCorename,
+          activeSlices, false);
     }
 
     return coreUrl;
   }
 
-  private String getCoreUrl(String collectionName,
-                            String origCorename, ClusterState clusterState, List<Slice> slices,
-                            boolean byCoreName, boolean activeReplicas) {
+  private String getCoreUrl(String origCorename, List<Slice> slices, boolean activeReplicas) {
     String coreUrl;
 
     Collections.shuffle(slices, random);
@@ -1167,7 +1166,7 @@ public class HttpSolrCall {
         if (!activeReplicas || (cores.getZkController().zkStateReader.getLiveNodes().contains(replica.getNodeName())
             && replica.getState() == Replica.State.ACTIVE)) {
 
-          if (byCoreName && (origCorename == null || !origCorename.equals(replica.getStr(CORE_NAME_PROP)))) {
+          if (!origCorename.equals(replica.getStr(CORE_NAME_PROP))) {
             // if it's by core name, make sure they match
             continue;
           }
@@ -1176,14 +1175,7 @@ public class HttpSolrCall {
             continue;
           }
 
-          if (origCorename != null) {
-            coreUrl = replica.getBaseUrl() + "/" + origCorename;
-          } else {
-            coreUrl = replica.getCoreUrl();
-            if (coreUrl.endsWith("/")) {
-              coreUrl = coreUrl.substring(0, coreUrl.length() - 1);
-            }
-          }
+          coreUrl = replica.getCoreUrl();
 
           return coreUrl;
         }
diff --git a/solr/core/src/test/org/apache/solr/cloud/MoveReplicaTest.java b/solr/core/src/test/org/apache/solr/cloud/MoveReplicaTest.java
index f2d0d73..f197d8c 100644
--- a/solr/core/src/test/org/apache/solr/cloud/MoveReplicaTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/MoveReplicaTest.java
@@ -53,7 +53,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 @LuceneTestCase.SuppressCodecs({"MockRandom", "Direct", "SimpleText"})
-@Ignore // nocommit
 public class MoveReplicaTest extends SolrCloudTestCase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
@@ -119,9 +118,6 @@ public class MoveReplicaTest extends SolrCloudTestCase {
     create.setMaxShardsPerNode(2);
     cloudClient.request(create);
 
-    // wait for recovery
-    cluster.waitForActiveCollection(coll, create.getNumShards(), create.getNumShards() * create.getTotaleReplicaCount());
-
     addDocs(coll, 100);
 
     Replica replica = getRandomReplica(coll, cloudClient);
@@ -153,20 +149,17 @@ public class MoveReplicaTest extends SolrCloudTestCase {
     CollectionAdminRequest.RequestStatus requestStatus = CollectionAdminRequest.requestStatus(asyncId);
     // wait for async request success
     boolean success = false;
-    for (int i = 0; i < 600; i++) {
+    for (int i = 0; i < 200; i++) {
       CollectionAdminRequest.RequestStatusResponse rsp = requestStatus.process(cloudClient);
       if (rsp.getRequestStatus() == RequestStatusState.COMPLETED) {
         success = true;
         break;
       }
       assertNotSame(rsp.getRequestStatus(), RequestStatusState.FAILED);
-      Thread.sleep(250);
+      Thread.sleep(50);
     }
     assertTrue(success);
 
-    // wait for recovery
-    cluster.waitForActiveCollection(coll, create.getNumShards(), create.getNumShards() * (create.getNumNrtReplicas() + create.getNumPullReplicas() + create.getNumTlogReplicas()));
-
     assertEquals(100,  cluster.getSolrClient().query(coll, new SolrQuery("*:*")).getResults().getNumFound());
 
 //    assertEquals("should be one less core on the source node!", sourceNumCores - 1, getNumOfCores(cloudClient, replica.getNodeName(), coll, replica.getType().name()));
@@ -183,14 +176,13 @@ public class MoveReplicaTest extends SolrCloudTestCase {
     }
     assertNotNull(targetNode);
 
-    // nocommit  I think above get node logic is flakey Collection: MoveReplicaTest_coll_true node: 127.0.0.1:35129_solr does not have any replica belonging to shard: s1
-//    moveReplica = createMoveReplicaRequest(coll, replica, targetNode, shardId);
-//    moveReplica.setInPlaceMove(inPlaceMove);
-//    moveReplica.process(cloudClient);
-//
-//    assertEquals(100, cluster.getSolrClient().query(coll, new SolrQuery("*:*")).getResults().getNumFound());
-//
-//    checkNumOfCores(cloudClient, replica.getNodeName(), coll, sourceNumCores);
+    moveReplica = createMoveReplicaRequest(coll, replica, targetNode, shardId);
+    moveReplica.setInPlaceMove(inPlaceMove);
+    moveReplica.process(cloudClient);
+
+    assertEquals(100, cluster.getSolrClient().query(coll, new SolrQuery("*:*")).getResults().getNumFound());
+
+    checkNumOfCores(cloudClient, replica.getNodeName(), coll, sourceNumCores);
   }
 
   //Commented out 5-Dec-2017