You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2016/04/14 07:37:48 UTC

[7/7] lucene-solr:solr-5750: Enhance restore: * Restored conf name is configurable; won't overwrite existing. Defaults to original. * Customize replicationFactor and some other settings on restoration. * Shard/slice info is restored instead of reconstituted.

Enhance restore:
* Restored conf name is configurable; won't overwrite existing. Defaults to original.
* Customize replicationFactor and some other settings on restoration.
* Shard/slice info is restored instead of reconstituted.  Thus shard hash ranges (from e.g. shard split) are restored.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/e8d29686
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/e8d29686
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/e8d29686

Branch: refs/heads/solr-5750
Commit: e8d296864c1fdab023624c12f7350a7a79c50ce8
Parents: 7e80651
Author: David Smiley <ds...@apache.org>
Authored: Thu Apr 14 01:37:33 2016 -0400
Committer: David Smiley <ds...@apache.org>
Committed: Thu Apr 14 01:37:33 2016 -0400

----------------------------------------------------------------------
 .../cloud/OverseerCollectionMessageHandler.java | 213 +++++++++----------
 .../solr/cloud/TestCloudBackupRestore.java      | 165 ++++++++++----
 2 files changed, 220 insertions(+), 158 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8d29686/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java
index 8a35b6d..1832ead 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java
@@ -16,24 +16,19 @@
  */
 package org.apache.solr.cloud;
 
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
+import java.io.Reader;
+import java.io.Writer;
 import java.lang.invoke.MethodHandles;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.text.SimpleDateFormat;
+import java.time.Instant;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -52,7 +47,6 @@ import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.HttpSolrClient;
 import org.apache.solr.client.solrj.impl.HttpSolrClient.RemoteSolrException;
 import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
-import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.CoreAdminRequest;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.UpdateResponse;
@@ -85,12 +79,10 @@ import org.apache.solr.common.params.CoreAdminParams;
 import org.apache.solr.common.params.CoreAdminParams.CoreAdminAction;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.ShardParams;
-import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.common.util.Utils;
-import org.apache.solr.handler.SnapShooter;
 import org.apache.solr.handler.component.ShardHandler;
 import org.apache.solr.handler.component.ShardHandlerFactory;
 import org.apache.solr.handler.component.ShardRequest;
@@ -111,9 +103,9 @@ import static org.apache.solr.common.cloud.DocCollection.SNITCH;
 import static org.apache.solr.common.cloud.ZkStateReader.BASE_URL_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
+import static org.apache.solr.common.cloud.ZkStateReader.CORE_NODE_NAME_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.ELECTION_NODE_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE;
-import static org.apache.solr.common.cloud.ZkStateReader.CORE_NODE_NAME_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_VALUE_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.REJOIN_AT_HEAD_PROP;
@@ -333,86 +325,90 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
 
   //TODO move this out into it's own class.
   private void processRestoreAction(ZkNodeProps message, NamedList results) throws IOException, KeeperException, InterruptedException {
+    // TODO maybe we can inherit createCollection's options/code
     String restoreCollectionName =  message.getStr(COLLECTION_PROP);
-    String name =  message.getStr(NAME);
+    String backupName =  message.getStr(NAME); // of backup
     String location = message.getStr("location");
     ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
-    final String asyncId = message.getStr(ASYNC);
+    String asyncId = message.getStr(ASYNC);
     Map<String, String> requestMap = new HashMap<>();
 
-    Path backupPath = Paths.get(location).resolve(name).toAbsolutePath();
-
+    Path backupPath = Paths.get(location).resolve(backupName).toAbsolutePath();
+    if (!Files.exists(backupPath)) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Couldn't restore since doesn't exist: " + backupPath);
+    }
     Path backupZkPath =  backupPath.resolve("zk_backup");
 
-    Path backupZkPropsPath = backupZkPath.resolve("backup.properties");
-
     Properties properties = new Properties();
-    try (InputStream in = Files.newInputStream(backupZkPropsPath)) {
-      properties.load(new InputStreamReader(in, StandardCharsets.UTF_8));
-    } catch (IOException e) {
-      String errorMsg = String.format(Locale.ROOT, "Could not load properties from %s: %s:",
-          backupZkPropsPath.toAbsolutePath().toString(), e.toString());
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, errorMsg);
+    try (Reader in = Files.newBufferedReader(backupZkPath.resolve("backup.properties"), StandardCharsets.UTF_8)) {
+      properties.load(in);
     }
 
-    String backupCollection = (String) properties.get("collectionName");
-    Path collectionStatePath = backupZkPath.resolve("collection_state_backup.json");
-    byte[] data = Files.readAllBytes(collectionStatePath);
-    @SuppressWarnings("unchecked")
-    Map<String, Object> collectionProps = (Map<String, Object>) ((Map<String, Object>) Utils.fromJSON(data)).get(backupCollection);
-
-    //Download the configs
-    String configName = (String) properties.get("collection.configName");
+    String backupCollection = (String) properties.get("collection");
+    byte[] data = Files.readAllBytes(backupZkPath.resolve("collection_state_backup.json"));
+    ClusterState backupClusterState = ClusterState.load(-1, data, Collections.emptySet());
+    DocCollection backupCollectionState = backupClusterState.getCollection(backupCollection);
 
-    //Use a name such as restore.<restore_name>.<original_config_name>
-    // in ZK for the configs
-    String restoreConfigName = "restore." + configName;
-    zkStateReader.getConfigManager().uploadConfigDir(backupZkPath.resolve("configs").resolve(configName), restoreConfigName);
+    //Upload the configs
+    String configName = (String) properties.get(COLL_CONF);
+    String restoreConfigName = message.getStr(COLL_CONF, configName);
+    if (zkStateReader.getConfigManager().configExists(restoreConfigName)) {
+      log.info("Using existing config {}", restoreConfigName);
+      //TODO add overwrite option?
+    } else {
+      log.info("Uploading config {}", restoreConfigName);
+      zkStateReader.getConfigManager().uploadConfigDir(backupZkPath.resolve("configs").resolve(configName), restoreConfigName);
+    }
 
-    log.debug("Starting restore into collection={} with backup_name={} at location={}", restoreCollectionName, name,
+    log.info("Starting restore into collection={} with backup_name={} at location={}", restoreCollectionName, backupName,
         backupPath);
 
     //Create core-less collection
     {
       Map<String, Object> propMap = new HashMap<>();
+      propMap.put(Overseer.QUEUE_OPERATION, CREATE.toString());
+      propMap.put("fromApi", "true"); // mostly true.  Prevents autoCreated=true in the collection state.
+
+      // inherit settings from input API, defaulting to the backup's setting.  Ex: replicationFactor
+      for (String collProp : COLL_PROPS.keySet()) {
+        Object val = message.getProperties().getOrDefault(collProp, backupCollectionState.get(collProp));
+        if (val != null) {
+          propMap.put(collProp, val);
+        }
+      }
+
       propMap.put(NAME, restoreCollectionName);
       propMap.put(CREATE_NODE_SET, CREATE_NODE_SET_EMPTY); //no cores
-      propMap.put("collection.configName", restoreConfigName);
-      // add async param
-      if (asyncId != null) {
-        propMap.put(ASYNC, asyncId);
+      propMap.put(COLL_CONF, restoreConfigName);
+
+      // router.*
+      @SuppressWarnings("unchecked")
+      Map<String, Object> routerProps = (Map<String, Object>) backupCollectionState.getProperties().get(DocCollection.DOC_ROUTER);
+      for (Map.Entry<String, Object> pair : routerProps.entrySet()) {
+        propMap.put(DocCollection.DOC_ROUTER + "." + pair.getKey(), pair.getValue());
       }
-      String router = (String) ((Map)collectionProps.get("router")).get("name");
-      propMap.put("router.name", router);
-      Map slices = (Map) collectionProps.get(SHARDS_PROP);
-      if (slices != null) { //Implicit routers may not have a shards defined
-        propMap.put(NUM_SLICES, slices.size());
-      }
-      if (ImplicitDocRouter.NAME.equals(router)) {
-        Iterator keys = ((Map) collectionProps.get(SHARDS_PROP)).keySet().iterator();
-        StringBuilder shardsBuilder = new StringBuilder();
-        while (keys.hasNext()) {
-          String shard = (String) keys.next();
-          shardsBuilder.append(shard);
-          shardsBuilder.append(",");
+
+      Set<String> sliceNames = backupCollectionState.getActiveSlicesMap().keySet();
+      if (backupCollectionState.getRouter() instanceof ImplicitDocRouter) {
+        propMap.put(SHARDS_PROP, StrUtils.join(sliceNames, ','));
+      } else {
+        propMap.put(NUM_SLICES, sliceNames.size());
+        // ClusterStateMutator.createCollection detects that "slices" is in fact a slice structure instead of a
+        //   list of names, and if so uses this instead of building it.  We clear the replica list.
+        Collection<Slice> backupSlices = backupCollectionState.getActiveSlices();
+        Map<String,Slice> newSlices = new LinkedHashMap<>(backupSlices.size());
+        for (Slice backupSlice : backupSlices) {
+          newSlices.put(backupSlice.getName(),
+              new Slice(backupSlice.getName(), Collections.emptyMap(), backupSlice.getProperties()));
         }
-        String shards = shardsBuilder.deleteCharAt(shardsBuilder.length()-1).toString();//delete trailing comma
-        propMap.put(SHARDS_PROP, shards);
+        propMap.put(SHARDS_PROP, newSlices);
       }
-      // TODO nocommit loop all from a list and blacklist those we know could never work.
-      //     The user could always edit the properties file if they need to.
-      propMap.put(MAX_SHARDS_PER_NODE, Integer.parseInt((String) collectionProps.get(MAX_SHARDS_PER_NODE)));
-      propMap.put(ZkStateReader.AUTO_ADD_REPLICAS, Boolean.parseBoolean((String) collectionProps.get(ZkStateReader.AUTO_ADD_REPLICAS)));
-      propMap.put(Overseer.QUEUE_OPERATION, CREATE.toString());
 
       createCollection(zkStateReader.getClusterState(), new ZkNodeProps(propMap), new NamedList());
+      // note: when createCollection() returns, the collection exists (no race)
     }
 
-    //No need to wait. CreateCollection takes care of it by calling waitToSeeReplicasInState()
     DocCollection restoreCollection = zkStateReader.getClusterState().getCollection(restoreCollectionName);
-    if (restoreCollection == null) {
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Could not create restore collection");
-    }
 
     DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
 
@@ -427,18 +423,22 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
       inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
     }
 
+    // TODO how do we leverage the CREATE_NODE_SET / RULE / SNITCH logic in createCollection?
+
     ClusterState clusterState = zkStateReader.getClusterState();
     //Create one replica per shard and copy backed up data to it
     for (Slice slice: restoreCollection.getSlices()) {
       log.debug("Adding replica for shard={} collection={} ", slice.getName(), restoreCollection);
       HashMap<String, Object> propMap = new HashMap<>();
-      propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower());
+      propMap.put(Overseer.QUEUE_OPERATION, CREATESHARD);
       propMap.put(COLLECTION_PROP, restoreCollectionName);
       propMap.put(SHARD_ID_PROP, slice.getName());
       // add async param
       if (asyncId != null) {
         propMap.put(ASYNC, asyncId);
       }
+      addPropertyParams(message, propMap);
+
       addReplica(clusterState, new ZkNodeProps(propMap), new NamedList());
     }
 
@@ -456,7 +456,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
     processResponses(new NamedList(), shardHandler, true, "Could not restore core", asyncId, requestMap);
 
     //Mark all shards in ACTIVE STATE
-
     {
       HashMap<String, Object> propMap = new HashMap<>();
       propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
@@ -470,18 +469,11 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
     //refresh the location copy of collection state
     restoreCollection = zkStateReader.getClusterState().getCollection(restoreCollectionName);
 
-    //Update the replicationFactor to be 1 as that's what it is currently. Otherwise addreplica assigns wrong core names
-    {
-      HashMap<String, Object> propMap = new HashMap<>();
-      propMap.put(Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.MODIFYCOLLECTION.toLower());
-      propMap.put(COLLECTION_PROP, restoreCollectionName);
-      propMap.put(REPLICATION_FACTOR, 1);
-      inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
-    }
-
     //Add the remaining replicas for each shard
-    int numReplicas = Integer.parseInt((String) collectionProps.get(REPLICATION_FACTOR));
-    if (numReplicas > 1) {
+    Integer numReplicas = restoreCollection.getReplicationFactor();
+    if (numReplicas != null && numReplicas > 1) {
+      log.info("Adding replicas to restored collection={}", restoreCollection);
+
       for (Slice slice: restoreCollection.getSlices()) {
         for(int i=1; i<numReplicas; i++) {
           log.debug("Adding replica for shard={} collection={} ", slice.getName(), restoreCollection);
@@ -492,19 +484,14 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
           if (asyncId != null) {
             propMap.put(ASYNC, asyncId);
           }
+          addPropertyParams(message, propMap);
+
           addReplica(zkStateReader.getClusterState(), new ZkNodeProps(propMap), results);
         }
       }
-
-      //Update the replicationFactor property in cluster state for this collection
-      log.info("Modifying replication factor to the expected value of={}", numReplicas);
-      HashMap<String, Object>propMap = new HashMap<>();
-      propMap.put(Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.MODIFYCOLLECTION.toLower());
-      propMap.put(COLLECTION_PROP, restoreCollectionName);
-      propMap.put(REPLICATION_FACTOR, numReplicas);
-      inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
-      //nocommit do we need to wait for the result to be done before returning?
     }
+
+    log.info("Completed restoring collection={} backup={}", restoreCollection, backupName);
   }
 
   private void processBackupAction(ZkNodeProps message, NamedList results) throws IOException, KeeperException, InterruptedException {
@@ -513,11 +500,10 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
     String location = message.getStr("location");
     ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
     String asyncId = message.getStr(ASYNC);
-    Map<String, String> requestMap = null;
-    if (asyncId != null) {
-      requestMap = new HashMap<>();
-    }
+    Map<String, String> requestMap = new HashMap<>();
+    Instant startTime = Instant.now();
 
+    // note: we assume a shared file system to back up a collection, since a collection is distributed
     Path backupPath = Paths.get(location).resolve(name).toAbsolutePath();
 
     //Validating if the directory already exists.
@@ -525,20 +511,21 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
           "Backup directory already exists: " + backupPath);
     }
+    Files.createDirectory(backupPath); // create now
 
-    log.debug("Starting backup of collection={} with backup_name={} at location={}", collectionName, name,
+    log.info("Starting backup of collection={} with backup_name={} at location={}", collectionName, name,
         backupPath);
 
-    for (Slice slice : zkStateReader.getClusterState().getActiveSlices(collectionName)) {
+    for (Slice slice : zkStateReader.getClusterState().getCollection(collectionName).getActiveSlices()) {
       Replica replica = slice.getLeader();
 
       String coreName = replica.getStr(CORE_NAME_PROP);
 
       ModifiableSolrParams params = new ModifiableSolrParams();
+      params.set(CoreAdminParams.ACTION, CoreAdminAction.BACKUPCORE.toString());
       params.set(NAME, slice.getName());
-      params.set("location", backupPath.toString());
+      params.set("location", backupPath.toString()); // note: index dir will be here then the "snapshot." + slice name
       params.set(CORE_NAME_PROP, coreName);
-      params.set(CoreAdminParams.ACTION, CoreAdminAction.BACKUPCORE.toString());
 
       sendShardRequest(replica.getNodeName(), params, shardHandler, asyncId, requestMap);
       log.debug("Sent backup request to core={} for backup_name={}", coreName, name);
@@ -547,7 +534,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
 
     processResponses(results, shardHandler, true, "Could not backup all replicas", asyncId, requestMap);
 
-    log.debug("Starting to backup ZK data for backup_name={}", name);
+    log.info("Starting to backup ZK data for backup_name={}", name);
 
     //Download the configs
     String configName = zkStateReader.readConfigName(collectionName);
@@ -556,32 +543,27 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
 
     //Save the collection's state. Can be part of the monolithic clusterstate.json or a individual state.json
     //Since we don't want to distinguish we extract the state and back it up as a separate json
-    Path collectionStatePath = zkBackup.resolve("collection_state_backup.json");
     DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
-    byte[] bytes = Utils.toJSON(Collections.singletonMap(collectionName, collection));
-    Files.write(collectionStatePath, bytes);
+    Files.write(zkBackup.resolve("collection_state_backup.json"),//nocommit or simply clusterstate.json?
+        Utils.toJSON(Collections.singletonMap(collectionName, collection)));
 
+    //nocommit why is it stored in zk_backup; shouldn't it be in backupPath?
     Path propertiesPath = zkBackup.resolve("backup.properties");
     Properties properties = new Properties();
-    properties.put("collectionName", collectionName);
-    properties.put("snapshotName", name);
+
+    properties.put("snapshotName", name); //nocommit or simply "backupName" (consistent with "backup" term?)
+    properties.put("collection", collectionName);
     properties.put("collection.configName", configName);
-    properties.put("stateFormat", Integer.toString(collection.getStateFormat()));
-    properties.put("startTime", new SimpleDateFormat(SnapShooter.DATE_FMT, Locale.ROOT).format(new Date()));
+    properties.put("startTime", startTime.toString());
     //TODO: Add MD5 of the configset. If during restore the same name configset exists then we can compare checksums to see if they are the same.
-    //if they are not the same then we can throw and error or have a 'overwriteConfig' flag
+    //if they are not the same then we can throw an error or have an 'overwriteConfig' flag
     //TODO save numDocs for the shardLeader. We can use it to sanity check the restore.
 
-    try (OutputStreamWriter os = new OutputStreamWriter(
-        new FileOutputStream(propertiesPath.toAbsolutePath().toString()), StandardCharsets.UTF_8)) {
+    try (Writer os = Files.newBufferedWriter(propertiesPath, StandardCharsets.UTF_8)) {
       properties.store(os, "Snapshot properties file");
-    } catch (IOException e) {
-      String errorMsg = String.format(Locale.ROOT, "Could not write properties to %s: %s:",
-          propertiesPath.toAbsolutePath().toString(), e.toString());
-      throw new SolrException(ErrorCode.SERVER_ERROR, errorMsg);
     }
 
-    log.debug("Completed backing up ZK data for backup={}", name);
+    log.info("Completed backing up ZK data for backup={}", name);
   }
 
   @SuppressWarnings("unchecked")
@@ -2033,6 +2015,15 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
     }
   }
 
+  private void addPropertyParams(ZkNodeProps message, Map<String,Object> map) {
+    // Now add the property.key=value pairs
+    for (String key : message.keySet()) {
+      if (key.startsWith(COLL_PROP_PREFIX)) {
+        map.put(key, message.getStr(key));
+      }
+    }
+  }
+
   private static List<String> getLiveOrLiveAndCreateNodeSetList(final Set<String> liveNodes, final ZkNodeProps message, final Random random) {
     // TODO: add smarter options that look at the current number of cores per
     // node?

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/e8d29686/solr/core/src/test/org/apache/solr/cloud/TestCloudBackupRestore.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudBackupRestore.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudBackupRestore.java
index 7dbefda..f0817e2 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestCloudBackupRestore.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudBackupRestore.java
@@ -17,13 +17,24 @@
 
 package org.apache.solr.cloud;
 
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Random;
+import java.util.TreeMap;
+
 import org.apache.lucene.util.TestUtil;
 import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.ImplicitDocRouter;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.NamedList;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -36,68 +47,101 @@ public class TestCloudBackupRestore extends SolrCloudTestCase {
 
   private static Logger log = LoggerFactory.getLogger(TestCloudBackupRestore.class);
 
-  private static final int NUM_SHARDS = 2;
+  private static final int NUM_SHARDS = 2;//granted we sometimes shard split to get more
+
+  private static long docsSeed; // see indexDocs()
 
   @BeforeClass
   public static void createCluster() throws Exception {
-    configureCluster(2)
+    configureCluster(2)// nodes
         .addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
         .configure();
+
+    docsSeed = random().nextLong();
   }
 
   @Test
   public void test() throws Exception {
     String collectionName = "backuprestore";
-    String restoreCollectionName = collectionName + "_restored";
     boolean isImplicit = random().nextBoolean();
-    int numReplicas = TestUtil.nextInt(random(), 1, 2);
+    int replFactor = TestUtil.nextInt(random(), 1, 2);
     CollectionAdminRequest.Create create =
-        CollectionAdminRequest.createCollection(collectionName, "conf1", NUM_SHARDS, numReplicas);
-    create.setMaxShardsPerNode(NUM_SHARDS);
+        CollectionAdminRequest.createCollection(collectionName, "conf1", NUM_SHARDS, replFactor);
+    if (NUM_SHARDS * replFactor > cluster.getJettySolrRunners().size() || random().nextBoolean()) {
+      create.setMaxShardsPerNode(NUM_SHARDS);//just to assert it survives the restoration
+    }
+    if (random().nextBoolean()) {
+      create.setAutoAddReplicas(true);//just to assert it survives the restoration
+    }
+    Properties coreProps = new Properties();
+    coreProps.put("customKey", "customValue");//just to assert it survives the restoration
+    create.setProperties(coreProps);
     if (isImplicit) { //implicit router
       create.setRouterName(ImplicitDocRouter.NAME);
-      create.setNumShards(null);//erase it
+      create.setNumShards(null);//erase it. TODO suggest a new createCollectionWithImplicitRouter method
       create.setShards("shard1,shard2"); // however still same number as NUM_SHARDS; we assume this later
       create.setRouterField("shard_s");
+    } else {//composite id router
+      if (random().nextBoolean()) {
+        create.setRouterField("shard_s");
+      }
     }
-//TODO nocommit test shard split & custom doc route?
+
     create.process(cluster.getSolrClient());
     waitForCollection(collectionName);
     indexDocs(collectionName);
-    testBackupAndRestore(collectionName, restoreCollectionName, isImplicit);
+
+    if (!isImplicit && random().nextBoolean()) {
+      // shard split the first shard
+      int prevActiveSliceCount = getActiveSliceCount(collectionName);
+      CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(collectionName);
+      splitShard.setShardName("shard1");
+      splitShard.process(cluster.getSolrClient());
+      // wait until we see one more active slice...
+      for (int i = 0; getActiveSliceCount(collectionName) != prevActiveSliceCount + 1; i++) {
+        assertTrue(i < 30);
+        Thread.sleep(500);
+      }
+      // issue a hard commit.  Split shard does a soft commit which isn't good enough for the backup/snapshooter to see
+      cluster.getSolrClient().commit();
+    }
+
+    testBackupAndRestore(collectionName);
+  }
+
+  private int getActiveSliceCount(String collectionName) {
+    return cluster.getSolrClient().getZkStateReader().getClusterState().getCollection(collectionName).getActiveSlices().size();
   }
 
   private void indexDocs(String collectionName) throws Exception {
-    int numDocs = TestUtil.nextInt(random(), 10, 100);
+    Random random = new Random(docsSeed);// use a constant seed for the whole test run so that we can easily re-index.
+    int numDocs = random.nextInt(100);
+    if (numDocs == 0) {
+      log.info("Indexing ZERO test docs");
+      return;
+    }
     CloudSolrClient client = cluster.getSolrClient();
     client.setDefaultCollection(collectionName);
+    List<SolrInputDocument> docs = new ArrayList<>(numDocs);
     for (int i=0; i<numDocs; i++) {
-      //We index the shard_s fields for whichever router gets chosen but only use it when implicit router was selected
-      if (random().nextBoolean()) {
-        SolrInputDocument doc = new SolrInputDocument();
-        doc.addField("id", i);
-        doc.addField("shard_s", "shard1");
-        client.add(doc);
-      } else {
-        SolrInputDocument doc = new SolrInputDocument();
-        doc.addField("id", i);
-        doc.addField("shard_s", "shard2");
-        client.add(doc);
-      }
+      SolrInputDocument doc = new SolrInputDocument();
+      doc.addField("id", i);
+      doc.addField("shard_s", "shard" + (1 + random.nextInt(NUM_SHARDS))); // for implicit router
+      docs.add(doc);
     }
+    client.add(docs);// batch
     client.commit();
   }
 
-  private void testBackupAndRestore(String collectionName, String restoreCollectionName, boolean isImplicit) throws Exception {
+  private void testBackupAndRestore(String collectionName) throws Exception {
     String backupName = "mytestbackup";
+    String restoreCollectionName = collectionName + "_restored";
+
     CloudSolrClient client = cluster.getSolrClient();
-    long totalDocs = client.query(collectionName, new SolrQuery("*:*")).getResults().getNumFound();
-    long shard1Docs = 0, shard2Docs = 0;
-    if (isImplicit) {
-      shard1Docs = client.query(collectionName, new SolrQuery("*:*").setParam(_ROUTE_, "shard1")).getResults().getNumFound();
-      shard2Docs = client.query(collectionName, new SolrQuery("*:*").setParam(_ROUTE_, "shard2")).getResults().getNumFound();
-      assertTrue(totalDocs == shard1Docs + shard2Docs);
-    }
+    DocCollection backupCollection = client.getZkStateReader().getClusterState().getCollection(collectionName);
+
+    Map<String, Integer> origShardToDocCount = getShardToDocCountMap(client, backupCollection);
+    assert origShardToDocCount.isEmpty() == false;
 
     String location = createTempDir().toFile().getAbsolutePath();
 
@@ -110,36 +154,63 @@ public class TestCloudBackupRestore extends SolrCloudTestCase {
 
     log.info("Triggering Restore command");
 
+    //nocommit test with async
     CollectionAdminRequest.Restore restore = CollectionAdminRequest.restoreCollection(restoreCollectionName, backupName)
         .setLocation(location);
-    rsp = cluster.getSolrClient().request(restore);
+    if (origShardToDocCount.size() > cluster.getJettySolrRunners().size()) {
+      // may need to increase maxShardsPerNode (e.g. if it was shard split, then now we need more)
+      restore.getCreateOptions().setMaxShardsPerNode(origShardToDocCount.size());
+    }
+    Properties props = new Properties();
+    props.setProperty("customKey", "customVal");
+    restore.getCreateOptions().setProperties(props);
+    boolean sameConfig = random().nextBoolean();
+    if (sameConfig==false) {
+      restore.getCreateOptions().setConfigName("customConfigName");//nocommit ugh, this is deprecated
+    }
+    rsp = cluster.getSolrClient().request(restore); // DO IT!
     assertEquals(0, ((NamedList)rsp.get("responseHeader")).get("status"));
     waitForCollection(restoreCollectionName);
 
     //Check the number of results are the same
-    long restoredNumDocs = client.query(restoreCollectionName, new SolrQuery("*:*")).getResults().getNumFound();
-    assertEquals(totalDocs, restoredNumDocs);
-
-    if (isImplicit) {
-      long restoredShard1Docs = client.query(restoreCollectionName, new SolrQuery("*:*").setParam(_ROUTE_, "shard1")).getResults().getNumFound();
-      long restoredShard2Docs = client.query(restoreCollectionName, new SolrQuery("*:*").setParam(_ROUTE_, "shard2")).getResults().getNumFound();
-
-      assertEquals(shard2Docs, restoredShard2Docs);
-      assertEquals(shard1Docs, restoredShard1Docs);
-    }
-
-    DocCollection backupCollection = client.getZkStateReader().getClusterState().getCollection(collectionName);
     DocCollection restoreCollection = client.getZkStateReader().getClusterState().getCollection(restoreCollectionName);
+    assertEquals(origShardToDocCount, getShardToDocCountMap(client, restoreCollection));
+    //Re-index same docs (should be identical docs given same random seed) and test we have the same result.  Helps
+    //  test we reconstituted the hash ranges / doc router.
+    if (!(restoreCollection.getRouter() instanceof ImplicitDocRouter) && random().nextBoolean()) {
+      indexDocs(restoreCollectionName);
+      assertEquals(origShardToDocCount, getShardToDocCountMap(client, restoreCollection));
+    }
 
     assertEquals(backupCollection.getReplicationFactor(), restoreCollection.getReplicationFactor());
+    assertEquals(backupCollection.getAutoAddReplicas(), restoreCollection.getAutoAddReplicas());
+    assertEquals(backupCollection.getActiveSlices().iterator().next().getReplicas().size(),
+        restoreCollection.getActiveSlices().iterator().next().getReplicas().size());
+    assertEquals(sameConfig ? "conf1" : "customConfigName",
+        cluster.getSolrClient().getZkStateReader().readConfigName(restoreCollectionName));
+
+    // assert added core properties:
+    // nocommit how?
+  }
 
-    assertEquals("restore.conf1", cluster.getSolrClient().getZkStateReader().readConfigName(restoreCollectionName));
+  private Map<String, Integer> getShardToDocCountMap(CloudSolrClient client, DocCollection docCollection) throws SolrServerException, IOException {
+    Map<String,Integer> shardToDocCount = new TreeMap<>();
+    for (Slice slice : docCollection.getActiveSlices()) {
+      String shardName = slice.getName();
+      long docsInShard = client.query(docCollection.getName(), new SolrQuery("*:*").setParam(_ROUTE_, shardName))
+          .getResults().getNumFound();
+      shardToDocCount.put(shardName, (int) docsInShard);
+    }
+    return shardToDocCount;
   }
 
   public void waitForCollection(String collection) throws Exception {
-    AbstractFullDistribZkTestBase.waitForCollection(cluster.getSolrClient().getZkStateReader(), collection, NUM_SHARDS);
-    AbstractDistribZkTestBase.waitForRecoveriesToFinish(collection, cluster.getSolrClient().getZkStateReader(), log.isDebugEnabled(), true, 30);
-    AbstractDistribZkTestBase.assertAllActive(collection, cluster.getSolrClient().getZkStateReader());
+    // note: NUM_SHARDS may be too small because of shard split, but that's okay?
+    ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
+    AbstractFullDistribZkTestBase.waitForCollection(zkStateReader, collection, NUM_SHARDS);
+    AbstractDistribZkTestBase.waitForRecoveriesToFinish(collection, zkStateReader, log.isDebugEnabled(), true, 30);
+    AbstractDistribZkTestBase.assertAllActive(collection, zkStateReader);
+    assertFalse(zkStateReader.getClusterState().getCollection(collection).getActiveSlices().isEmpty());
   }
 
 }