Posted to commits@lucene.apache.org by ma...@apache.org on 2020/07/09 23:20:52 UTC

[lucene-solr] branch reference_impl updated (c0e621e -> afaf7c2)

This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a change to branch reference_impl
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git.


 discard c0e621e  checkpoint
 discard cf0a20c  fix a test, fix overseer close
 discard 1145b8c  more test tweaks
 discard 44d1e73  working on some slow test stuff
 discard 1f6a175  more test fixes, replace another executor.
 discard cef4a93  fix test
 discard 578e1b4  start using per thread executor for httpshardhandler, cleanup some shutdown, parallel metrics reporter load
 discard 64ff0b6  fix init race.
 discard 443ffc1  boost test ram temporarily
 discard 0cdfbd8  Switch over facets executor and make rrddbs threadsafe.
 discard c3f52f4  speed up test for non nightly
 discard 005aa64  A couple test fixes and speed up non SolrCloudTestCase Jetty clusters.
 discard d5f22c1  fix jetty stop for non solrcloudtest tests.
 discard 01b8e64  speed up tests
 discard acbd9f8  checkpoint
 discard 9c284fc  Update and fix a variety of issues.
 discard 0857dfe  Add missing woodstox dep to ant build.
 discard a60bf18  checkpoint
 discard e91224f  leader election fixes
 discard b577af7  checkpoint
 discard d89104d  #42 The initial base work to make core tests more reasonable.
 discard cd2ded5  #1 A few additions to address TestCloudConsistency fail with a few related cleanups and a couple other test fail fixes.
 discard 1e5d8e9  #1 Wait for collections to be fully created before returning and other small collections API improvements and fixes.
     new afaf7c2  #42 The minimum starting base work necessary for a fast and stable SolrCloud.

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (c0e621e)
            \
             N -- N -- N   refs/heads/reference_impl (afaf7c2)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.
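
A history like the one above typically results from rewriting the O
commits and force-pushing the branch; a minimal sketch of the commands
involved (names in angle brackets are placeholders for the ids above):

    git checkout reference_impl             # tip is c0e621e (the last O)
    git rebase -i <B>                       # rewrite the O commits into the N commits
    git push --force origin reference_impl  # tip is now afaf7c2 (the last N)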

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
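
The practical difference, sketched with ids from the list above: a
"discard"ed commit is unreachable from any reference and will
eventually be pruned by git's garbage collection, while the new tip
stays reachable through the branch:

    git cat-file -t cf0a20c   # works locally only until gc prunes the discarded commit
    git checkout afaf7c2      # always works; reachable via refs/heads/reference_impl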


Summary of changes:
 .../client/solrj/embedded/JettySolrRunner.java     | 10 +++---
 .../java/org/apache/solr/request/SimpleFacets.java | 36 +++++++++++++---------
 .../apache/solr/response/SolrQueryResponse.java    |  4 +++
 .../solr/schema/TestUseDocValuesAsStored.java      |  2 +-
 .../src/java/org/apache/solr/common/ParWork.java   |  6 ++--
 .../src/java/org/apache/solr/SolrTestCase.java     | 27 ++++++++++------
 .../src/java/org/apache/solr/SolrTestCaseJ4.java   |  4 +--
 7 files changed, 53 insertions(+), 36 deletions(-)


[lucene-solr] 01/01: #42 The minimum starting base work necessary for a fast and stable SolrCloud.

Posted by ma...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit afaf7c21b48778c7db15f1a1305df98377de818d
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Thu Jul 9 18:20:23 2020 -0500

    #42 The minimum starting base work necessary for a fast and stable SolrCloud.
---
 build.gradle                                       |   17 +
 gradle/testing/defaults-tests.gradle               |    7 +-
 gradle/testing/policies/solr-tests.policy          |    2 +
 lucene/ivy-versions.properties                     |    5 +-
 .../org/apache/lucene/analysis/MockTokenizer.java  |    5 +-
 .../util/TestRuleSetupAndRestoreClassEnv.java      |    4 +-
 .../collection1/conf/solrconfig-icucollate.xml     |    1 +
 .../conf/solrconfig.snippet.randomindexconfig.xml  |    2 +
 .../configsets/cloud-analytics/conf/solrconfig.xml |    5 +
 .../legacy/LegacyAbstractAnalyticsTest.java        |    3 +-
 .../facet/LegacyAbstractAnalyticsFacetTest.java    |    9 +-
 .../DistributedClusteringComponentTest.java        |    1 -
 .../collection1/conf/dataimport-solrconfig.xml     |    1 +
 .../handler/dataimport/SolrEntityProcessor.java    |    2 +
 .../collection1/conf/contentstream-solrconfig.xml  |    1 +
 .../collection1/conf/dataimport-solrconfig.xml     |    1 +
 .../solr/handler/dataimport/DestroyCountCache.java |    3 +-
 .../solr/handler/dataimport/TestErrorHandling.java |    2 +-
 .../solr/collection1/conf/solrconfig.xml           |    1 +
 .../solr/collection1/conf/solrconfig.xml           |    5 +
 .../conf/solrconfig-languageidentifier.xml         |    1 +
 .../solr/collection1/conf/solrconfig-ltr.xml       |    5 +
 .../collection1/conf/solrconfig-ltr_Th10_10.xml    |    5 +
 .../solr/collection1/conf/solrconfig-multiseg.xml  |    5 +
 .../prometheus/exporter/SolrClientFactory.java     |    2 +-
 .../velocity/solr/collection1/conf/solrconfig.xml  |    5 +
 solr/core/build.gradle                             |    3 +
 solr/core/ivy.xml                                  |    4 +
 .../solr/client/solrj/embedded/JettyConfig.java    |   30 +-
 .../client/solrj/embedded/JettySolrRunner.java     |  437 +++++--
 .../solrj/embedded/SolrQueuedThreadPool.java       |  108 ++
 .../apache/solr/cloud/CloudConfigSetService.java   |    3 +-
 .../java/org/apache/solr/cloud/DistributedMap.java |   11 -
 .../org/apache/solr/cloud/ElectionContext.java     |   52 +-
 .../java/org/apache/solr/cloud/LeaderElector.java  |  178 ++-
 .../src/java/org/apache/solr/cloud/Overseer.java   |  469 ++++---
 .../apache/solr/cloud/OverseerElectionContext.java |   93 +-
 .../apache/solr/cloud/OverseerMessageHandler.java  |    2 +-
 .../apache/solr/cloud/OverseerTaskProcessor.java   |  470 +++----
 .../org/apache/solr/cloud/OverseerTaskQueue.java   |    4 +-
 .../org/apache/solr/cloud/RecoveryStrategy.java    |  103 +-
 .../org/apache/solr/cloud/ReplicateFromLeader.java |   11 +-
 .../solr/cloud/ShardLeaderElectionContext.java     |  356 +++---
 .../solr/cloud/ShardLeaderElectionContextBase.java |  206 ++--
 .../java/org/apache/solr/cloud/SolrZkServer.java   |   10 +-
 .../java/org/apache/solr/cloud/SyncStrategy.java   |   62 +-
 .../core/src/java/org/apache/solr/cloud/ZkCLI.java |    8 +-
 .../java/org/apache/solr/cloud/ZkController.java   | 1303 +++++++++++---------
 .../org/apache/solr/cloud/ZkDistributedQueue.java  |   18 +-
 .../java/org/apache/solr/cloud/ZkShardTerms.java   |    5 +
 .../apache/solr/cloud/ZkSolrResourceLoader.java    |    2 +-
 .../solr/cloud/api/collections/AddReplicaCmd.java  |   44 +-
 .../solr/cloud/api/collections/AliasCmd.java       |   15 +-
 .../apache/solr/cloud/api/collections/Assign.java  |    5 +-
 .../cloud/api/collections/CreateCollectionCmd.java |  477 ++++---
 .../solr/cloud/api/collections/CreateShardCmd.java |    7 +-
 .../cloud/api/collections/DeleteCollectionCmd.java |   23 +-
 .../cloud/api/collections/DeleteReplicaCmd.java    |   60 +-
 .../solr/cloud/api/collections/DeleteShardCmd.java |    3 +
 .../api/collections/MaintainRoutedAliasCmd.java    |   23 +-
 .../solr/cloud/api/collections/MigrateCmd.java     |    8 +-
 .../OverseerCollectionMessageHandler.java          |  223 ++--
 .../api/collections/ReindexCollectionCmd.java      |    3 +-
 .../solr/cloud/api/collections/RestoreCmd.java     |    4 +-
 .../solr/cloud/api/collections/SplitShardCmd.java  |    7 +-
 .../apache/solr/cloud/autoscaling/AutoScaling.java |    3 -
 .../solr/cloud/autoscaling/AutoScalingHandler.java |    1 +
 .../solr/cloud/autoscaling/ComputePlanAction.java  |    6 +-
 .../solr/cloud/autoscaling/ExecutePlanAction.java  |    8 +-
 .../cloud/autoscaling/HttpTriggerListener.java     |    7 +-
 .../autoscaling/InactiveMarkersPlanAction.java     |    6 +-
 .../cloud/autoscaling/InactiveShardPlanAction.java |    9 +-
 .../solr/cloud/autoscaling/IndexSizeTrigger.java   |   13 +-
 .../solr/cloud/autoscaling/MetricTrigger.java      |   16 +-
 .../solr/cloud/autoscaling/NodeAddedTrigger.java   |    9 +-
 .../solr/cloud/autoscaling/NodeLostTrigger.java    |   12 +-
 .../cloud/autoscaling/OverseerTriggerThread.java   |   94 +-
 .../solr/cloud/autoscaling/ScheduledTrigger.java   |   12 +-
 .../solr/cloud/autoscaling/ScheduledTriggers.java  |   35 +-
 .../solr/cloud/autoscaling/SearchRateTrigger.java  |   34 +-
 .../solr/cloud/autoscaling/TriggerActionBase.java  |   16 +-
 .../apache/solr/cloud/autoscaling/TriggerBase.java |   32 +-
 .../solr/cloud/autoscaling/TriggerEvent.java       |    6 +-
 .../cloud/autoscaling/TriggerListenerBase.java     |    7 +-
 .../autoscaling/TriggerValidationException.java    |    3 +-
 .../cloud/autoscaling/sim/SimCloudManager.java     |   18 +-
 .../autoscaling/sim/SimClusterStateProvider.java   |    8 +-
 .../solr/cloud/overseer/ClusterStateMutator.java   |   42 +-
 .../apache/solr/cloud/overseer/ReplicaMutator.java |    3 +-
 .../apache/solr/cloud/overseer/SliceMutator.java   |  128 +-
 .../apache/solr/cloud/overseer/ZkStateWriter.java  |  333 +++--
 .../apache/solr/core/CachingDirectoryFactory.java  |  321 ++++-
 .../src/java/org/apache/solr/core/CloudConfig.java |    2 +-
 .../java/org/apache/solr/core/CoreContainer.java   |  822 ++++++------
 .../solr/core/EphemeralDirectoryFactory.java       |    4 +-
 .../org/apache/solr/core/HdfsDirectoryFactory.java |   34 +-
 .../src/java/org/apache/solr/core/NodeConfig.java  |    6 +-
 .../src/java/org/apache/solr/core/PluginBag.java   |   66 +-
 .../java/org/apache/solr/core/RequestHandlers.java |    3 +-
 .../src/java/org/apache/solr/core/SolrCore.java    |  606 +++++----
 .../src/java/org/apache/solr/core/SolrCores.java   |  435 ++++---
 .../org/apache/solr/core/SolrResourceLoader.java   |  179 ++-
 .../java/org/apache/solr/core/SolrXmlConfig.java   |    3 +-
 .../solr/core/TransientSolrCoreCacheDefault.java   |   33 +-
 .../java/org/apache/solr/core/XmlConfigFile.java   |   60 +-
 .../src/java/org/apache/solr/core/ZkContainer.java |  116 +-
 .../apache/solr/filestore/DistribPackageStore.java |    3 +-
 .../apache/solr/handler/CdcrReplicatorManager.java |    6 +-
 .../apache/solr/handler/CdcrRequestHandler.java    |    3 +-
 .../solr/handler/CdcrUpdateLogSynchronizer.java    |    1 +
 .../java/org/apache/solr/handler/IndexFetcher.java |   28 +-
 .../apache/solr/handler/ReplicationHandler.java    |   39 +-
 .../apache/solr/handler/RequestHandlerBase.java    |    9 +-
 .../org/apache/solr/handler/SolrConfigHandler.java |   13 +-
 .../solr/handler/admin/AdminHandlersProxy.java     |    2 +-
 .../solr/handler/admin/CollectionsHandler.java     |  149 +--
 .../solr/handler/admin/CoreAdminHandler.java       |   17 +-
 .../apache/solr/handler/admin/MetricsHandler.java  |    7 +-
 .../solr/handler/admin/MetricsHistoryHandler.java  |   42 +-
 .../apache/solr/handler/admin/PrepRecoveryOp.java  |   40 +-
 .../solr/handler/admin/SystemInfoHandler.java      |   21 +-
 .../handler/component/HttpShardHandlerFactory.java |   85 +-
 .../handler/component/IterativeMergeStrategy.java  |    1 +
 .../handler/component/QueryElevationComponent.java |    2 +-
 .../handler/component/RealTimeGetComponent.java    |    4 +
 .../handler/component/ShardHandlerFactory.java     |    3 +-
 .../solr/handler/component/ShardRequestor.java     |   15 +-
 .../solr/handler/component/SuggestComponent.java   |   29 +-
 .../org/apache/solr/handler/loader/XMLLoader.java  |   31 +-
 .../java/org/apache/solr/metrics/MetricsMap.java   |    4 +-
 .../apache/solr/metrics/SolrCoreMetricManager.java |   13 +-
 .../org/apache/solr/metrics/SolrMetricManager.java |   75 +-
 .../apache/solr/metrics/SolrMetricsContext.java    |    2 +-
 .../reporters/jmx/JmxObjectNameFactory.java        |    2 +-
 .../solr/metrics/rrd/SolrRrdBackendFactory.java    |    2 +
 .../apache/solr/packagemanager/PackageManager.java |   24 +-
 .../java/org/apache/solr/request/SimpleFacets.java |   76 +-
 .../apache/solr/request/SolrRequestHandler.java    |    2 +
 .../apache/solr/response/SolrQueryResponse.java    |    4 +
 .../org/apache/solr/schema/AbstractEnumField.java  |    5 +-
 .../apache/solr/schema/FieldTypePluginLoader.java  |    3 +-
 .../solr/schema/FileExchangeRateProvider.java      |    4 +-
 .../org/apache/solr/schema/ManagedIndexSchema.java |    2 +-
 .../org/apache/solr/search/SolrIndexSearcher.java  |   55 +-
 .../java/org/apache/solr/servlet/HttpSolrCall.java |  254 +++-
 .../org/apache/solr/servlet/ResponseUtils.java     |    3 +-
 .../apache/solr/servlet/SolrDispatchFilter.java    |   51 +-
 .../org/apache/solr/servlet/SolrQoSFilter.java     |   85 ++
 .../apache/solr/servlet/SolrRequestParsers.java    |   10 +-
 .../solr/servlet/cache/HttpCacheHeaderUtil.java    |    6 +-
 .../solr/spelling/AbstractLuceneSpellChecker.java  |   16 +-
 .../solr/spelling/suggest/SolrSuggester.java       |   12 +
 .../java/org/apache/solr/update/CdcrUpdateLog.java |    4 +-
 .../java/org/apache/solr/update/CommitTracker.java |    8 +-
 .../apache/solr/update/DefaultSolrCoreState.java   |  170 ++-
 .../apache/solr/update/DirectUpdateHandler2.java   |   26 +-
 .../org/apache/solr/update/PeerSyncWithLeader.java |    2 +-
 .../java/org/apache/solr/update/SolrCoreState.java |    4 +-
 .../org/apache/solr/update/SolrIndexSplitter.java  |    5 +-
 .../org/apache/solr/update/SolrIndexWriter.java    |  367 +++---
 .../apache/solr/update/StreamingSolrClients.java   |    3 +
 .../org/apache/solr/update/TransactionLog.java     |   14 +-
 .../java/org/apache/solr/update/UpdateHandler.java |   75 +-
 .../src/java/org/apache/solr/update/UpdateLog.java |  179 +--
 .../org/apache/solr/update/UpdateShardHandler.java |   35 +-
 .../processor/DistributedUpdateProcessor.java      |    4 +-
 .../processor/DistributedZkUpdateProcessor.java    |  143 ++-
 .../src/java/org/apache/solr/util/ExportTool.java  |  105 +-
 .../src/java/org/apache/solr/util/PackageTool.java |    6 +-
 .../java/org/apache/solr/util/SimplePostTool.java  |    5 +-
 .../src/java/org/apache/solr/util/SolrCLI.java     |   10 +-
 .../java/org/apache/solr/util/SolrLogPostTool.java |    2 +-
 .../java/org/apache/solr/util/TestInjection.java   |    5 +-
 .../src/java/org/apache/solr/util/TimeOut.java     |    8 +-
 .../configuration/SSLConfigurationsFactory.java    |    2 +-
 .../src/resources/SystemCollectionSolrConfig.xml   |    3 +
 .../solr/collection1/conf/bad-mpf-solrconfig.xml   |    1 +
 .../conf/bad-solrconfig-multiple-cfs.xml           |    1 +
 .../conf/bad-solrconfig-multiple-indexconfigs.xml  |    1 +
 .../collection1/conf/bad-solrconfig-nrtmode.xml    |    1 +
 .../solr/collection1/conf/bad_solrconfig.xml       |    1 +
 ...g-add-schema-fields-update-processor-chains.xml |    6 +
 .../conf/solrconfig-concurrentmergescheduler.xml   |    1 +
 .../conf/solrconfig-doctransformers.xml            |    1 +
 .../solr/collection1/conf/solrconfig-hash.xml      |    1 +
 .../solrconfig-indexconfig-mergepolicyfactory.xml  |    1 +
 .../collection1/conf/solrconfig-indexmetrics.xml   |    1 +
 .../conf/solrconfig-infostream-logging.xml         |    1 +
 .../conf/solrconfig-logmergepolicyfactory.xml      |    1 +
 .../collection1/conf/solrconfig-managed-schema.xml |    2 +-
 .../conf/solrconfig-mergepolicy-defaults.xml       |    1 +
 .../conf/solrconfig-mergepolicy-legacy.xml         |    1 +
 .../conf/solrconfig-mergepolicyfactory-nocfs.xml   |    1 +
 .../conf/solrconfig-nomergepolicyfactory.xml       |    1 +
 .../solrconfig-parsing-update-processor-chains.xml |    4 +
 .../solr/collection1/conf/solrconfig-sql.xml       |    1 +
 .../solr/collection1/conf/solrconfig-tagger.xml    |    4 +
 .../conf/solrconfig-tieredmergepolicyfactory.xml   |    1 +
 ...rconfig-uninvertdocvaluesmergepolicyfactory.xml |    2 +-
 .../solr/collection1/conf/solrconfig.xml           |   12 +-
 .../solr/configsets/backcompat/conf/solrconfig.xml |    5 +
 .../configsets/bad-mergepolicy/conf/solrconfig.xml |    2 +
 .../configsets/cdcr-cluster1/conf/solrconfig.xml   |    6 +
 .../configsets/cdcr-cluster2/conf/solrconfig.xml   |    6 +
 .../cdcr-source-disabled/conf/solrconfig.xml       |    6 +
 .../configsets/cdcr-source/conf/solrconfig.xml     |    6 +
 .../configsets/cdcr-target/conf/solrconfig.xml     |    5 +
 .../configsets/cloud-dynamic/conf/solrconfig.xml   |    5 +
 .../cloud-managed-preanalyzed/conf/solrconfig.xml  |    5 +
 .../configsets/cloud-managed/conf/solrconfig.xml   |    6 +
 .../conf/solrconfig.xml                            |    5 +
 .../configsets/cloud-minimal/conf/solrconfig.xml   |    4 +-
 .../configsets/configset-2/conf/solrconfig.xml     |    5 +
 .../exitable-directory/conf/solrconfig.xml         |    5 +
 .../solr/configsets/minimal/conf/solrconfig.xml    |    6 +
 .../configsets/resource-sharing/solrconfig.xml     |    4 +
 .../solr/configsets/sql/conf/solrconfig.xml        |    4 +
 .../solr/configsets/upload/regular/solrconfig.xml  |    5 +
 .../upload/with-script-processor/solrconfig.xml    |    5 +
 .../src/test/org/apache/solr/CursorPagingTest.java |    2 +-
 .../solr/DistributedIntervalFacetingTest.java      |   13 +-
 .../apache/solr/HelloWorldSolrCloudTestCase.java   |    2 +
 .../org/apache/solr/TestDistributedGrouping.java   |  172 ++-
 .../apache/solr/TestDistributedMissingSort.java    |    1 -
 .../org/apache/solr/TestDistributedSearch.java     |   25 +-
 .../apache/solr/TestHighlightDedupGrouping.java    |    8 +-
 .../test/org/apache/solr/TestRandomDVFaceting.java |   12 +-
 .../test/org/apache/solr/TestRandomFaceting.java   |   10 +-
 .../solr/backcompat/TestLuceneIndexBackCompat.java |   10 +
 .../client/solrj/embedded/TestJettySolrRunner.java |   68 +-
 .../client/solrj/impl/ConnectionReuseTest.java     |    5 +-
 .../org/apache/solr/cloud/ActionThrottleTest.java  |   12 +-
 .../test/org/apache/solr/cloud/AddReplicaTest.java |   28 +-
 .../apache/solr/cloud/AliasIntegrationTest.java    |   39 +-
 .../cloud/AssignBackwardCompatibilityTest.java     |    2 +
 .../apache/solr/cloud/BasicDistributedZk2Test.java |   15 +-
 .../apache/solr/cloud/BasicDistributedZkTest.java  |  148 ++-
 .../test/org/apache/solr/cloud/BasicZkTest.java    |    2 +
 .../solr/cloud/ChaosMonkeyNothingIsSafeTest.java   |   31 +-
 ...aosMonkeyNothingIsSafeWithPullReplicasTest.java |   17 +-
 .../solr/cloud/ChaosMonkeySafeLeaderTest.java      |   35 +-
 .../ChaosMonkeySafeLeaderWithPullReplicasTest.java |   20 +-
 .../solr/cloud/ChaosMonkeyShardSplitTest.java      |   56 +-
 .../org/apache/solr/cloud/CleanupOldIndexTest.java |   16 +-
 .../cloud/CloudExitableDirectoryReaderTest.java    |   21 +-
 .../solr/cloud/CollectionStateFormat2Test.java     |    6 +-
 .../apache/solr/cloud/CollectionsAPISolrJTest.java |   75 +-
 .../org/apache/solr/cloud/ConfigSetsAPITest.java   |    3 +-
 .../apache/solr/cloud/ConnectionManagerTest.java   |    2 +-
 .../apache/solr/cloud/CreateRoutedAliasTest.java   |   25 +-
 .../solr/cloud/DeleteInactiveReplicaTest.java      |   12 +-
 .../cloud/DeleteLastCustomShardedReplicaTest.java  |    2 +
 .../test/org/apache/solr/cloud/DeleteNodeTest.java |   16 +-
 .../org/apache/solr/cloud/DeleteReplicaTest.java   |   53 +-
 .../org/apache/solr/cloud/DeleteShardTest.java     |    3 +
 .../apache/solr/cloud/DistribCursorPagingTest.java |   56 +-
 .../DistribDocExpirationUpdateProcessorTest.java   |    2 +
 .../solr/cloud/DistribJoinFromCollectionTest.java  |   13 +-
 .../apache/solr/cloud/DistributedQueueTest.java    |   21 +-
 .../solr/cloud/DistributedVersionInfoTest.java     |    4 +-
 .../apache/solr/cloud/DocValuesNotIndexedTest.java |    1 +
 .../org/apache/solr/cloud/ForceLeaderTest.java     |   17 +-
 .../cloud/ForceLeaderWithTlogReplicasTest.java     |    3 +
 .../solr/cloud/FullSolrCloudDistribCmdsTest.java   |   46 +-
 .../solr/cloud/HttpPartitionOnCommitTest.java      |   26 +-
 .../org/apache/solr/cloud/HttpPartitionTest.java   |  123 +-
 .../cloud/HttpPartitionWithTlogReplicasTest.java   |    6 +-
 .../solr/cloud/LeaderElectionContextKeyTest.java   |    2 +
 .../solr/cloud/LeaderElectionIntegrationTest.java  |   31 +-
 .../org/apache/solr/cloud/LeaderElectionTest.java  |  109 +-
 .../cloud/LeaderFailoverAfterPartitionTest.java    |   44 +-
 .../cloud/LeaderFailureAfterFreshStartTest.java    |   50 +-
 .../solr/cloud/LeaderVoteWaitTimeoutTest.java      |   15 +-
 .../solr/cloud/LegacyCloudClusterPropTest.java     |    4 +-
 .../solr/cloud/MetricsHistoryIntegrationTest.java  |    3 +
 .../MetricsHistoryWithAuthIntegrationTest.java     |   39 +-
 .../org/apache/solr/cloud/MigrateRouteKeyTest.java |    2 +
 .../solr/cloud/MissingSegmentRecoveryTest.java     |    1 -
 .../solr/cloud/MoveReplicaHDFSFailoverTest.java    |   20 +-
 .../org/apache/solr/cloud/MoveReplicaHDFSTest.java |    6 +-
 .../org/apache/solr/cloud/MoveReplicaTest.java     |   48 +-
 .../apache/solr/cloud/MultiThreadedOCPTest.java    |   17 +-
 .../solr/cloud/NestedShardedAtomicUpdateTest.java  |    6 +
 ...OverriddenZkACLAndCredentialsProvidersTest.java |    2 +
 .../OverseerCollectionConfigSetProcessorTest.java  |   27 +-
 .../org/apache/solr/cloud/OverseerRolesTest.java   |    2 +
 .../org/apache/solr/cloud/OverseerStatusTest.java  |   16 +-
 .../apache/solr/cloud/OverseerTaskQueueTest.java   |    2 +-
 .../test/org/apache/solr/cloud/OverseerTest.java   |   36 +-
 .../apache/solr/cloud/PeerSyncReplicationTest.java |  239 ++--
 .../solr/cloud/RecoveryAfterSoftCommitTest.java    |   52 +-
 .../test/org/apache/solr/cloud/RecoveryZkTest.java |   10 +-
 .../apache/solr/cloud/ReindexCollectionTest.java   |   11 +-
 .../apache/solr/cloud/RemoteQueryErrorTest.java    |    1 +
 .../org/apache/solr/cloud/ReplaceNodeTest.java     |   12 +-
 .../apache/solr/cloud/ReplicationFactorTest.java   |   55 +-
 .../solr/cloud/RestartWhileUpdatingTest.java       |    8 -
 .../org/apache/solr/cloud/RollingRestartTest.java  |    4 +-
 .../apache/solr/cloud/SaslZkACLProviderTest.java   |    5 +
 .../apache/solr/cloud/ShardRoutingCustomTest.java  |    2 +
 .../org/apache/solr/cloud/ShardRoutingTest.java    |    2 +
 .../cloud/SharedFSAutoReplicaFailoverTest.java     |    6 +-
 .../org/apache/solr/cloud/SolrCLIZkUtilsTest.java  |    3 +
 .../apache/solr/cloud/SolrCloudBridgeTestCase.java |  674 ++++++++++
 .../apache/solr/cloud/SolrCloudExampleTest.java    |   14 +-
 .../test/org/apache/solr/cloud/SplitShardTest.java |    2 +
 .../test/org/apache/solr/cloud/SyncSliceTest.java  |  136 +-
 .../solr/cloud/SystemCollectionCompatTest.java     |    5 +
 .../solr/cloud/TestAuthenticationFramework.java    |   10 +-
 .../apache/solr/cloud/TestCloudConsistency.java    |   52 +-
 .../org/apache/solr/cloud/TestCloudPivotFacet.java |    2 -
 .../org/apache/solr/cloud/TestCloudRecovery.java   |   52 +-
 .../org/apache/solr/cloud/TestCloudRecovery2.java  |   10 +-
 .../solr/cloud/TestCloudSearcherWarming.java       |    2 +
 .../org/apache/solr/cloud/TestConfigSetsAPI.java   |    1 +
 .../solr/cloud/TestConfigSetsAPIExclusivity.java   |    2 +
 .../test/org/apache/solr/cloud/TestCryptoKeys.java |    9 +
 .../solr/cloud/TestDistribDocBasedVersion.java     |   35 +-
 .../solr/cloud/TestDownShardTolerantSearch.java    |    4 +
 .../cloud/TestDynamicFieldNamesIndexCorrectly.java |    5 +-
 .../org/apache/solr/cloud/TestHashPartitioner.java |    2 +-
 .../cloud/TestLeaderElectionWithEmptyReplica.java  |   12 +-
 .../solr/cloud/TestLeaderElectionZkExpiry.java     |   16 +-
 .../solr/cloud/TestMiniSolrCloudClusterSSL.java    |    2 +
 .../solr/cloud/TestOnReconnectListenerSupport.java |   11 +-
 .../org/apache/solr/cloud/TestPrepRecovery.java    |   15 +-
 .../solr/cloud/TestPullReplicaErrorHandling.java   |   10 +-
 .../solr/cloud/TestQueryingOnDownCollection.java   |    6 +-
 .../apache/solr/cloud/TestRandomFlRTGCloud.java    |    4 +-
 .../solr/cloud/TestRandomRequestDistribution.java  |    8 +-
 .../apache/solr/cloud/TestRebalanceLeaders.java    |    2 +
 .../apache/solr/cloud/TestRequestForwarding.java   |    8 +-
 .../apache/solr/cloud/TestSSLRandomization.java    |    2 +
 .../solr/cloud/TestShortCircuitedRequests.java     |    1 -
 .../solr/cloud/TestSkipOverseerOperations.java     |    8 +-
 .../cloud/TestSolrCloudWithDelegationTokens.java   |    3 +
 .../solr/cloud/TestSolrCloudWithKerberosAlt.java   |    9 +-
 .../TestSolrCloudWithSecureImpersonation.java      |    2 +
 .../solr/cloud/TestStressInPlaceUpdates.java       |    7 +-
 .../org/apache/solr/cloud/TestStressLiveNodes.java |   22 +-
 .../org/apache/solr/cloud/TestTlogReplica.java     |   17 +-
 .../cloud/TestTolerantUpdateProcessorCloud.java    |    2 +
 .../org/apache/solr/cloud/TestUtilizeNode.java     |    2 +
 .../cloud/TestWaitForStateWithJettyShutdowns.java  |    8 +-
 .../org/apache/solr/cloud/TestWithCollection.java  |   20 +-
 .../test/org/apache/solr/cloud/TestZkChroot.java   |    2 +
 .../cloud/TlogReplayBufferedWhileIndexingTest.java |    9 +-
 .../solr/cloud/TrollingIndexReaderFactory.java     |    8 +-
 .../apache/solr/cloud/UnloadDistributedZkTest.java |  191 +--
 .../VMParamsZkACLAndCredentialsProvidersTest.java  |    2 +
 .../src/test/org/apache/solr/cloud/ZkCLITest.java  |    9 +-
 .../org/apache/solr/cloud/ZkControllerTest.java    |   22 +-
 .../test/org/apache/solr/cloud/ZkFailoverTest.java |    2 +
 .../org/apache/solr/cloud/ZkShardTermsTest.java    |   13 +-
 .../org/apache/solr/cloud/ZkSolrClientTest.java    |   31 +-
 .../AbstractCloudBackupRestoreTestCase.java        |   10 +-
 .../solr/cloud/api/collections/AssignTest.java     |    3 +-
 .../api/collections/CollectionReloadTest.java      |    5 +-
 .../collections/CollectionTooManyReplicasTest.java |    2 +
 .../CollectionsAPIAsyncDistributedZkTest.java      |    2 +
 .../CollectionsAPIDistributedZkTest.java           |   41 +-
 .../ConcurrentCreateCollectionTest.java            |    3 +-
 .../api/collections/CustomCollectionTest.java      |    2 +
 .../HdfsCollectionsAPIDistributedZkTest.java       |    6 +-
 .../solr/cloud/api/collections/ShardSplitTest.java |  616 +++++----
 .../SimpleCollectionCreateDeleteTest.java          |    2 +
 .../cloud/api/collections/SplitByPrefixTest.java   |    2 +
 .../cloud/api/collections/TestCollectionAPI.java   |   38 +-
 .../TestCollectionsAPIViaSolrCloudCluster.java     |    8 +-
 .../collections/TestHdfsCloudBackupRestore.java    |    8 +-
 .../collections/TestLocalFSCloudBackupRestore.java |    2 +
 .../api/collections/TestReplicaProperties.java     |    4 +-
 .../TestRequestStatusCollectionAPI.java            |   38 +-
 .../AutoAddReplicasIntegrationTest.java            |    2 +
 .../autoscaling/AutoAddReplicasPlanActionTest.java |   27 +-
 .../cloud/autoscaling/ComputePlanActionTest.java   |   13 +-
 .../cloud/autoscaling/ExecutePlanActionTest.java   |   22 +-
 .../HdfsAutoAddReplicasIntegrationTest.java        |    6 +-
 .../cloud/autoscaling/HttpTriggerListenerTest.java |    2 +
 .../IndexSizeTriggerMixedBoundsTest.java           |    6 +-
 .../IndexSizeTriggerSizeEstimationTest.java        |    5 +-
 .../cloud/autoscaling/IndexSizeTriggerTest.java    |    6 +-
 .../autoscaling/MetricTriggerIntegrationTest.java  |    9 +-
 .../solr/cloud/autoscaling/MetricTriggerTest.java  |    2 +
 .../NodeAddedTriggerIntegrationTest.java           |    3 +
 .../cloud/autoscaling/NodeAddedTriggerTest.java    |    2 +
 .../NodeLostTriggerIntegrationTest.java            |    3 +
 .../cloud/autoscaling/NodeLostTriggerTest.java     |    2 +
 .../autoscaling/NodeMarkersRegistrationTest.java   |   16 +-
 .../cloud/autoscaling/RestoreTriggerStateTest.java |    6 +-
 .../ScheduledMaintenanceTriggerTest.java           |    2 +
 .../ScheduledTriggerIntegrationTest.java           |    2 +
 .../cloud/autoscaling/SearchRateTriggerTest.java   |    6 +-
 .../cloud/autoscaling/SystemLogListenerTest.java   |    2 +
 .../solr/cloud/autoscaling/TestPolicyCloud.java    |    2 +
 .../TriggerCooldownIntegrationTest.java            |    5 +-
 .../cloud/autoscaling/TriggerEventQueueTest.java   |    2 +
 .../cloud/autoscaling/TriggerIntegrationTest.java  |   62 +-
 .../TriggerSetPropertiesIntegrationTest.java       |    3 +
 .../autoscaling/sim/SimSolrCloudTestCase.java      |    2 +-
 .../sim/TestSimClusterStateProvider.java           |   10 +-
 .../autoscaling/sim/TestSimComputePlanAction.java  |    2 +-
 .../autoscaling/sim/TestSimDistributedQueue.java   |    4 +-
 .../cloud/autoscaling/sim/TestSimLargeCluster.java |    2 +
 .../cloud/autoscaling/sim/TestSimPolicyCloud.java  |    2 +
 .../cloud/autoscaling/sim/TestSimScenario.java     |    2 +
 .../autoscaling/sim/TestSimTriggerIntegration.java |   29 +-
 .../autoscaling/sim/TestSnapshotCloudManager.java  |    3 +
 .../solr/cloud/cdcr/BaseCdcrDistributedZkTest.java |   14 +-
 .../apache/solr/cloud/cdcr/CdcrBootstrapTest.java  |    2 +
 .../cloud/cdcr/CdcrVersionReplicationTest.java     |    9 +-
 .../solr/cloud/hdfs/HDFSCollectionsAPITest.java    |   26 +-
 .../cloud/hdfs/HdfsBasicDistributedZk2Test.java    |    6 +-
 .../cloud/hdfs/HdfsBasicDistributedZkTest.java     |    6 +-
 .../hdfs/HdfsChaosMonkeyNothingIsSafeTest.java     |    6 +-
 .../cloud/hdfs/HdfsChaosMonkeySafeLeaderTest.java  |    6 +-
 .../apache/solr/cloud/hdfs/HdfsNNFailoverTest.java |    6 +-
 .../solr/cloud/hdfs/HdfsRecoverLeaseTest.java      |    9 +-
 .../apache/solr/cloud/hdfs/HdfsRecoveryZkTest.java |    6 +-
 .../cloud/hdfs/HdfsRestartWhileUpdatingTest.java   |    6 +-
 .../apache/solr/cloud/hdfs/HdfsSyncSliceTest.java  |   11 +-
 .../org/apache/solr/cloud/hdfs/HdfsTestUtil.java   |   29 +-
 .../apache/solr/cloud/hdfs/HdfsThreadLeakTest.java |    6 +-
 .../HdfsTlogReplayBufferedWhileIndexingTest.java   |    6 +-
 .../cloud/hdfs/HdfsUnloadDistributedZkTest.java    |   11 +-
 .../hdfs/HdfsWriteToMultipleCollectionsTest.java   |    8 +-
 .../org/apache/solr/cloud/hdfs/StressHdfsTest.java |   10 +-
 .../overseer/ZkCollectionPropsCachingTest.java     |    8 +-
 .../solr/cloud/overseer/ZkStateReaderTest.java     |   22 +-
 .../solr/cloud/overseer/ZkStateWriterTest.java     |   22 +-
 .../test/org/apache/solr/cloud/rule/RulesTest.java |    3 +
 .../apache/solr/core/BlobRepositoryCloudTest.java  |    2 +
 .../solr/core/CachingDirectoryFactoryTest.java     |   33 +-
 .../org/apache/solr/core/DirectoryFactoryTest.java |    3 +
 .../apache/solr/core/HdfsDirectoryFactoryTest.java |    6 +-
 .../org/apache/solr/core/ResourceLoaderTest.java   |    4 +-
 .../test/org/apache/solr/core/SolrCoreTest.java    |    1 -
 .../test/org/apache/solr/core/TestBadConfig.java   |    2 +
 .../org/apache/solr/core/TestCodecSupport.java     |    2 +
 .../test/org/apache/solr/core/TestConfigSets.java  |    6 +
 .../org/apache/solr/core/TestCoreContainer.java    |    3 +-
 .../org/apache/solr/core/TestCoreDiscovery.java    |    3 +
 .../org/apache/solr/core/TestCustomStream.java     |    2 +
 .../org/apache/solr/core/TestDynamicLoading.java   |    2 +
 .../apache/solr/core/TestDynamicLoadingUrl.java    |    2 +
 .../test/org/apache/solr/core/TestDynamicURP.java  |    3 +
 .../solr/core/TestImplicitCoreProperties.java      |    2 +-
 .../org/apache/solr/core/TestJmxIntegration.java   |    5 +-
 .../test/org/apache/solr/core/TestLazyCores.java   |   18 +-
 .../apache/solr/core/TestSolrConfigHandler.java    |   13 +-
 .../repository/HdfsBackupRepositoryTest.java       |    2 +
 .../core/snapshots/TestSolrCloudSnapshots.java     |    2 +
 .../solr/filestore/TestDistribPackageStore.java    |    8 +-
 .../handler/BinaryUpdateRequestHandlerTest.java    |    2 +-
 .../org/apache/solr/handler/TestBlobHandler.java   |    2 +
 .../org/apache/solr/handler/TestConfigReload.java  |    9 +-
 .../solr/handler/TestHdfsBackupRestoreCore.java    |    6 +-
 .../solr/handler/TestReplicationHandler.java       |    4 +-
 .../solr/handler/TestReplicationHandlerBackup.java |    4 +
 .../TestReplicationHandlerDiskOverFlow.java        |    2 +
 .../org/apache/solr/handler/TestRestoreCore.java   |    4 +-
 .../solr/handler/TestSolrConfigHandlerCloud.java   |    2 +
 .../solr/handler/TestSystemCollAutoCreate.java     |   16 +-
 .../admin/AutoscalingHistoryHandlerTest.java       |    3 +
 .../solr/handler/admin/CoreAdminHandlerTest.java   |    4 +
 .../solr/handler/admin/DaemonStreamApiTest.java    |    2 +
 .../solr/handler/admin/HealthCheckHandlerTest.java |    3 +-
 .../apache/solr/handler/admin/InfoHandlerTest.java |    3 +
 .../solr/handler/admin/MBeansHandlerTest.java      |    8 +-
 .../solr/handler/admin/MetricsHandlerTest.java     |    4 +-
 .../handler/admin/MetricsHistoryHandlerTest.java   |   18 +-
 .../solr/handler/admin/ZookeeperReadAPITest.java   |    5 +-
 .../handler/admin/ZookeeperStatusHandlerTest.java  |   13 +-
 .../solr/handler/component/BadComponentTest.java   |    2 +
 .../component/DistributedExpandComponentTest.java  |    2 -
 .../component/DistributedFacetExistsSmallTest.java |    2 -
 .../component/DistributedFacetPivotLargeTest.java  |    5 -
 .../DistributedFacetPivotSmallAdvancedTest.java    |    1 -
 .../component/DistributedFacetPivotSmallTest.java  |    2 -
 .../DistributedFacetPivotWhiteBoxTest.java         |    2 -
 .../component/DistributedMLTComponentTest.java     |    1 -
 .../DistributedQueryComponentCustomSortTest.java   |    2 -
 .../DistributedQueryComponentOptimizationTest.java |    4 +-
 .../DistributedQueryElevationComponentTest.java    |    2 -
 .../DistributedSpellCheckComponentTest.java        |   13 +-
 .../component/DistributedSuggestComponentTest.java |    1 -
 .../component/DistributedTermsComponentTest.java   |    1 -
 .../solr/handler/component/SearchHandlerTest.java  |    8 +-
 .../handler/component/ShardsWhitelistTest.java     |    2 +-
 .../handler/component/SpellCheckComponentTest.java |    1 +
 .../solr/handler/component/StatsComponentTest.java |    4 +-
 .../handler/component/SuggestComponentTest.java    |    1 +
 .../TestDistributedStatsComponentCardinality.java  |   14 +-
 .../component/TestTrackingShardHandlerFactory.java |    9 +-
 .../solr/handler/export/TestExportWriter.java      |   22 +-
 .../apache/solr/index/hdfs/CheckHdfsIndexTest.java |   23 +-
 .../apache/solr/metrics/SolrMetricManagerTest.java |    6 +
 .../solr/metrics/SolrMetricsIntegrationTest.java   |   11 +-
 .../solr/metrics/reporters/MockMetricReporter.java |    6 +-
 .../reporters/SolrGraphiteReporterTest.java        |   12 +-
 .../reporters/SolrJmxReporterCloudTest.java        |    2 +-
 .../metrics/reporters/SolrSlf4jReporterTest.java   |    8 +
 .../reporters/solr/SolrCloudReportersTest.java     |   10 +-
 .../reporters/solr/SolrShardReporterTest.java      |    2 +
 .../metrics/rrd/SolrRrdBackendFactoryTest.java     |    4 +-
 .../apache/solr/request/TestIntervalFaceting.java  |    8 +-
 .../solr/response/TestGraphMLResponseWriter.java   |    2 +
 .../solr/response/TestRetrieveFieldsOptimizer.java |    2 +
 .../org/apache/solr/schema/BadIndexSchemaTest.java |    2 +
 .../test/org/apache/solr/schema/DocValuesTest.java |    4 +-
 .../schema/ManagedSchemaRoundRobinCloudTest.java   |    2 +
 .../apache/solr/schema/SchemaApiFailureTest.java   |    3 +
 .../schema/SchemaVersionSpecificBehaviorTest.java  |    4 +-
 .../solr/schema/SpatialRPTFieldTypeTest.java       |    2 +
 .../solr/schema/TestBulkSchemaConcurrent.java      |   66 +-
 .../apache/solr/schema/TestCloudManagedSchema.java |    2 +
 .../apache/solr/schema/TestCloudSchemaless.java    |   18 +-
 .../org/apache/solr/schema/TestManagedSchema.java  |    2 +
 .../solr/schema/TestUseDocValuesAsStored.java      |    5 +-
 .../solr/search/AnalyticsMergeStrategyTest.java    |    1 -
 .../solr/search/CurrencyRangeFacetCloudTest.java   |    2 +
 .../org/apache/solr/search/MergeStrategyTest.java  |    2 -
 .../org/apache/solr/search/TestCaffeineCache.java  |    4 +-
 .../org/apache/solr/search/TestIndexSearcher.java  |    4 +-
 .../org/apache/solr/search/TestRangeQuery.java     |    6 +-
 .../test/org/apache/solr/search/TestRecovery.java  |   13 +-
 .../org/apache/solr/search/TestRecoveryHdfs.java   |   13 +-
 .../org/apache/solr/search/TestSolr4Spatial2.java  |    5 +-
 .../org/apache/solr/search/TestXmlQParser.java     |    2 +
 .../org/apache/solr/search/facet/DebugAgg.java     |    5 +-
 .../solr/search/facet/TestCloudJSONFacetSKG.java   |    8 +-
 .../apache/solr/search/facet/TestJsonFacets.java   |   53 +-
 .../org/apache/solr/search/join/XCJFQueryTest.java |    9 +
 .../solr/search/mlt/CloudMLTQParserTest.java       |    2 +
 .../solr/search/stats/TestDefaultStatsCache.java   |    2 -
 .../solr/security/AuditLoggerIntegrationTest.java  |    2 +
 .../solr/security/BasicAuthIntegrationTest.java    |    4 +-
 .../solr/security/BasicAuthOnSingleNodeTest.java   |    2 +
 .../security/JWTAuthPluginIntegrationTest.java     |    8 +
 .../security/PKIAuthenticationIntegrationTest.java |    3 +
 .../solr/security/TestAuthorizationFramework.java  |    9 +-
 .../hadoop/TestDelegationWithHadoopAuth.java       |   16 +-
 .../hadoop/TestImpersonationWithHadoopAuth.java    |    4 +
 .../hadoop/TestSolrCloudWithHadoopAuthPlugin.java  |    4 +
 .../security/hadoop/TestZkAclsWithHadoopAuth.java  |    2 +
 .../solr/spelling/suggest/SuggesterTest.java       |    3 +-
 .../solr/store/blockcache/BlockCacheTest.java      |    1 +
 .../apache/solr/store/hdfs/HdfsDirectoryTest.java  |    8 +-
 .../solr/store/hdfs/HdfsLockFactoryTest.java       |    6 +-
 .../apache/solr/uninverting/TestFieldCache.java    |    6 +-
 .../uninverting/TestFieldCacheWithThreads.java     |    3 +-
 .../solr/uninverting/TestLegacyFieldCache.java     |    5 +-
 .../org/apache/solr/update/CdcrUpdateLogTest.java  |    3 +-
 .../solr/update/DirectUpdateHandlerTest.java       |    3 +-
 .../org/apache/solr/update/SoftAutoCommitTest.java |    2 +
 .../apache/solr/update/SolrCmdDistributorTest.java |    6 +-
 .../apache/solr/update/SolrIndexMetricsTest.java   |    1 +
 .../org/apache/solr/update/TestHdfsUpdateLog.java  |    8 +-
 .../update/TestInPlaceUpdateWithRouteField.java    |    4 +-
 .../solr/update/TestInPlaceUpdatesDistrib.java     |    2 +
 .../solr/update/TestIndexingPerformance.java       |    3 +-
 .../processor/DistributedUpdateProcessorTest.java  |    8 +-
 .../processor/TemplateUpdateProcessorTest.java     |    2 +-
 .../processor/TestNamedUpdateProcessors.java       |    3 +
 .../org/apache/solr/util/OrderedExecutorTest.java  |    1 +
 .../apache/solr/util/TestSolrCLIRunExample.java    |    2 +
 .../org/apache/solr/util/TestTestInjection.java    |    3 +-
 .../org/apache/solr/client/solrj/SolrClient.java   |    2 +-
 .../solr/client/solrj/cloud/DistributedLock.java   |  302 +++++
 .../solr/client/solrj/cloud/LockListener.java}     |   31 +-
 .../solr/client/solrj/cloud/ProtocolSupport.java   |  198 +++
 .../apache/solr/client/solrj/cloud/ZNodeName.java  |  141 +++
 .../client/solrj/cloud/ZooKeeperOperation.java}    |   25 +-
 .../solr/client/solrj/embedded/SSLConfig.java      |    2 +-
 .../client/solrj/impl/BaseCloudSolrClient.java     |  164 ++-
 .../client/solrj/impl/CloudHttp2SolrClient.java    |   11 +-
 .../solr/client/solrj/impl/CloudSolrClient.java    |   64 +-
 .../impl/ConcurrentUpdateHttp2SolrClient.java      |   24 +-
 .../solrj/impl/Http2ClusterStateProvider.java      |    4 +-
 .../solr/client/solrj/impl/Http2SolrClient.java    |   72 +-
 .../solr/client/solrj/impl/HttpClientUtil.java     |   58 +-
 .../solrj/impl/HttpClusterStateProvider.java       |   19 +-
 .../solr/client/solrj/impl/HttpSolrClient.java     |   35 +-
 .../solr/client/solrj/impl/LBHttpSolrClient.java   |   34 +-
 .../solr/client/solrj/impl/SolrClientBuilder.java  |    6 +-
 .../solrj/impl/SolrClientNodeStateProvider.java    |    6 +-
 .../solrj/impl/SolrHttpClientContextBuilder.java   |    6 +-
 .../client/solrj/impl/SolrHttpClientScheduler.java |  105 ++
 .../solrj/impl/SolrHttpRequestRetryHandler.java    |   36 +-
 .../solrj/impl/ZkClientClusterStateProvider.java   |   26 +-
 .../solr/client/solrj/io/SolrClientCache.java      |   10 +-
 .../client/solrj/io/sql/DatabaseMetaDataImpl.java  |    2 +-
 .../solr/client/solrj/io/stream/DaemonStream.java  |    3 +-
 .../solr/client/solrj/io/stream/SolrStream.java    |    2 +-
 .../solr/client/solrj/io/stream/TopicStream.java   |    8 +
 .../solrj/request/CollectionAdminRequest.java      |    2 +-
 .../src/java/org/apache/solr/common/ParWork.java   |  786 ++++++++++++
 .../java/org/apache/solr/common/SolrException.java |   12 +-
 .../java/org/apache/solr/common/TimeTracker.java   |  267 ++++
 .../java/org/apache/solr/common/WorkException.java |    6 +
 .../org/apache/solr/common/cloud/ClusterState.java |    2 +-
 .../common/cloud/CollectionStatePredicate.java     |    3 +
 .../solr/common/cloud/ConnectionManager.java       |  110 +-
 .../common/cloud/DefaultConnectionStrategy.java    |    6 +-
 .../apache/solr/common/cloud/DocCollection.java    |   18 +-
 .../org/apache/solr/common/cloud/SolrZkClient.java |  373 ++++--
 .../apache/solr/common/cloud/SolrZooKeeper.java    |   91 +-
 .../apache/solr/common/cloud/ZkCmdExecutor.java    |   39 +-
 .../apache/solr/common/cloud/ZkConfigManager.java  |    4 +-
 .../solr/common/cloud/ZkMaintenanceUtils.java      |   26 +-
 .../apache/solr/common/cloud/ZkStateReader.java    |  249 ++--
 .../org/apache/solr/common/params/QoSParams.java}  |   12 +-
 .../org/apache/solr/common/util/ExecutorUtil.java  |   20 +-
 .../java/org/apache/solr/common/util/IOUtils.java  |   10 +
 .../solr/common/util/ObjectReleaseTracker.java     |   11 +-
 .../apache/solr/common}/util/OrderedExecutor.java  |   10 +-
 .../java/org/apache/solr/common}/util/TimeOut.java |    4 +-
 .../apache/solr/common/util/ValidatingJsonMap.java |   12 +-
 .../org/apache/zookeeper/ZooKeeperExposed.java     |   34 +
 .../collection1/conf/solrconfig-managed-schema.xml |    4 +
 .../solr/collection1/conf/solrconfig-slave1.xml    |    1 +
 .../solrj/solr/collection1/conf/solrconfig-sql.xml |    1 +
 .../solrj/solr/collection1/conf/solrconfig.xml     |    3 +-
 .../configset-1/conf/solrconfig-minimal.xml        |    5 +
 .../configsets/configset-2/conf/solrconfig.xml     |    5 +
 .../solrj/solr/configsets/ml/conf/solrconfig.xml   |    1 +
 .../solr/configsets/shared/conf/solrconfig.xml     |    1 +
 .../solr/configsets/spatial/conf/solrconfig.xml    |    5 +
 .../solr/configsets/streaming/conf/solrconfig.xml  |    1 +
 .../solrj/solr/multicore/core0/conf/solrconfig.xml |    5 +
 .../solrj/solr/multicore/core1/conf/solrconfig.xml |    5 +
 .../solr/client/solrj/TestLBHttp2SolrClient.java   |    5 +-
 .../solr/client/solrj/TestLBHttpSolrClient.java    |    5 +-
 .../solr/client/solrj/TestSolrJErrorHandling.java  |    6 +-
 .../solrj/impl/TestCloudSolrClientConnections.java |    2 +-
 .../client/solrj/io/graph/GraphExpressionTest.java |    6 +-
 .../solr/client/solrj/request/TestV2Request.java   |    1 -
 .../cloud/TestCloudCollectionsListeners.java       |   10 +-
 .../common/cloud/TestCollectionStateWatchers.java  |   12 +-
 .../common/cloud/TestDocCollectionWatcher.java     |    3 +-
 .../solr/common/cloud/TestZkConfigManager.java     |    4 +-
 .../apache/solr/BaseDistributedSearchTestCase.java |   44 +-
 .../org/apache/solr/SolrIgnoredThreadsFilter.java  |   18 +
 .../src/java/org/apache/solr/SolrTestCase.java     |  287 ++++-
 .../src/java/org/apache/solr/SolrTestCaseJ4.java   |  103 +-
 .../solr/cloud/AbstractDistribZkTestBase.java      |   84 +-
 .../solr/cloud/AbstractFullDistribZkTestBase.java  |  554 ++++-----
 .../apache/solr/cloud/MiniSolrCloudCluster.java    |  457 ++++---
 .../apache/solr/cloud/MultiSolrCloudTestCase.java  |    8 +-
 .../solr/cloud/NoOpenOverseerFoundException.java   |    6 +
 .../org/apache/solr/cloud/SolrCloudTestCase.java   |   31 +-
 .../apache/solr/cloud/StoppableIndexingThread.java |    9 +-
 .../java/org/apache/solr/cloud/ZkTestServer.java   |  507 ++++----
 .../org/apache/solr/util/BadHdfsThreadsFilter.java |    4 +-
 .../java/org/apache/solr/util/BaseTestHarness.java |   36 +-
 .../java/org/apache/solr/util/DOMUtilTestBase.java |    5 +-
 .../java/org/apache/solr/util/RandomizeSSL.java    |   13 +-
 .../java/org/apache/solr/util/RestTestHarness.java |   22 +-
 .../java/org/apache/solr/util/SSLTestConfig.java   |    7 +
 .../src/java/org/apache/solr/util/TestHarness.java |   37 +-
 .../org/apache/solr/cloud/JettySolrRunnerTest.java |    2 +-
 versions.props                                     |    4 +-
 662 files changed, 14431 insertions(+), 8287 deletions(-)

diff --git a/build.gradle b/build.gradle
index 5fc3609..83368a1 100644
--- a/build.gradle
+++ b/build.gradle
@@ -150,3 +150,20 @@ apply from: file('gradle/documentation/documentation.gradle')
 apply from: file('gradle/documentation/changes-to-html.gradle')
 apply from: file('gradle/documentation/markdown.gradle')
 apply from: file('gradle/render-javadoc.gradle')
+
+allprojects {
+  task ufclasspath {
+    doLast{
+      File ufPath = new File(project.getRootDir().getParentFile(), "unitflier/run/solr");
+      if (configurations.hasProperty('testRuntimeClasspath')) {
+        java.io.File file = new java.io.File(ufPath, project.projectDir.name + '.txt');
+        file.getParentFile().mkdirs();
+        file.write project.projectDir.toString() + "\n"
+        file << sourceSets.test.output.classesDirs.asPath + "\n"
+        file << project.projectDir.toString() + "/src/test-files" + ":" + project.projectDir.toString() + "/src/resources" + ":" + sourceSets.main.output.classesDirs.asPath + ":"
+        file << sourceSets.test.output.classesDirs.asPath + ":"
+        file << configurations.testRuntimeClasspath.asPath + "\n"
+      }
+    }
+  }
+}
\ No newline at end of file
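
A usage sketch for the ufclasspath task added above, assuming the
standard Gradle wrapper: for every subproject with a
testRuntimeClasspath configuration it writes <projectDirName>.txt under
../unitflier/run/solr (a sibling of the checkout root), containing the
project dir, the test-files/resources/class dirs, and the full test
runtime classpath:

    ./gradlew ufclasspath
    cat ../unitflier/run/solr/core.txt   # e.g. the file written for solr/core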
diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle
index 583b76e..f277991 100644
--- a/gradle/testing/defaults-tests.gradle
+++ b/gradle/testing/defaults-tests.gradle
@@ -51,10 +51,11 @@ allprojects {
     }
 
     test {
+      reports.junitXml.destination file(propertyOrDefault("reports.dest", "${reports.junitXml.destination.toString()}"))
       ext {
         testOutputsDir = file("${reports.junitXml.destination}/outputs")
       }
-
+      binaryResultsDirectory = file(propertyOrDefault("binaryResultsDirectory", binaryResultsDirectory))
       if (verboseMode) {
         maxParallelForks = 1
       } else {
@@ -65,9 +66,9 @@ allprojects {
       useJUnit()
 
       minHeapSize = propertyOrDefault("tests.minheapsize", "256m")
-      maxHeapSize = propertyOrDefault("tests.heapsize", "512m")
+      maxHeapSize = propertyOrDefault("tests.heapsize", "3000m")
 
-      jvmArgs Commandline.translateCommandline(propertyOrDefault("tests.jvmargs", "-XX:TieredStopAtLevel=1"))
+      jvmArgs Commandline.translateCommandline(propertyOrDefault("tests.jvmargs", ""))
 
       systemProperty 'java.util.logging.config.file', file("${commonDir}/tools/junit4/logging.properties")
       systemProperty 'java.awt.headless', 'true'
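
The propertyOrDefault calls above make these settings overridable per
invocation with Gradle -P project properties; a sketch restoring the
old heap and JVM-arg defaults for a single run (the /tmp paths are
arbitrary examples):

    ./gradlew -p solr/core test \
        -Ptests.heapsize=512m \
        -Ptests.jvmargs="-XX:TieredStopAtLevel=1" \
        -Preports.dest=/tmp/junit-reports \
        -PbinaryResultsDirectory=/tmp/junit-bin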
diff --git a/gradle/testing/policies/solr-tests.policy b/gradle/testing/policies/solr-tests.policy
index 1290a38..099762d 100644
--- a/gradle/testing/policies/solr-tests.policy
+++ b/gradle/testing/policies/solr-tests.policy
@@ -20,6 +20,8 @@
 // permissions needed for tests to pass, based on properties set by the build system
 // NOTE: if the property is not set, the permission entry is ignored.
 grant {
+  permission java.io.FilePermission "/home/mm/junit.properties", "read";
+
   // 3rd party jar resources (where symlinks are not supported), test-files/ resources
   permission java.io.FilePermission "${common.dir}${/}-", "read";
   permission java.io.FilePermission "${common.dir}${/}..${/}solr${/}-", "read";
diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties
index ed4f53a..263fe57 100644
--- a/lucene/ivy-versions.properties
+++ b/lucene/ivy-versions.properties
@@ -23,6 +23,9 @@ com.fasterxml.jackson.core.version = 2.10.1
 /com.fasterxml.jackson.core/jackson-databind = ${com.fasterxml.jackson.core.version}
 /com.fasterxml.jackson.dataformat/jackson-dataformat-smile = ${com.fasterxml.jackson.core.version}
 
+com.fasterxml.woodstox.version = 6.0.3
+/com.fasterxml.woodstox/woodstox-core = ${com.fasterxml.woodstox.version}
+
 /com.github.ben-manes.caffeine/caffeine = 2.8.4
 /com.github.virtuald/curvesapi = 1.06
 
@@ -103,7 +106,7 @@ io.prometheus.version = 0.2.0
 /net.arnx/jsonic = 1.2.7
 /net.bytebuddy/byte-buddy = 1.9.3
 /net.hydromatic/eigenbase-properties = 1.1.5
-
+/net.sf.saxon/Saxon-HE = 10.1
 net.sourceforge.argparse4j.version = 0.8.1
 /net.sourceforge.argparse4j/argparse4j = ${net.sourceforge.argparse4j.version}
 
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
index 2028704..81de782 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
@@ -86,7 +86,7 @@ public class MockTokenizer extends Tokenizer {
   private boolean enableChecks = true;
   
   // evil: but we don't change the behavior with this random, we only switch up how we read
-  private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());
+  //private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());
   
   public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
     super(factory);
@@ -227,7 +227,8 @@ public class MockTokenizer extends Tokenizer {
   }
   
   protected int readChar() throws IOException {
-    switch(random.nextInt(10)) {
+    // this random can be created out of context and cause failures due to 'static test class initializers are not permitted to access random contexts'
+    switch(0) { // random.nextInt(10)
       case 0: {
         // read(char[])
         char c[] = new char[1];
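
The hardcoded switch(0) above simply disables the randomized read
strategy; an alternative sketch (not what this patch does, a fragment
assuming the surrounding MockTokenizer class) would defer creating the
Random until the tokenizer is first used, when a randomized test
context is far more likely to exist:

    // Hypothetical alternative: lazy init instead of a field initializer,
    // so RandomizedContext.current() is never reached from a static/class
    // initialization context.
    private Random random;

    private Random random() {
      if (random == null) {
        random = new Random(RandomizedContext.current().getRandom().nextLong());
      }
      return random;
    }

    protected int readChar() throws IOException {
      switch (random().nextInt(10)) {
        // ... cases unchanged
      }
    }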
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
index aef11ac..39bce04 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
@@ -275,7 +275,9 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
    */
   @Override
   protected void after() throws Exception {
-    Codec.setDefault(savedCodec);
+    if (savedCodec != null) {
+      Codec.setDefault(savedCodec);
+    }
     InfoStream.setDefault(savedInfoStream);
     if (savedLocale != null) Locale.setDefault(savedLocale);
     if (savedTimeZone != null) TimeZone.setDefault(savedTimeZone);
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml
index 90c52d7..bb4f7ab 100644
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml
+++ b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml
@@ -21,6 +21,7 @@
   <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
   <indexConfig>
     <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+    <lockType>${solr.lockType:single}</lockType>
   </indexConfig>
   <requestHandler name="/select" class="solr.SearchHandler"></requestHandler>
   <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
diff --git a/solr/contrib/analytics/src/test-files/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml b/solr/contrib/analytics/src/test-files/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml
index ecf1f14..2d1d58e 100644
--- a/solr/contrib/analytics/src/test-files/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml
+++ b/solr/contrib/analytics/src/test-files/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml
@@ -42,5 +42,7 @@ A solrconfig.xml snippet containing indexConfig settings for randomized testing.
        use the single process lockType for speed - but tests that explicitly need
       to vary the lockType can set it as needed.
   -->
+
   <lockType>${solr.tests.lockType:single}</lockType>
+
 </indexConfig>
diff --git a/solr/contrib/analytics/src/test-files/solr/configsets/cloud-analytics/conf/solrconfig.xml b/solr/contrib/analytics/src/test-files/solr/configsets/cloud-analytics/conf/solrconfig.xml
index 102e39e..50ab1fb 100644
--- a/solr/contrib/analytics/src/test-files/solr/configsets/cloud-analytics/conf/solrconfig.xml
+++ b/solr/contrib/analytics/src/test-files/solr/configsets/cloud-analytics/conf/solrconfig.xml
@@ -29,6 +29,11 @@
 
   <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
 
+  <indexConfig>
+    <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+    <lockType>${solr.tests.lockType:single}</lockType>
+  </indexConfig>
+
   <updateHandler class="solr.DirectUpdateHandler2">
     <commitWithin>
       <softCommit>${solr.commitwithin.softcommit:true}</softCommit>
diff --git a/solr/contrib/analytics/src/test/org/apache/solr/analytics/legacy/LegacyAbstractAnalyticsTest.java b/solr/contrib/analytics/src/test/org/apache/solr/analytics/legacy/LegacyAbstractAnalyticsTest.java
index 2f78203..ee1cc2e 100644
--- a/solr/contrib/analytics/src/test/org/apache/solr/analytics/legacy/LegacyAbstractAnalyticsTest.java
+++ b/solr/contrib/analytics/src/test/org/apache/solr/analytics/legacy/LegacyAbstractAnalyticsTest.java
@@ -40,6 +40,7 @@ import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.analytics.util.AnalyticsResponseHeadings;
 import org.apache.solr.analytics.util.MedianCalculator;
 import org.apache.solr.analytics.util.OrdinalCalculator;
+import org.apache.solr.core.XmlConfigFile;
 import org.apache.solr.request.SolrQueryRequest;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -81,7 +82,7 @@ public class LegacyAbstractAnalyticsTest extends SolrTestCaseJ4 {
 
   @BeforeClass
   public static void beforeClassAbstractAnalysis() {
-    xPathFact = XPathFactory.newInstance();
+    xPathFact = XmlConfigFile.xpathFactory;
   }
 
   @AfterClass
diff --git a/solr/contrib/analytics/src/test/org/apache/solr/analytics/legacy/facet/LegacyAbstractAnalyticsFacetTest.java b/solr/contrib/analytics/src/test/org/apache/solr/analytics/legacy/facet/LegacyAbstractAnalyticsFacetTest.java
index d406b67..96dcbbb 100644
--- a/solr/contrib/analytics/src/test/org/apache/solr/analytics/legacy/facet/LegacyAbstractAnalyticsFacetTest.java
+++ b/solr/contrib/analytics/src/test/org/apache/solr/analytics/legacy/facet/LegacyAbstractAnalyticsFacetTest.java
@@ -34,6 +34,7 @@ import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.analytics.util.AnalyticsResponseHeadings;
 import org.apache.solr.analytics.util.MedianCalculator;
 import org.apache.solr.analytics.util.OrdinalCalculator;
+import org.apache.solr.core.XmlConfigFile;
 import org.apache.solr.request.SolrQueryRequest;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -64,7 +65,7 @@ public class LegacyAbstractAnalyticsFacetTest extends SolrTestCaseJ4 {
 
   @BeforeClass
   public static void beforeClassAbstractAnalysis() {
-    xPathFact = XPathFactory.newInstance();
+    xPathFact = XmlConfigFile.xpathFactory;
   }
 
   @AfterClass
@@ -88,7 +89,7 @@ public class LegacyAbstractAnalyticsFacetTest extends SolrTestCaseJ4 {
   }
 
   protected Node getNode(String xPath) throws XPathExpressionException {
-    return (Node)xPathFact.newXPath().compile(xPath).evaluate(doc, XPathConstants.NODE);
+    return (Node)XmlConfigFile.xpath.compile(xPath).evaluate(doc, XPathConstants.NODE);
   }
   private NodeList getNodes(String n1, String n2, String n3, String element, String n4) throws XPathExpressionException {
     // Construct the XPath expression. The form better not change or all these will fail.
@@ -97,7 +98,7 @@ public class LegacyAbstractAnalyticsFacetTest extends SolrTestCaseJ4 {
     sb.append("/lst[@name='").append(n3).append("']");
     sb.append("/lst[@name!='(MISSING)']");
     sb.append("//").append(element).append("[@name='").append(n4).append("']");
-    return (NodeList)xPathFact.newXPath().compile(sb.toString()).evaluate(doc, XPathConstants.NODESET);
+    return (NodeList)XmlConfigFile.xpath.compile(sb.toString()).evaluate(doc, XPathConstants.NODESET);
 
   }
   protected ArrayList<String> getStringList(String n1, String n2, String n3, String element, String n4)
@@ -336,7 +337,7 @@ public class LegacyAbstractAnalyticsFacetTest extends SolrTestCaseJ4 {
 
   protected NodeList getNodes(String xPath) throws XPathExpressionException {
     StringBuilder sb = new StringBuilder(xPath);
-    return (NodeList) xPathFact.newXPath().compile(sb.toString()).evaluate(doc, XPathConstants.NODESET);
+    return (NodeList) XmlConfigFile.xpath.compile(sb.toString()).evaluate(doc, XPathConstants.NODESET);
   }
 
 }
diff --git a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/DistributedClusteringComponentTest.java b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/DistributedClusteringComponentTest.java
index 89d3ddf..fda70a4 100644
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/DistributedClusteringComponentTest.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/DistributedClusteringComponentTest.java
@@ -32,7 +32,6 @@ public class DistributedClusteringComponentTest extends
 
   @Test
   public void test() throws Exception {
-    del("*:*");
     int numberOfDocs = 0;
     for (String[] doc : AbstractClusteringTestCase.DOCUMENTS) {
       index(id, Integer.toString(numberOfDocs++), "url", doc[0], "title", doc[1], "snippet", doc[2]);
diff --git a/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-solrconfig.xml b/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-solrconfig.xml
index f9f5304..834f332 100644
--- a/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-solrconfig.xml
+++ b/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-solrconfig.xml
@@ -20,6 +20,7 @@
   <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
   <indexConfig>
     <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+    <lockType>${solr.tests.lockType:single}</lockType>
   </indexConfig>
 
   <!-- Used to specify an alternate directory to hold all index data
diff --git a/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SolrEntityProcessor.java b/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SolrEntityProcessor.java
index 7732673..11ea7cc 100644
--- a/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SolrEntityProcessor.java
+++ b/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/SolrEntityProcessor.java
@@ -117,12 +117,14 @@ public class SolrEntityProcessor extends EntityProcessorBase {
         solrClient = new Builder(url.toExternalForm())
             .withHttpClient(client)
             .withResponseParser(new XMLResponseParser())
+            .markInternalRequest()
             .build();
         log.info("using XMLResponseParser");
       } else {
         // TODO: it doesn't matter for this impl when passing a client currently, but we should close this!
         solrClient = new Builder(url.toExternalForm())
             .withHttpClient(client)
+            .markInternalRequest()
             .build();
         log.info("using BinaryResponseParser");
       }
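
markInternalRequest() is a builder flag added on this branch; it marks
the client's traffic as Solr-internal (node-to-node) rather than
user-facing, presumably so server-side filtering such as the new
SolrQoSFilter can treat the two classes of requests differently. A
minimal usage sketch, assuming url and client are in scope as above:

    // Build a client whose requests are flagged as internal traffic.
    SolrClient solrClient = new HttpSolrClient.Builder(url.toExternalForm())
        .withHttpClient(client)
        .withResponseParser(new XMLResponseParser())
        .markInternalRequest() // branch-specific flag, see diff above
        .build();
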
diff --git a/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/contentstream-solrconfig.xml b/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/contentstream-solrconfig.xml
index d3ee34c..c400f4c 100644
--- a/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/contentstream-solrconfig.xml
+++ b/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/contentstream-solrconfig.xml
@@ -20,6 +20,7 @@
   <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
   <indexConfig>
     <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+    <lockType>${solr.tests.lockType:single}</lockType>
   </indexConfig>
 
   <!-- Used to specify an alternate directory to hold all index data
diff --git a/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/dataimport-solrconfig.xml b/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/dataimport-solrconfig.xml
index ec6e6a9..d0c5e36 100644
--- a/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/dataimport-solrconfig.xml
+++ b/solr/contrib/dataimporthandler/src/test-files/dih/solr/collection1/conf/dataimport-solrconfig.xml
@@ -20,6 +20,7 @@
   <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
   <indexConfig>
     <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+    <lockType>${solr.tests.lockType:single}</lockType>
   </indexConfig>
 
   <!-- Used to specify an alternate directory to hold all index data
diff --git a/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/DestroyCountCache.java b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/DestroyCountCache.java
index d14f43e..bbe1253 100644
--- a/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/DestroyCountCache.java
+++ b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/DestroyCountCache.java
@@ -18,13 +18,14 @@ package org.apache.solr.handler.dataimport;
 
 import static org.hamcrest.CoreMatchers.nullValue;
 
+import java.util.Collections;
 import java.util.IdentityHashMap;
 import java.util.Map;
 
 import org.junit.Assert;
 
 public class DestroyCountCache extends SortedMapBackedCache {
-  static Map<DIHCache,DIHCache> destroyed = new IdentityHashMap<>();
+  static Map<DIHCache,DIHCache> destroyed = Collections.synchronizedMap(new IdentityHashMap<>());
   
   @Override
   public void destroy() {
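
Two properties matter for this static cache-destruction registry:
IdentityHashMap keys by reference identity, so each DIHCache instance
counts once even if equals() would match, and the synchronizedMap
wrapper makes the shared field safe when caches are destroyed from
multiple threads. The idiom in isolation:

    // Identity-keyed, thread-safe registry: == equality, locked access.
    Map<DIHCache, DIHCache> destroyed =
        Collections.synchronizedMap(new IdentityHashMap<>());
    destroyed.put(cache, cache);                 // safe from any thread
    boolean seen = destroyed.containsKey(cache); // identity, not equals()
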
diff --git a/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestErrorHandling.java b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestErrorHandling.java
index 2391ae8..5b8f30e 100644
--- a/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestErrorHandling.java
+++ b/solr/contrib/dataimporthandler/src/test/org/apache/solr/handler/dataimport/TestErrorHandling.java
@@ -39,7 +39,7 @@ public class TestErrorHandling extends AbstractDataImportHandlerTestCase {
   @BeforeClass
   public static void beforeClass() throws Exception {
     savedFactory = System.getProperty("solr.DirectoryFactory");
-    System.setProperty("solr.directoryFactory", "solr.MockFSDirectoryFactory");
+    //System.setProperty("solr.directoryFactory", "solr.MockFSDirectoryFactory");
     initCore("dataimport-solrconfig.xml", "dataimport-schema.xml");
     ignoreException("Unexpected close tag");
   }
diff --git a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
index ba9ea59..304bd82 100644
--- a/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
+++ b/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
@@ -22,6 +22,7 @@
   <jmx />
   <indexConfig>
     <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+    <lockType>${solr.tests.lockType:single}</lockType>
   </indexConfig>
 
   <!-- Used to specify an alternate directory to hold all index data.
diff --git a/solr/contrib/jaegertracer-configurator/src/test-files/solr/collection1/conf/solrconfig.xml b/solr/contrib/jaegertracer-configurator/src/test-files/solr/collection1/conf/solrconfig.xml
index 853ba65..d380e82 100644
--- a/solr/contrib/jaegertracer-configurator/src/test-files/solr/collection1/conf/solrconfig.xml
+++ b/solr/contrib/jaegertracer-configurator/src/test-files/solr/collection1/conf/solrconfig.xml
@@ -29,6 +29,11 @@
 
   <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
 
+  <indexConfig>
+    <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+    <lockType>${solr.tests.lockType:single}</lockType>
+  </indexConfig>
+
   <updateHandler class="solr.DirectUpdateHandler2">
     <commitWithin>
       <softCommit>${solr.commitwithin.softcommit:true}</softCommit>
diff --git a/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml
index 01dbee9..2e31d66 100644
--- a/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml
+++ b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml
@@ -22,6 +22,7 @@
   <jmx />
   <indexConfig>
     <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+    <lockType>${solr.tests.lockType:single}</lockType>
   </indexConfig>
 
   <!-- Used to specify an alternate directory to hold all index data.
diff --git a/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-ltr.xml b/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-ltr.xml
index 057718a..d527fe1 100644
--- a/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-ltr.xml
+++ b/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-ltr.xml
@@ -19,6 +19,11 @@
  <!-- for use with the DefaultWrapperModel class -->
  <lib dir="${solr.solr.home:.}/models" />
 
+ <indexConfig>
+  <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+  <lockType>${solr.tests.lockType:single}</lockType>
+ </indexConfig>
+
  <schemaFactory class="ClassicIndexSchemaFactory" />
 
  <requestDispatcher>
diff --git a/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-ltr_Th10_10.xml b/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-ltr_Th10_10.xml
index f40110d..9693944 100644
--- a/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-ltr_Th10_10.xml
+++ b/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-ltr_Th10_10.xml
@@ -16,6 +16,11 @@
  <directoryFactory name="DirectoryFactory"
   class="${solr.directoryFactory:solr.RAMDirectoryFactory}" />
 
+ <indexConfig>
+  <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+  <lockType>${solr.tests.lockType:single}</lockType>
+ </indexConfig>
+
  <schemaFactory class="ClassicIndexSchemaFactory" />
 
  <requestDispatcher>
diff --git a/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-multiseg.xml b/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-multiseg.xml
index 53d607b..fe8a00d 100644
--- a/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-multiseg.xml
+++ b/solr/contrib/ltr/src/test-files/solr/collection1/conf/solrconfig-multiseg.xml
@@ -16,6 +16,11 @@
  <directoryFactory name="DirectoryFactory"
   class="${solr.directoryFactory:solr.RAMDirectoryFactory}" />
 
+ <indexConfig>
+  <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+  <lockType>${solr.tests.lockType:single}</lockType>
+ </indexConfig>
+
  <schemaFactory class="ClassicIndexSchemaFactory" />
 
  <requestDispatcher>
diff --git a/solr/contrib/prometheus-exporter/src/java/org/apache/solr/prometheus/exporter/SolrClientFactory.java b/solr/contrib/prometheus-exporter/src/java/org/apache/solr/prometheus/exporter/SolrClientFactory.java
index 102d649..81c808b 100644
--- a/solr/contrib/prometheus-exporter/src/java/org/apache/solr/prometheus/exporter/SolrClientFactory.java
+++ b/solr/contrib/prometheus-exporter/src/java/org/apache/solr/prometheus/exporter/SolrClientFactory.java
@@ -45,7 +45,7 @@ public class SolrClientFactory {
     standaloneBuilder.withConnectionTimeout(settings.getHttpConnectionTimeout())
         .withSocketTimeout(settings.getHttpReadTimeout());
 
-    HttpSolrClient httpSolrClient = standaloneBuilder.build();
+    HttpSolrClient httpSolrClient = standaloneBuilder.markInternalRequest().build();
     httpSolrClient.setParser(responseParser);
 
     return httpSolrClient;
diff --git a/solr/contrib/velocity/src/test-files/velocity/solr/collection1/conf/solrconfig.xml b/solr/contrib/velocity/src/test-files/velocity/solr/collection1/conf/solrconfig.xml
index 35ce52b..0351cc3 100644
--- a/solr/contrib/velocity/src/test-files/velocity/solr/collection1/conf/solrconfig.xml
+++ b/solr/contrib/velocity/src/test-files/velocity/solr/collection1/conf/solrconfig.xml
@@ -19,6 +19,11 @@
 <config>
   <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
 
+  <indexConfig>
+    <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
+    <lockType>${solr.tests.lockType:single}</lockType>
+  </indexConfig>
+
   <!--<lib dir="../../contrib/velocity/lib" />-->
   <!--<lib dir="../../dist/" regex="solr-velocity-\d.*\.jar" />-->
 
diff --git a/solr/core/build.gradle b/solr/core/build.gradle
index 71002c2..cf64b13 100644
--- a/solr/core/build.gradle
+++ b/solr/core/build.gradle
@@ -56,10 +56,13 @@ dependencies {
   api 'org.apache.commons:commons-lang3'
   api 'com.carrotsearch:hppc'
   api 'com.fasterxml.jackson.core:jackson-databind'
+  api 'com.fasterxml.woodstox:woodstox-core'
   api 'commons-cli:commons-cli'
   api 'commons-codec:commons-codec'
   api 'commons-collections:commons-collections'
 
+  implementation 'net.sf.saxon:Saxon-HE'
+
   implementation 'com.fasterxml.jackson.dataformat:jackson-dataformat-smile'
 
   implementation('com.github.ben-manes.caffeine:caffeine', {
diff --git a/solr/core/ivy.xml b/solr/core/ivy.xml
index c632c47..4095916 100644
--- a/solr/core/ivy.xml
+++ b/solr/core/ivy.xml
@@ -67,6 +67,10 @@
     <dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="${/com.fasterxml.jackson.core/jackson-annotations}" conf="compile"/>
     <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-smile" rev="${/com.fasterxml.jackson.dataformat/jackson-dataformat-smile}" conf="compile"/>
 
+    <dependency org="com.fasterxml.woodstox" name="woodstox-core" rev="${/com.fasterxml.woodstox/woodstox-core}" conf="compile"/>
+
+    <dependency org="net.sf.saxon" name="Saxon-HE" rev="${/net.sf.saxon/Saxon-HE}" conf="compile"/>
+
     <dependency org="org.apache.hadoop" name="hadoop-auth" rev="${/org.apache.hadoop/hadoop-auth}" conf="compile.hadoop"/>
     <dependency org="org.apache.hadoop" name="hadoop-common" rev="${/org.apache.hadoop/hadoop-common}" conf="compile.hadoop"/>
     <dependency org="org.apache.hadoop" name="hadoop-hdfs-client" rev="${/org.apache.hadoop/hadoop-hdfs-client}" conf="compile.hadoop"/>
diff --git a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettyConfig.java b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettyConfig.java
index e4a0547..0abec45 100644
--- a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettyConfig.java
+++ b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettyConfig.java
@@ -17,6 +17,7 @@
 package org.apache.solr.client.solrj.embedded;
 
 import org.eclipse.jetty.servlet.ServletHolder;
+import org.eclipse.jetty.util.thread.QueuedThreadPool;
 
 import javax.servlet.Filter;
 import java.util.LinkedHashMap;
@@ -34,7 +35,6 @@ public class JettyConfig {
 
   public final boolean enableV2;
 
-
   public final boolean stopAtShutdown;
   
   public final Long waitForLoadingCoresToFinishMs;
@@ -47,9 +47,13 @@ public class JettyConfig {
   
   public final int portRetryTime;
 
+  public final boolean enableProxy;
+
+  public final QueuedThreadPool qtp;
+
   private JettyConfig(boolean onlyHttp1, int port, int portRetryTime , String context, boolean stopAtShutdown,
                       Long waitForLoadingCoresToFinishMs, Map<ServletHolder, String> extraServlets,
-                      Map<Class<? extends Filter>, String> extraFilters, SSLConfig sslConfig, boolean enableV2) {
+                      Map<Class<? extends Filter>, String> extraFilters, SSLConfig sslConfig, boolean enableV2, boolean enableProxy, QueuedThreadPool qtp) {
     this.onlyHttp1 = onlyHttp1;
     this.port = port;
     this.context = context;
@@ -60,6 +64,8 @@ public class JettyConfig {
     this.sslConfig = sslConfig;
     this.portRetryTime = portRetryTime;
     this.enableV2 = enableV2;
+    this.enableProxy = enableProxy;
+    this.qtp = qtp;
   }
 
   public static Builder builder() {
@@ -74,6 +80,12 @@ public class JettyConfig {
     builder.extraServlets = other.extraServlets;
     builder.extraFilters = other.extraFilters;
     builder.sslConfig = other.sslConfig;
+    builder.enableProxy = other.enableProxy;
+    builder.portRetryTime = other.portRetryTime;
+    builder.onlyHttp1 = other.onlyHttp1;
+    builder.waitForLoadingCoresToFinishMs = other.waitForLoadingCoresToFinishMs;
+    builder.enableV2 = other.enableV2;
+    builder.qtp = other.qtp;
     return builder;
   }
 
@@ -89,6 +101,8 @@ public class JettyConfig {
     Map<Class<? extends Filter>, String> extraFilters = new LinkedHashMap<>();
     SSLConfig sslConfig = null;
     int portRetryTime = 60;
+    boolean enableProxy;
+    QueuedThreadPool qtp;
 
     public Builder useOnlyHttp1(boolean useOnlyHttp1) {
       this.onlyHttp1 = useOnlyHttp1;
@@ -151,10 +165,20 @@ public class JettyConfig {
       return this;
     }
 
+    public Builder enableProxy(boolean enable) {
+      this.enableProxy = enable;
+      return this;
+    }
+
+    public Builder withExecutor(QueuedThreadPool qtp) {
+      this.qtp = qtp;
+      return this;
+    }
+
 
     public JettyConfig build() {
       return new JettyConfig(onlyHttp1, port, portRetryTime, context, stopAtShutdown,
-          waitForLoadingCoresToFinishMs, extraServlets, extraFilters, sslConfig, enableV2);
+          waitForLoadingCoresToFinishMs, extraServlets, extraFilters, sslConfig, enableV2, enableProxy, qtp);
     }
 
   }
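
Note that builder(other) now also carries over portRetryTime, onlyHttp1,
waitForLoadingCoresToFinishMs and enableV2, which were silently dropped
on copy before, in addition to the two new fields. Assembling a config
with the new fields looks like this (a sketch using only the builder
methods shown above; port and context setters elided):

    QueuedThreadPool qtp = new QueuedThreadPool(); // or a shared pool
    JettyConfig config = JettyConfig.builder()
        .useOnlyHttp1(false)
        .enableProxy(true)  // new: SocketProxy enabled via config
        .withExecutor(qtp)  // new: inject a shared Jetty thread pool
        .build();
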
diff --git a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
index 5a17f4c..57df4f2 100644
--- a/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
+++ b/solr/core/src/java/org/apache/solr/client/solrj/embedded/JettySolrRunner.java
@@ -26,6 +26,7 @@ import javax.servlet.ServletResponse;
 import javax.servlet.http.HttpServlet;
 import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
+import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.net.BindException;
@@ -39,21 +40,39 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Random;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.lucene.util.Constants;
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.cloud.SocketProxy;
+import org.apache.solr.client.solrj.impl.HttpClientUtil;
 import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.apache.solr.client.solrj.impl.SolrHttpClientScheduler;
+import org.apache.solr.cloud.ZkController;
+import org.apache.solr.common.ParWork;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.cloud.SolrZkClient;
+import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.SolrNamedThreadFactory;
 import org.apache.solr.common.util.TimeSource;
+import org.apache.solr.core.CloudConfig;
 import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.NodeConfig;
 import org.apache.solr.servlet.SolrDispatchFilter;
+import org.apache.solr.servlet.SolrQoSFilter;
 import org.apache.solr.util.TimeOut;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.WatchedEvent;
+import org.apache.zookeeper.Watcher;
 import org.eclipse.jetty.alpn.server.ALPNServerConnectionFactory;
 import org.eclipse.jetty.http2.HTTP2Cipher;
 import org.eclipse.jetty.http2.server.HTTP2CServerConnectionFactory;
@@ -66,10 +85,13 @@ import org.eclipse.jetty.server.HttpConnectionFactory;
 import org.eclipse.jetty.server.SecureRequestCustomizer;
 import org.eclipse.jetty.server.Server;
 import org.eclipse.jetty.server.ServerConnector;
+import org.eclipse.jetty.server.SessionIdManager;
 import org.eclipse.jetty.server.SslConnectionFactory;
 import org.eclipse.jetty.server.handler.HandlerWrapper;
 import org.eclipse.jetty.server.handler.gzip.GzipHandler;
 import org.eclipse.jetty.server.session.DefaultSessionIdManager;
+import org.eclipse.jetty.server.session.HouseKeeper;
+import org.eclipse.jetty.server.session.SessionHandler;
 import org.eclipse.jetty.servlet.FilterHolder;
 import org.eclipse.jetty.servlet.ServletContextHandler;
 import org.eclipse.jetty.servlet.ServletHolder;
@@ -78,6 +100,7 @@ import org.eclipse.jetty.util.component.LifeCycle;
 import org.eclipse.jetty.util.ssl.SslContextFactory;
 import org.eclipse.jetty.util.thread.QueuedThreadPool;
 import org.eclipse.jetty.util.thread.ReservedThreadExecutor;
+import org.eclipse.jetty.util.thread.Scheduler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.MDC;
@@ -87,20 +110,20 @@ import org.slf4j.MDC;
  *
  * @since solr 1.3
  */
-public class JettySolrRunner {
+public class JettySolrRunner implements Closeable {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   private static final int THREAD_POOL_MAX_THREADS = 10000;
-  // NOTE: needs to be larger than SolrHttpClient.threadPoolSweeperMaxIdleTime
-  private static final int THREAD_POOL_MAX_IDLE_TIME_MS = 260000;
+  // NOTE: should be larger than HttpClientUtil.DEFAULT_SO_TIMEOUT or typical client SO timeout
+  private static final int THREAD_POOL_MAX_IDLE_TIME_MS = HttpClientUtil.DEFAULT_SO_TIMEOUT + 30000;
 
   Server server;
 
   volatile FilterHolder dispatchFilter;
   volatile FilterHolder debugFilter;
+  volatile FilterHolder qosFilter;
 
-  private boolean waitOnSolr = false;
   private int jettyPort = -1;
 
   private final JettyConfig config;
@@ -111,7 +134,7 @@ public class JettySolrRunner {
 
   private LinkedList<FilterHolder> extraFilters;
 
-  private static final String excludePatterns = "/partials/.+,/libs/.+,/css/.+,/js/.+,/img/.+,/templates/.+";
+  private static final String excludePatterns = "/partials/.+,/libs/.+,/css/.+,/js/.+,/img/.+,/templates/.+,/tpl/.+";
 
   private int proxyPort = -1;
 
@@ -123,20 +146,27 @@ public class JettySolrRunner {
 
   private String host;
 
+  private volatile boolean manageQtp;
+
   private volatile boolean started = false;
+  private volatile String nodeName;
+  private volatile boolean isClosed;
+
+
+  private static Scheduler scheduler = new SolrHttpClientScheduler("JettySolrRunnerScheduler", true, null, new ThreadGroup("JettySolrRunnerScheduler"), 1);
 
   public static class DebugFilter implements Filter {
     private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
     private AtomicLong nRequests = new AtomicLong();
 
-    List<Delay> delays = new ArrayList<>();
+    private Set<Delay> delays = ConcurrentHashMap.newKeySet(50);
 
     public long getTotalRequests() {
       return nRequests.get();
 
     }
-
+    
     /**
      * Introduce a delay of specified milliseconds for the specified request.
      *
@@ -147,7 +177,7 @@ public class JettySolrRunner {
     public void addDelay(String reason, int count, int delay) {
       delays.add(new Delay(reason, count, delay));
     }
-
+    
     /**
      * Remove any delay introduced before.
      */
@@ -183,6 +213,7 @@ public class JettySolrRunner {
         try {
           Thread.sleep(delayMs);
         } catch (InterruptedException e) {
+          SolrZkClient.checkInterrupted(e);
           throw new RuntimeException(e);
         }
         this.log.info("Waking up after the delay of {}ms...", delayMs);
@@ -241,12 +272,13 @@ public class JettySolrRunner {
    * @param enableProxy       enables proxy feature to disable connections
    */
   public JettySolrRunner(String solrHome, Properties nodeProperties, JettyConfig config, boolean enableProxy) {
+    ObjectReleaseTracker.track(this);
     this.enableProxy = enableProxy;
     this.solrHome = solrHome;
     this.config = config;
     this.nodeProperties = nodeProperties;
 
-    if (enableProxy) {
+    if (enableProxy || config.enableProxy) {
       try {
         proxy = new SocketProxy(0, config.sslConfig != null && config.sslConfig.isSSLMode());
       } catch (Exception e) {
@@ -260,15 +292,28 @@ public class JettySolrRunner {
 
   private void init(int port) {
 
-    QueuedThreadPool qtp = new QueuedThreadPool();
-    qtp.setMaxThreads(THREAD_POOL_MAX_THREADS);
-    qtp.setIdleTimeout(THREAD_POOL_MAX_IDLE_TIME_MS);
-    qtp.setReservedThreads(0);
+    QueuedThreadPool qtp;
+    if (config.qtp != null) {
+      qtp = config.qtp;
+    } else {
+      qtp = new SolrQueuedThreadPool("JettySolrRunner qtp", false);
+      qtp.setMaxThreads(Integer.getInteger("solr.maxContainerThreads", THREAD_POOL_MAX_THREADS));
+      qtp.setLowThreadsThreshold(Integer.getInteger("solr.lowContainerThreadsThreshold", -1)); // we don't use this or connections will get cut
+      qtp.setMinThreads(Integer.getInteger("solr.minContainerThreads", 2));
+      qtp.setIdleTimeout(Integer.getInteger("solr.containerThreadsIdle", THREAD_POOL_MAX_IDLE_TIME_MS));
+      qtp.setStopTimeout(1);
+      qtp.setReservedThreads(-1); // -1 auto sizes, important to keep
+    }
+
     server = new Server(qtp);
-    server.manage(qtp);
+
+
+    server.setStopTimeout(1); // will wait gracefully for stopTimeout / 2, then interrupts
+    assert config.stopAtShutdown;
     server.setStopAtShutdown(config.stopAtShutdown);
 
-    if (System.getProperty("jetty.testMode") != null) {
+    //if (System.getProperty("jetty.testMode") != null) {
+    if (true) {
       // if this property is true, then jetty will be configured to use SSL
       // leveraging the same system properties as java to specify
       // the keystore/truststore if they are set unless specific config
@@ -288,19 +333,22 @@ public class JettySolrRunner {
         HttpConnectionFactory http1ConnectionFactory = new HttpConnectionFactory(configuration);
 
         if (config.onlyHttp1 || !Constants.JRE_IS_MINIMUM_JAVA9) {
-          connector = new ServerConnector(server, new SslConnectionFactory(sslcontext,
+          connector = new ServerConnector(server, qtp, scheduler, null, 1, 2, new SslConnectionFactory(sslcontext,
               http1ConnectionFactory.getProtocol()),
               http1ConnectionFactory);
         } else {
           sslcontext.setCipherComparator(HTTP2Cipher.COMPARATOR);
 
-          connector = new ServerConnector(server);
+          connector = new ServerConnector(server, qtp, scheduler, null, 1, 2);
           SslConnectionFactory sslConnectionFactory = new SslConnectionFactory(sslcontext, "alpn");
           connector.addConnectionFactory(sslConnectionFactory);
           connector.setDefaultProtocol(sslConnectionFactory.getProtocol());
 
           HTTP2ServerConnectionFactory http2ConnectionFactory = new HTTP2ServerConnectionFactory(configuration);
 
+          http2ConnectionFactory.setMaxConcurrentStreams(1500);
+          http2ConnectionFactory.setInputBufferSize(16384);
+
           ALPNServerConnectionFactory alpn = new ALPNServerConnectionFactory(
               http2ConnectionFactory.getProtocol(),
               http1ConnectionFactory.getProtocol());
@@ -311,32 +359,34 @@ public class JettySolrRunner {
         }
       } else {
         if (config.onlyHttp1) {
-          connector = new ServerConnector(server, new HttpConnectionFactory(configuration));
+          connector = new ServerConnector(server, qtp, scheduler, null, 1, 2, new HttpConnectionFactory(configuration));
         } else {
-          connector = new ServerConnector(server, new HttpConnectionFactory(configuration),
+          connector = new ServerConnector(server, qtp, scheduler, null, 1, 2, new HttpConnectionFactory(configuration),
               new HTTP2CServerConnectionFactory(configuration));
         }
       }
 
       connector.setReuseAddress(true);
+      connector.setSoLingerTime(-1);
       connector.setPort(port);
       connector.setHost("127.0.0.1");
-      connector.setIdleTimeout(THREAD_POOL_MAX_IDLE_TIME_MS);
-      connector.setStopTimeout(0);
+      connector.setIdleTimeout(Integer.getInteger("solr.containerThreadsIdle", THREAD_POOL_MAX_IDLE_TIME_MS));
       server.setConnectors(new Connector[] {connector});
-      server.setSessionIdManager(new DefaultSessionIdManager(server, new Random()));
+      server.setSessionIdManager(new NoopSessionManager());
     } else {
       HttpConfiguration configuration = new HttpConfiguration();
       ServerConnector connector = new ServerConnector(server, new HttpConnectionFactory(configuration));
+      connector.setReuseAddress(true);
       connector.setPort(port);
-      connector.setIdleTimeout(THREAD_POOL_MAX_IDLE_TIME_MS);
+      connector.setSoLingerTime(-1);
+      connector.setIdleTimeout(Integer.getInteger("solr.containerThreadsIdle", THREAD_POOL_MAX_IDLE_TIME_MS));
       server.setConnectors(new Connector[] {connector});
     }
 
     HandlerWrapper chain;
     {
     // Initialize the servlets
-    final ServletContextHandler root = new ServletContextHandler(server, config.context, ServletContextHandler.SESSIONS);
+    final ServletContextHandler root = new ServletContextHandler(server, config.context, ServletContextHandler.NO_SESSIONS);
 
     server.addLifeCycleListener(new LifeCycle.Listener() {
 
@@ -366,7 +416,7 @@ public class JettySolrRunner {
 
         log.info("Jetty properties: {}", nodeProperties);
 
-        debugFilter = root.addFilter(DebugFilter.class, "/*", EnumSet.of(DispatcherType.REQUEST) );
+        debugFilter = root.addFilter(DebugFilter.class, "*", EnumSet.of(DispatcherType.REQUEST) );
         extraFilters = new LinkedList<>();
         for (Map.Entry<Class<? extends Filter>, String> entry : config.extraFilters.entrySet()) {
           extraFilters.add(root.addFilter(entry.getKey(), entry.getValue(), EnumSet.of(DispatcherType.REQUEST)));
@@ -378,13 +428,18 @@ public class JettySolrRunner {
         dispatchFilter = root.getServletHandler().newFilterHolder(Source.EMBEDDED);
         dispatchFilter.setHeldClass(SolrDispatchFilter.class);
         dispatchFilter.setInitParameter("excludePatterns", excludePatterns);
+
+        qosFilter = root.getServletHandler().newFilterHolder(Source.EMBEDDED);
+        qosFilter.setHeldClass(SolrQoSFilter.class);
+        root.addFilter(qosFilter, "*", EnumSet.of(DispatcherType.REQUEST, DispatcherType.ASYNC));
+
+        root.addServlet(Servlet404.class, "/*");
+
         // Map dispatchFilter in same path as in web.xml
-        root.addFilter(dispatchFilter, "/*", EnumSet.of(DispatcherType.REQUEST));
+        root.addFilter(dispatchFilter, "*", EnumSet.of(DispatcherType.REQUEST));
+
+        log.info("Jetty loaded and ready to go");
 
-        synchronized (JettySolrRunner.this) {
-          waitOnSolr = true;
-          JettySolrRunner.this.notify();
-        }
       }
 
       @Override
@@ -430,7 +485,7 @@ public class JettySolrRunner {
   /**
    * @return the {@link SolrDispatchFilter} for this node
    */
-  public SolrDispatchFilter getSolrDispatchFilter() { return (SolrDispatchFilter) dispatchFilter.getFilter(); }
+  public SolrDispatchFilter getSolrDispatchFilter() { return dispatchFilter == null ? null : (SolrDispatchFilter) dispatchFilter.getFilter(); }
 
   /**
    * @return the {@link CoreContainer} for this node
@@ -443,10 +498,7 @@ public class JettySolrRunner {
   }
 
   public String getNodeName() {
-    if (getCoreContainer() == null) {
-      return null;
-    }
-    return getCoreContainer().getZkController().getNodeName();
+    return nodeName;
   }
 
   public boolean isRunning() {
@@ -469,7 +521,7 @@ public class JettySolrRunner {
    * @throws Exception if an error occurs on startup
    */
   public void start() throws Exception {
-    start(true);
+    start(true, true);
   }
 
   /**
@@ -481,7 +533,7 @@ public class JettySolrRunner {
    *
    * @throws Exception if an error occurs on startup
    */
-  public void start(boolean reusePort) throws Exception {
+  public void start(boolean reusePort, boolean wait) throws Exception {
     // Do not let Jetty/Solr pollute the MDC for this thread
     Map<String, String> prevContext = MDC.getCopyOfContextMap();
     MDC.clear();
@@ -493,7 +545,6 @@ public class JettySolrRunner {
 
       // if started before, make a new server
       if (startedBefore) {
-        waitOnSolr = false;
         init(port);
       } else {
         startedBefore = true;
@@ -506,18 +557,19 @@ public class JettySolrRunner {
           server.start();
         }
       }
-      synchronized (JettySolrRunner.this) {
-        int cnt = 0;
-        while (!waitOnSolr || !dispatchFilter.isRunning() || getCoreContainer() == null) {
-          this.wait(100);
-          if (cnt++ == 15) {
-            throw new RuntimeException("Jetty/Solr unresponsive");
-          }
-        }
-      }
 
-      if (config.waitForLoadingCoresToFinishMs != null && config.waitForLoadingCoresToFinishMs > 0L) {
-        waitForLoadingCoresToFinish(config.waitForLoadingCoresToFinishMs);
+      if (getCoreContainer() != null) {
+        NodeConfig conf = getCoreContainer().getConfig();
+        CloudConfig cloudConf = conf.getCloudConfig();
+        if (cloudConf != null) {
+          String localHostContext = ZkController.trimLeadingAndTrailingSlashes(cloudConf.getSolrHostContext());
+
+          String zkServerAddress = cloudConf.getZkHost();
+          int localHostPort = cloudConf.getSolrHostPort();
+          String hostName = ZkController.normalizeHostName(cloudConf.getHost());
+          nodeName = ZkController.generateNodeName(hostName, Integer.toString(localHostPort), localHostContext);
+
+        }
       }
 
       setProtocolAndHost();
@@ -530,8 +582,63 @@ public class JettySolrRunner {
         }
       }
 
+      if (config.waitForLoadingCoresToFinishMs != null && config.waitForLoadingCoresToFinishMs > 0L) {
+        waitForLoadingCoresToFinish(config.waitForLoadingCoresToFinishMs);
+      }
+
+      if (getCoreContainer() != null && getCoreContainer().isZooKeeperAware()) {
+        SolrZkClient solrZkClient = getCoreContainer().getZkController().getZkStateReader().getZkClient();
+        if (solrZkClient.exists(ZkStateReader.COLLECTIONS_ZKNODE, null, true) == null) {
+          CountDownLatch latch = new CountDownLatch(1);
+          Watcher watcher = new Watcher() {
+
+            @Override
+            public void process(WatchedEvent event) {
+              if (Event.EventType.None.equals(event.getType())) {
+                return;
+              }
+              try {
+                if (event.getType() == Event.EventType.NodeChildrenChanged) {
+
+                  if (solrZkClient.exists(ZkStateReader.COLLECTIONS_ZKNODE, null, true) == null) {
+                    solrZkClient.getChildren("/", this, true);
+                    return;
+                  } else {
+                    latch.countDown();
+                  }
+                }
+                solrZkClient.getChildren("/", this, true);
+              } catch (KeeperException e) {
+                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+              } catch (InterruptedException e) {
+                ParWork.propegateInterrupt(e);
+                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+              }
+            }
+          };
+          List<String> rootNodes = solrZkClient.getChildren("/", watcher, true);
+          if (!rootNodes.contains(ZkStateReader.COLLECTIONS_ZKNODE)) {
+            boolean success = latch.await(30, TimeUnit.SECONDS);
+            if (!success) {
+              throw new TimeoutException();
+            }
+          } else {
+            solrZkClient.getSolrZooKeeper().removeWatches("/", watcher,  Watcher.WatcherType.Children, true);
+          }
+        }
+
+        if (wait) {
+          log.info("waitForNode: {}", getNodeName());
+
+          ZkStateReader reader = getCoreContainer().getZkController().getZkStateReader();
+
+          reader.waitForLiveNodes(30, TimeUnit.SECONDS, (o, n) -> n != null && getNodeName() != null && n.contains(getNodeName()));
+        }
+      }
+
     } finally {
       started  = true;
+
       if (prevContext != null)  {
         MDC.setContextMap(prevContext);
       } else {
@@ -561,8 +668,8 @@ public class JettySolrRunner {
     int tryCnt = 1;
     while (true) {
       try {
-        tryCnt++;
         log.info("Trying to start Jetty on port {} try number {} ...", port, tryCnt);
+        tryCnt++;
         server.start();
         break;
       } catch (IOException ioe) {
@@ -601,94 +708,66 @@ public class JettySolrRunner {
     return ioe;
   }
 
-  /**
-   * Stop the Jetty server
-   *
-   * @throws Exception if an error occurs on shutdown
-   */
-  public void stop() throws Exception {
+  @Override
+  public void close() throws IOException {
+    close(true);
+  }
+
+  public void close(boolean wait) throws IOException {
     // Do not let Jetty/Solr pollute the MDC for this thread
     Map<String,String> prevContext = MDC.getCopyOfContextMap();
     MDC.clear();
     try {
-      Filter filter = dispatchFilter.getFilter();
-
-      // we want to shutdown outside of jetty cutting us off
-      SolrDispatchFilter sdf = getSolrDispatchFilter();
-      ExecutorService customThreadPool = null;
-      if (sdf != null) {
-        customThreadPool = ExecutorUtil.newMDCAwareCachedThreadPool(new SolrNamedThreadFactory("jettyShutDown"));
-
-        sdf.closeOnDestroy(false);
-//        customThreadPool.submit(() -> {
-//          try {
-//            sdf.close();
-//          } catch (Throwable t) {
-//            log.error("Error shutting down Solr", t);
-//          }
-//        });
-        try {
-          sdf.close();
-        } catch (Throwable t) {
-          log.error("Error shutting down Solr", t);
-        }
-      }
-
-      QueuedThreadPool qtp = (QueuedThreadPool) server.getThreadPool();
-      ReservedThreadExecutor rte = qtp.getBean(ReservedThreadExecutor.class);
-
       server.stop();
 
-      if (server.getState().equals(Server.FAILED)) {
-        filter.destroy();
-        if (extraFilters != null) {
-          for (FilterHolder f : extraFilters) {
-            f.getFilter().destroy();
-          }
-        }
-      }
+      try {
 
-      // stop timeout is 0, so we will interrupt right away
-      while(!qtp.isStopped()) {
-        qtp.stop();
-        if (qtp.isStopped()) {
-          Thread.sleep(50);
-        }
+        server.join();
+      } catch (InterruptedException e) {
+        SolrZkClient.checkInterrupted(e);
+        throw new RuntimeException(e);
       }
 
-      // we tried to kill everything, now we wait for executor to stop
-      qtp.setStopTimeout(Integer.MAX_VALUE);
-      qtp.stop();
-      qtp.join();
+    } catch (Exception e) {
+      SolrZkClient.checkInterrupted(e);
+      log.error("", e);
+      throw new RuntimeException(e);
+    } finally {
 
-      if (rte != null) {
-        // we try and wait for the reserved thread executor, but it doesn't always seem to work
-        // so we actually set 0 reserved threads at creation
+      if (enableProxy) {
+        proxy.close();
+      }
+      if (wait && getCoreContainer() != null && getCoreContainer().isZooKeeperAware()) {
+        log.info("waitForJettyToStop: {}", getLocalPort());
+        String nodeName = getNodeName();
+        if (nodeName == null) {
+          log.info("Cannot wait for Jetty with null node name");
+          return;
+        }
 
-        rte.stop();
+        log.info("waitForNode: {}", getNodeName());
 
-        TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
-        timeout.waitFor("Timeout waiting for reserved executor to stop.", ()
-            -> rte.isStopped());
-      }
 
-      if (customThreadPool != null) {
-        ExecutorUtil.shutdownAndAwaitTermination(customThreadPool);
-      }
+        ZkStateReader reader = getCoreContainer().getZkController().getZkStateReader();
 
-      do {
         try {
-          server.join();
+          reader.waitForLiveNodes(10, TimeUnit.SECONDS, (o, n) -> !n.contains(nodeName));
         } catch (InterruptedException e) {
-          // ignore
+          Thread.currentThread().interrupt();
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "interrupted");
+        } catch (TimeoutException e) {
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
         }
-      } while (!server.isStopped());
-
-    } finally {
-      if (enableProxy) {
-        proxy.close();
       }
-
+//      if (server.getState().equals(Server.FAILED)) {
+//        if (filter != null) filter.destroy();
+//        if (extraFilters != null) {
+//          for (FilterHolder f : extraFilters) {
+//            f.getFilter().destroy();
+//          }
+//        }
+//      }
+      ObjectReleaseTracker.release(this);
       if (prevContext != null) {
         MDC.setContextMap(prevContext);
       } else {
@@ -698,6 +777,19 @@ public class JettySolrRunner {
   }
 
   /**
+   * Stop the Jetty server
+   *
+   * @throws Exception if an error occurs on shutdown
+   */
+  public void stop() throws Exception {
+    stop(true);
+  }
+
+  public void stop(boolean wait) throws Exception {
+    close(wait);
+  }
+
+  /**
    * Returns the Local Port of the jetty Server.
    *
    * @exception RuntimeException if there is no Connector
@@ -779,13 +871,15 @@ public class JettySolrRunner {
   }
 
   public SolrClient newClient() {
-    return new HttpSolrClient.Builder(getBaseUrl().toString()).build();
+    return new HttpSolrClient.Builder(getBaseUrl().toString()).
+            withHttpClient(getCoreContainer().getUpdateShardHandler().getDefaultHttpClient()).build();
   }
 
   public SolrClient newClient(int connectionTimeoutMillis, int socketTimeoutMillis) {
     return new HttpSolrClient.Builder(getBaseUrl().toString())
         .withConnectionTimeout(connectionTimeoutMillis)
         .withSocketTimeout(socketTimeoutMillis)
+        .withHttpClient(getCoreContainer().getUpdateShardHandler().getDefaultHttpClient())
         .build();
   }
 
@@ -858,4 +952,105 @@ public class JettySolrRunner {
   public SocketProxy getProxy() {
     return proxy;
   }
+
+  private final class NoopSessionManager implements SessionIdManager {
+    @Override
+    public void stop() throws Exception {
+    }
+
+    @Override
+    public void start() throws Exception {
+    }
+
+    @Override
+    public void removeLifeCycleListener(Listener listener) {
+    }
+
+    @Override
+    public boolean isStopping() {
+      return false;
+    }
+
+    @Override
+    public boolean isStopped() {
+      return false;
+    }
+
+    @Override
+    public boolean isStarting() {
+      return false;
+    }
+
+    @Override
+    public boolean isStarted() {
+      return false;
+    }
+
+    @Override
+    public boolean isRunning() {
+      return false;
+    }
+
+    @Override
+    public boolean isFailed() {
+      return false;
+    }
+
+    @Override
+    public void addLifeCycleListener(Listener listener) {
+    }
+
+    @Override
+    public void setSessionHouseKeeper(HouseKeeper houseKeeper) {
+    }
+
+    @Override
+    public String renewSessionId(String oldId, String oldExtendedId, HttpServletRequest request) {
+      return null;
+    }
+
+    @Override
+    public String newSessionId(HttpServletRequest request, long created) {
+      return null;
+    }
+
+    @Override
+    public boolean isIdInUse(String id) {
+      return false;
+    }
+
+    @Override
+    public void invalidateAll(String id) {
+    }
+
+    @Override
+    public String getWorkerName() {
+      return null;
+    }
+
+    @Override
+    public HouseKeeper getSessionHouseKeeper() {
+      return null;
+    }
+
+    @Override
+    public Set<SessionHandler> getSessionHandlers() {
+      return null;
+    }
+
+    @Override
+    public String getId(String qualifiedId) {
+      return null;
+    }
+
+    @Override
+    public String getExtendedId(String id, HttpServletRequest request) {
+      return null;
+    }
+
+    @Override
+    public void expireAll(String id) {
+    }
+  }
+
 }
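
Both ends of the lifecycle now synchronize on the ZooKeeper live_nodes
set via ZkStateReader.waitForLiveNodes(timeout, unit, predicate), where
the predicate sees the old and new live-node sets on every change:
start() blocks until this node's name appears, close() until it
disappears. The two calls from the diff, reduced to a sketch (reader
and nodeName assumed in scope):

    // startup: wait until this node registers itself as live
    reader.waitForLiveNodes(30, TimeUnit.SECONDS,
        (o, n) -> n != null && n.contains(nodeName));
    // shutdown: wait until the ephemeral live node is gone
    reader.waitForLiveNodes(10, TimeUnit.SECONDS,
        (o, n) -> !n.contains(nodeName));
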
diff --git a/solr/core/src/java/org/apache/solr/client/solrj/embedded/SolrQueuedThreadPool.java b/solr/core/src/java/org/apache/solr/client/solrj/embedded/SolrQueuedThreadPool.java
new file mode 100644
index 0000000..92be062
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/client/solrj/embedded/SolrQueuedThreadPool.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.client.solrj.embedded;
+
+import java.io.Closeable;
+import java.lang.invoke.MethodHandles;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ThreadFactory;
+
+import org.apache.solr.common.ParWork;
+import org.apache.solr.common.util.ObjectReleaseTracker;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
+import org.eclipse.jetty.util.annotation.Name;
+import org.eclipse.jetty.util.thread.QueuedThreadPool;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class SolrQueuedThreadPool extends QueuedThreadPool implements Closeable {
+    private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+    private final boolean killStop;
+    private final String name;
+    private volatile Error error;
+
+
+
+    public SolrQueuedThreadPool(String name, boolean killStop) {
+        super(10000, 15,
+              15000, -1,
+              null, null,
+              new SolrNamedThreadFactory(name));
+        this.killStop = killStop;
+        this.name = name;
+    }
+
+    protected void runJob(Runnable job) {
+        try {
+            job.run();
+        } catch (Error error) {
+            log.error("Error in Jetty thread pool thread", error);
+            this.error = error;
+        }
+    }
+
+
+//
+//    @Override
+//    public Thread newThread(Runnable runnable) {
+//        Thread thread = new Thread(tg, runnable);
+//        thread.setDaemon(isDaemon());
+//        thread.setPriority(getThreadsPriority());
+//        thread.setName(name + "-" + thread.getId());
+//        return thread;d
+//    }
+
+    public void close() {
+        //  while (!isStopped()) {
+            try {
+
+                setStopTimeout(1);
+                super.doStop();
+//                // this allows 15 seconds until we start interrupting
+//                Thread.sleep(250);
+
+                // now we wait up to 30 seconds gracefully, then interrupt again before waiting for the rest of the timeout
+
+            } catch (InterruptedException e) {
+                ParWork.propegateInterrupt(e);
+                throw new RuntimeException(e);
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+      //  }
+
+        if (error != null) {
+            throw error;
+        }
+        assert ObjectReleaseTracker.release(this);
+    }
+
+    @Override
+    protected void doStop() throws Exception {
+        if (!killStop) {
+            super.doStop();
+        }
+    }
+
+    @Override
+    public void join() throws InterruptedException
+    {
+        if (!killStop) {
+            super.join();
+        }
+    }
+}
\ No newline at end of file
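
When no pool is injected, JettySolrRunner.init() sizes its own
SolrQueuedThreadPool from the solr.maxContainerThreads,
solr.minContainerThreads and solr.containerThreadsIdle system
properties (via Integer.getInteger, which falls back to the given
default when the property is unset). Sharing one pool across runners
is the other mode; a sketch, assuming a test that owns the pool's
lifecycle:

    // killStop=true makes doStop()/join() no-ops so that close() is the
    // single, explicit shutdown point for the shared pool.
    SolrQueuedThreadPool qtp = new SolrQueuedThreadPool("cluster-qtp", true);
    JettyConfig config = JettyConfig.builder().withExecutor(qtp).build();
    // ... start and stop JettySolrRunner instances against config ...
    qtp.close(); // stop the shared pool once, at the very end
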
diff --git a/solr/core/src/java/org/apache/solr/cloud/CloudConfigSetService.java b/solr/core/src/java/org/apache/solr/cloud/CloudConfigSetService.java
index e98c33c..e6e5ab0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/CloudConfigSetService.java
+++ b/solr/core/src/java/org/apache/solr/cloud/CloudConfigSetService.java
@@ -56,7 +56,8 @@ public class CloudConfigSetService extends ConfigSetService {
       if (!zkController.getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + colName, true)) {
         // TODO remove this functionality or maybe move to a CLI mechanism
         log.warn("Auto-creating collection (in ZK) from core descriptor (on disk).  This feature may go away!");
-        CreateCollectionCmd.createCollectionZkNode(zkController.getSolrCloudManager().getDistribStateManager(), colName, cd.getCloudDescriptor().getParams());
+        // nocommit
+        CreateCollectionCmd.createCollectionZkNode(zkController.getSolrCloudManager().getDistribStateManager(), colName, cd.getCloudDescriptor().getParams(), null);
       }
     } catch (InterruptedException e) {
       Thread.currentThread().interrupt();
diff --git a/solr/core/src/java/org/apache/solr/cloud/DistributedMap.java b/solr/core/src/java/org/apache/solr/cloud/DistributedMap.java
index c9f12e9..7fbf001 100644
--- a/solr/core/src/java/org/apache/solr/cloud/DistributedMap.java
+++ b/solr/core/src/java/org/apache/solr/cloud/DistributedMap.java
@@ -42,17 +42,6 @@ public class DistributedMap {
 
   public DistributedMap(SolrZkClient zookeeper, String dir) {
     this.dir = dir;
-
-    ZkCmdExecutor cmdExecutor = new ZkCmdExecutor(zookeeper.getZkClientTimeout());
-    try {
-      cmdExecutor.ensureExists(dir, zookeeper);
-    } catch (KeeperException e) {
-      throw new SolrException(ErrorCode.SERVER_ERROR, e);
-    } catch (InterruptedException e) {
-      Thread.currentThread().interrupt();
-      throw new SolrException(ErrorCode.SERVER_ERROR, e);
-    }
-
     this.zookeeper = zookeeper;
   }
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
index 1398570..4eafa9f 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
@@ -19,59 +19,53 @@ package org.apache.solr.cloud;
 import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
-import org.apache.solr.common.cloud.SolrZkClient;
+
 import org.apache.solr.common.cloud.ZkNodeProps;
+import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.zookeeper.KeeperException;
-import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public abstract class ElectionContext implements Closeable {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-  final String electionPath;
-  final ZkNodeProps leaderProps;
-  final String id;
-  final String leaderPath;
-  volatile String leaderSeqPath;
-  private SolrZkClient zkClient;
+  protected final String electionPath;
+  protected final ZkNodeProps leaderProps;
+  protected final String id;
+  protected final String leaderPath;
+  protected volatile String leaderSeqPath;
+  private volatile boolean closed;
 
-  public ElectionContext(final String coreNodeName,
-      final String electionPath, final String leaderPath, final ZkNodeProps leaderProps, final SolrZkClient zkClient) {
-    assert zkClient != null;
-    this.id = coreNodeName;
+  public ElectionContext(final String id, final String electionPath, final String leaderPath, final ZkNodeProps leaderProps) {
+    this.id = id;
     this.electionPath = electionPath;
     this.leaderPath = leaderPath;
     this.leaderProps = leaderProps;
-    this.zkClient = zkClient;
+
+    ObjectReleaseTracker.track(this);
   }
-  
-  public void close() {
 
+  public void close() {
+    this.closed = true;
+    ObjectReleaseTracker.release(this);
   }
-  
+
   public void cancelElection() throws InterruptedException, KeeperException {
-    if (leaderSeqPath != null) {
-      try {
-        log.debug("Canceling election {}", leaderSeqPath);
-        zkClient.delete(leaderSeqPath, -1, true);
-      } catch (NoNodeException e) {
-        // fine
-        log.debug("cancelElection did not find election node to remove {}", leaderSeqPath);
-      }
-    } else {
-      log.debug("cancelElection skipped as this context has not been initialized");
-    }
   }
 
-  abstract void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStartMs) throws KeeperException, InterruptedException, IOException;
+  abstract void runLeaderProcess(ElectionContext context, boolean weAreReplacement, int pauseBeforeStartMs) throws KeeperException, InterruptedException, IOException;
 
   public void checkIfIamLeaderFired() {}
 
   public void joinedElectionFired() {}
 
-  public  ElectionContext copy(){
+  public ElectionContext copy(){
     throw new UnsupportedOperationException("copy");
   }
+
+  public boolean isClosed() {
+    return closed;
+  }
 }
 
 
+
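
ElectionContext now participates in the same ObjectReleaseTracker
track/release pairing used for JettySolrRunner above: track(this) in
the constructor, release(this) in close(), so a test run can flag any
context that was created but never closed. The idiom, reduced to a
hypothetical minimal class:

    // track on creation, release on close; the tracker reports
    // leaked (never-closed) instances when tests check it at shutdown.
    class Tracked implements Closeable {
      Tracked() {
        ObjectReleaseTracker.track(this);
      }
      @Override
      public void close() {
        ObjectReleaseTracker.release(this);
      }
    }
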
diff --git a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
index f50aa11..ce507aa 100644
--- a/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
+++ b/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
@@ -22,14 +22,16 @@ import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.solr.cloud.ZkController.ContextKey;
-import org.apache.solr.common.AlreadyClosedException;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkCmdExecutor;
+import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.cloud.ZooKeeperException;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
@@ -45,43 +47,41 @@ import org.slf4j.LoggerFactory;
  * leader is chosen. First call {@link #setup(ElectionContext)} to ensure
  * the election process is init'd. Next call
  * {@link #joinElection(ElectionContext, boolean)} to start the leader election.
- * 
+ *
  * The implementation follows the classic ZooKeeper recipe of creating an
  * ephemeral, sequential node for each candidate and then looking at the set
  * of such nodes - if the created node is the lowest sequential node, the
  * candidate that created the node is the leader. If not, the candidate puts
- * a watch on the next lowest node it finds, and if that node goes down, 
+ * a watch on the next lowest node it finds, and if that node goes down,
  * starts the whole process over by checking if it's the lowest sequential node, etc.
- * 
+ *
  */
 public  class LeaderElector {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-  
-  static final String ELECTION_NODE = "/election";
-  
+
+  public static final String ELECTION_NODE = "/election";
+
   public final static Pattern LEADER_SEQ = Pattern.compile(".*?/?.*?-n_(\\d+)");
   private final static Pattern SESSION_ID = Pattern.compile(".*?/?(.*?-.*?)-n_\\d+");
-  private final static Pattern  NODE_NAME = Pattern.compile(".*?/?(.*?-)(.*?)-n_\\d+");
 
-  protected SolrZkClient zkClient;
-  
-  private ZkCmdExecutor zkCmdExecutor;
+  protected final SolrZkClient zkClient;
 
   private volatile ElectionContext context;
 
-  private ElectionWatcher watcher;
+  private volatile ElectionWatcher watcher;
 
-  private Map<ContextKey,ElectionContext> electionContexts;
-  private ContextKey contextKey;
+  private final Map<ContextKey,ElectionContext> electionContexts;
+  private final ContextKey contextKey;
+
+//  public LeaderElector(SolrZkClient zkClient) {
+//    this.zkClient = zkClient;
+//    this.contextKey = null;
+//    this.electionContexts = new ConcurrentHashMap<>(132, 0.75f, 50);
+//  }
 
-  public LeaderElector(SolrZkClient zkClient) {
-    this.zkClient = zkClient;
-    zkCmdExecutor = new ZkCmdExecutor(zkClient.getZkClientTimeout());
-  }
-  
   public LeaderElector(SolrZkClient zkClient, ContextKey key, Map<ContextKey,ElectionContext> electionContexts) {
+
     this.zkClient = zkClient;
-    zkCmdExecutor = new ZkCmdExecutor(zkClient.getZkClientTimeout());
     this.electionContexts = electionContexts;
     this.contextKey = key;
   }
@@ -99,7 +99,7 @@ public  class LeaderElector {
    * @param replacement has someone else been the leader already?
    */
   private void checkIfIamLeader(final ElectionContext context, boolean replacement) throws KeeperException,
-      InterruptedException, IOException {
+          InterruptedException, IOException {
     context.checkIfIamLeaderFired();
     // get all other numbers...
     final String holdElectionPath = context.electionPath + ELECTION_NODE;
@@ -112,29 +112,14 @@ public  class LeaderElector {
       return;
     }
 
-    // If any double-registrations exist for me, remove all but this latest one!
-    // TODO: can we even get into this state?
-    String prefix = zkClient.getSolrZooKeeper().getSessionId() + "-" + context.id + "-";
-    Iterator<String> it = seqs.iterator();
-    while (it.hasNext()) {
-      String elec = it.next();
-      if (!elec.equals(leaderSeqNodeName) && elec.startsWith(prefix)) {
-        try {
-          String toDelete = holdElectionPath + "/" + elec;
-          log.warn("Deleting duplicate registration: {}", toDelete);
-          zkClient.delete(toDelete, -1, true);
-        } catch (KeeperException.NoNodeException e) {
-          // ignore
-        }
-        it.remove();
-      }
-    }
 
     if (leaderSeqNodeName.equals(seqs.get(0))) {
       // I am the leader
       try {
-        if (zkClient.isClosed()) return; // but our zkClient is already closed
-        runIamLeaderProcess(context, replacement);
+        if (!context.isClosed()) {
+          runIamLeaderProcess(context, replacement);
+        }
+
       } catch (KeeperException.NodeExistsException e) {
         log.error("node exists",e);
         retryElection(context, false);
@@ -151,9 +136,15 @@ public  class LeaderElector {
       }
       try {
         String watchedNode = holdElectionPath + "/" + toWatch;
-        zkClient.getData(watchedNode, watcher = new ElectionWatcher(context.leaderSeqPath, watchedNode, getSeq(context.leaderSeqPath), context), null, true);
-        log.debug("Watching path {} to know if I could be the leader", watchedNode);
+
+        ElectionWatcher oldWatcher = watcher;
+        if (oldWatcher != null) oldWatcher.cancel();
+        zkClient.getData(watchedNode,
+                watcher = new ElectionWatcher(context.leaderSeqPath, watchedNode, getSeq(context.leaderSeqPath), context),
+                null, true);
+        if (log.isDebugEnabled()) log.debug("Watching path {} to know if I could be the leader", watchedNode);
       } catch (KeeperException.SessionExpiredException e) {
+        log.error("ZooKeeper session has expired");
         throw e;
       } catch (KeeperException.NoNodeException e) {
         // the previous node disappeared, check if we are the leader again
@@ -168,13 +159,13 @@ public  class LeaderElector {
 
   // TODO: get this core param out of here
   protected void runIamLeaderProcess(final ElectionContext context, boolean weAreReplacement) throws KeeperException,
-      InterruptedException, IOException {
-    context.runLeaderProcess(weAreReplacement,0);
+          InterruptedException, IOException {
+    context.runLeaderProcess(context, weAreReplacement,0);
   }
-  
+
   /**
    * Returns int given String of form n_0000000001 or n_0000000003, etc.
-   * 
+   *
    * @return sequence number
    */
   public static int getSeq(String nStringSequence) {
@@ -184,11 +175,11 @@ public  class LeaderElector {
       seq = Integer.parseInt(m.group(1));
     } else {
       throw new IllegalStateException("Could not find regex match in:"
-          + nStringSequence);
+              + nStringSequence);
     }
     return seq;
   }
-  
+
   private String getNodeId(String nStringSequence) {
     String id;
     Matcher m = SESSION_ID.matcher(nStringSequence);
@@ -196,42 +187,35 @@ public  class LeaderElector {
       id = m.group(1);
     } else {
       throw new IllegalStateException("Could not find regex match in:"
-          + nStringSequence);
+              + nStringSequence);
     }
     return id;
   }
 
   public static String getNodeName(String nStringSequence){
-    String result;
-    Matcher m = NODE_NAME.matcher(nStringSequence);
-    if (m.matches()) {
-      result = m.group(2);
-    } else {
-      throw new IllegalStateException("Could not find regex match in:"
-          + nStringSequence);
-    }
-    return result;
+
+    return nStringSequence;
 
   }
-  
+
   public int joinElection(ElectionContext context, boolean replacement) throws KeeperException, InterruptedException, IOException {
     return joinElection(context,replacement, false);
   }
 
-    /**
-     * Begin participating in the election process. Gets a new sequential number
-     * and begins watching the node with the sequence number before it, unless it
-     * is the lowest number, in which case, initiates the leader process. If the
-     * node that is watched goes down, check if we are the new lowest node, else
-     * watch the next lowest numbered node.
-     *
-     * @return sequential node number
-     */
+  /**
+   * Begin participating in the election process. Gets a new sequential number
+   * and begins watching the node with the sequence number before it, unless it
+   * is the lowest number, in which case, initiates the leader process. If the
+   * node that is watched goes down, check if we are the new lowest node, else
+   * watch the next lowest numbered node.
+   *
+   * @return sequential node number
+   */
   public int joinElection(ElectionContext context, boolean replacement,boolean joinAtHead) throws KeeperException, InterruptedException, IOException {
     context.joinedElectionFired();
-    
+
     final String shardsElectZkPath = context.electionPath + LeaderElector.ELECTION_NODE;
-    
+
     long sessionId = zkClient.getSolrZooKeeper().getSessionId();
     String id = sessionId + "-" + context.id;
     String leaderSeqPath = null;
@@ -244,21 +228,21 @@ public  class LeaderElector {
           List<String> nodes = OverseerTaskProcessor.getSortedElectionNodes(zkClient, shardsElectZkPath);
          if (nodes.size() < 2) {
             leaderSeqPath = zkClient.create(shardsElectZkPath + "/" + id + "-n_", null,
-                CreateMode.EPHEMERAL_SEQUENTIAL, false);
+                    CreateMode.EPHEMERAL_SEQUENTIAL, true);
           } else {
             String firstInLine = nodes.get(1);
             log.debug("The current head: {}", firstInLine);
             Matcher m = LEADER_SEQ.matcher(firstInLine);
             if (!m.matches()) {
               throw new IllegalStateException("Could not find regex match in:"
-                  + firstInLine);
+                      + firstInLine);
             }
             leaderSeqPath = shardsElectZkPath + "/" + id + "-n_"+ m.group(1);
             zkClient.create(leaderSeqPath, null, CreateMode.EPHEMERAL, false);
           }
         } else {
           leaderSeqPath = zkClient.create(shardsElectZkPath + "/" + id + "-n_", null,
-              CreateMode.EPHEMERAL_SEQUENTIAL, false);
+                  CreateMode.EPHEMERAL_SEQUENTIAL, true);
         }
 
         log.debug("Joined leadership election with path: {}", leaderSeqPath);
@@ -267,7 +251,7 @@ public  class LeaderElector {
       } catch (ConnectionLossException e) {
         // we don't know if we made our node or not...
         List<String> entries = zkClient.getChildren(shardsElectZkPath, null, true);
-        
+
         boolean foundId = false;
         for (String entry : entries) {
           String nodeId = getNodeId(entry);
@@ -281,12 +265,7 @@ public  class LeaderElector {
           cont = true;
           if (tries++ > 20) {
             throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
-                "", e);
-          }
-          try {
-            Thread.sleep(50);
-          } catch (InterruptedException e2) {
-            Thread.currentThread().interrupt();
+                    "", e);
           }
         }
 
@@ -296,14 +275,9 @@ public  class LeaderElector {
         if (tries++ > 20) {
           context = null;
           throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR,
-              "", e);
+                  "", e);
         }
         cont = true;
-        try {
-          Thread.sleep(50);
-        } catch (InterruptedException e2) {
-          Thread.currentThread().interrupt();
-        }
       }
     }
     checkIfIamLeader(context, replacement);
@@ -339,21 +313,20 @@ public  class LeaderElector {
         try {
           zkClient.delete(myNode, -1, true);
         } catch (KeeperException.NoNodeException nne) {
+          log.info("No znode found to delete at {}", myNode);
          // expected; don't do anything
         } catch (Exception e) {
-          log.warn("My watched node still exists and can't remove {}", myNode, e);
+          ParWork.propegateInterrupt(e);
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Exception canceling election", e);
         }
         return;
       }
       try {
         // am I the next leader?
         checkIfIamLeader(context, true);
-      } catch (AlreadyClosedException e) {
-
       } catch (Exception e) {
-        if (!zkClient.isClosed()) {
-          log.warn("", e);
-        }
+        ParWork.propegateInterrupt(e);
+        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Exception canceling election", e);
       }
     }
   }
@@ -362,18 +335,27 @@ public  class LeaderElector {
    * Set up any ZooKeeper nodes needed for leader election.
    */
   public void setup(final ElectionContext context) throws InterruptedException,
-      KeeperException {
+          KeeperException {
+    // nocommit - already created
     String electZKPath = context.electionPath + LeaderElector.ELECTION_NODE;
+
     if (context instanceof OverseerElectionContext) {
-      zkCmdExecutor.ensureExists(electZKPath, zkClient);
+      //zkCmdExecutor.ensureExists(electZKPath, zkClient);
     } else {
       // we use 2 param so that replica won't create /collection/{collection} if it doesn't exist
+      ShardLeaderElectionContext slec = (ShardLeaderElectionContext) context;
+
+      ZkCmdExecutor zkCmdExecutor = new ZkCmdExecutor(zkClient.getZkClientTimeout());
       zkCmdExecutor.ensureExists(electZKPath, (byte[])null, CreateMode.PERSISTENT, zkClient, 2);
+      System.out.println("CreateNODE:" + ZkStateReader.getShardLeadersPath(slec.collection, slec.shardId));
+      zkCmdExecutor.ensureExists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + slec.collection + "/"
+              + ZkStateReader.SHARD_LEADERS_ZKNODE + (slec.shardId != null ? ("/" + slec.shardId)
+              : ""), (byte[])null, CreateMode.PERSISTENT, zkClient, 2);
     }
 
     this.context = context;
   }
-  
+
   /**
    * Sort n string sequence list.
    */
@@ -388,10 +370,12 @@ public  class LeaderElector {
     ElectionWatcher watcher = this.watcher;
     ElectionContext ctx = context.copy();
     if (electionContexts != null) {
-      electionContexts.put(contextKey, ctx);
+      ElectionContext prevContext = electionContexts.put(contextKey, ctx);
+      if (prevContext != null) {
+        prevContext.close();
+      }
     }
     if (watcher != null) watcher.cancel();
-    this.context.cancelElection();
     this.context.close();
     this.context = ctx;
     joinElection(ctx, true, joinAtHead);
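
For reference, the class javadoc above describes the classic ZooKeeper
election recipe. A minimal self-contained sketch against the raw
ZooKeeper client is below; the election path layout and node naming are
simplified assumptions, not Solr's exact format:

    import java.util.Collections;
    import java.util.List;
    import org.apache.zookeeper.CreateMode;
    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.Watcher;
    import org.apache.zookeeper.ZooDefs;
    import org.apache.zookeeper.ZooKeeper;

    // Sketch: create an ephemeral sequential node, lead if it has the lowest
    // sequence, otherwise watch the next-lowest node and re-check when that
    // node goes away. Session-loss and retry handling omitted.
    class ElectionRecipeSketch {
      static boolean tryLead(ZooKeeper zk, String electionPath, Watcher onPredecessorGone)
          throws KeeperException, InterruptedException {
        String created = zk.create(electionPath + "/n_", new byte[0],
            ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL);
        String myNode = created.substring(created.lastIndexOf('/') + 1);
        List<String> nodes = zk.getChildren(electionPath, false);
        Collections.sort(nodes); // zero-padded sequence suffixes sort correctly
        int idx = nodes.indexOf(myNode);
        if (idx == 0) {
          return true; // lowest sequence number: this candidate leads
        }
        String predecessor = nodes.get(idx - 1);
        zk.exists(electionPath + "/" + predecessor, onPredecessorGone);
        return false; // watcher fires when the predecessor goes away; retry then
      }
    }
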
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index dd01368..526e301 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -29,11 +29,20 @@ import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Executor;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.SynchronousQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.BiConsumer;
 
+import net.sf.saxon.trans.Err;
 import org.apache.lucene.util.Version;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
+import org.apache.solr.client.solrj.cloud.autoscaling.AlreadyExistsException;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.impl.ClusterStateProvider;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
@@ -50,6 +59,7 @@ import org.apache.solr.cloud.overseer.SliceMutator;
 import org.apache.solr.cloud.overseer.ZkStateWriter;
 import org.apache.solr.cloud.overseer.ZkWriteCommand;
 import org.apache.solr.common.AlreadyClosedException;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrCloseable;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterState;
@@ -61,10 +71,12 @@ import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CollectionAdminParams;
 import org.apache.solr.common.params.CollectionParams;
+import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.Pair;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.core.CloudConfig;
 import org.apache.solr.core.CoreContainer;
@@ -74,6 +86,8 @@ import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.update.UpdateShardHandler;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.WatchedEvent;
+import org.apache.zookeeper.Watcher;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -139,17 +153,31 @@ import com.codahale.metrics.Timer;
 public class Overseer implements SolrCloseable {
   public static final String QUEUE_OPERATION = "operation";
 
+  public static final String OVERSEER_COLLECTION_QUEUE_WORK = "/overseer/collection-queue-work";
+
+  public static final String OVERSEER_QUEUE = "/overseer/queue";
+
+  public static final String OVERSEER_ASYNC_IDS = "/overseer/async_ids";
+
+  public static final String OVERSEER_COLLECTION_MAP_FAILURE = "/overseer/collection-map-failure";
+
+  public static final String OVERSEER_COLLECTION_MAP_COMPLETED = "/overseer/collection-map-completed";
+
+  public static final String OVERSEER_COLLECTION_MAP_RUNNING = "/overseer/collection-map-running";
+
+  public static final String OVERSEER_QUEUE_WORK = "/overseer/queue-work";
+
   // System properties are used in tests to make them run fast
   public static final int STATE_UPDATE_DELAY = ZkStateReader.STATE_UPDATE_DELAY;
   public static final int STATE_UPDATE_BATCH_SIZE = Integer.getInteger("solr.OverseerStateUpdateBatchSize", 10000);
   public static final int STATE_UPDATE_MAX_QUEUE = 20000;
 
   public static final int NUM_RESPONSES_TO_STORE = 10000;
-  public static final String OVERSEER_ELECT = "/overseer_elect";
+  public static final String OVERSEER_ELECT = "/overseer/overseer_elect";
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private volatile ElectionContext context;
 
-  enum LeaderStatus {DONT_KNOW, NO, YES}
 
   /**
    * <p>This class is responsible for dequeueing state change requests from the ZooKeeper queue at <code>/overseer/queue</code>
@@ -168,117 +196,123 @@ public class Overseer implements SolrCloseable {
     //Internal queue where overseer stores events that have not yet been published into cloudstate
     //If Overseer dies while extracting the main queue a new overseer will start from this queue
     private final ZkDistributedQueue workQueue;
-    // Internal map which holds the information about running tasks.
-    private final DistributedMap runningMap;
-    // Internal map which holds the information about successfully completed tasks.
-    private final DistributedMap completedMap;
-    // Internal map which holds the information about failed tasks.
-    private final DistributedMap failureMap;
-
-    private final Stats zkStats;
 
-    private boolean isClosed = false;
+    private volatile boolean isClosed = false;
 
     public ClusterStateUpdater(final ZkStateReader reader, final String myId, Stats zkStats) {
       this.zkClient = reader.getZkClient();
-      this.zkStats = zkStats;
       this.stateUpdateQueue = getStateUpdateQueue(zkStats);
       this.workQueue = getInternalWorkQueue(zkClient, zkStats);
-      this.failureMap = getFailureMap(zkClient);
-      this.runningMap = getRunningMap(zkClient);
-      this.completedMap = getCompletedMap(zkClient);
       this.myId = myId;
       this.reader = reader;
     }
 
-    public Stats getStateUpdateQueueStats() {
-      return stateUpdateQueue.getZkStats();
-    }
-
-    public Stats getWorkQueueStats()  {
-      return workQueue.getZkStats();
-    }
-
     @Override
     public void run() {
+      if (log.isDebugEnabled()) {
+        log.debug("Overseer run() - start");
+      }
+
       MDCLoggingContext.setNode(zkController.getNodeName() );
+      try {
 
-      LeaderStatus isLeader = amILeader();
-      while (isLeader == LeaderStatus.DONT_KNOW) {
-        log.debug("am_i_leader unclear {}", isLeader);
-        isLeader = amILeader();  // not a no, not a yes, try ask again
-      }
+      try {
+        if (log.isDebugEnabled()) {
+          log.debug("set watch on leader znode");
+        }
+        zkClient.exists(Overseer.OVERSEER_ELECT + "/leader", new Watcher() {
 
-      if (log.isInfoEnabled()) {
-        log.info("Starting to work on the main queue : {}", LeaderElector.getNodeName(myId));
+          @Override
+          public void process(WatchedEvent event) {
+            if (Event.EventType.None.equals(event.getType())) {
+              return;
+            }
+            log.info("Overseer leader has changed, closing ...");
+            Overseer.this.close();
+          }} , true);
+      } catch (KeeperException.SessionExpiredException e) {
+        log.warn("ZooKeeper session expired");
+        return;
+      } catch (InterruptedException | AlreadyClosedException e) {
+        ParWork.propegateInterrupt(e);
+        return;
+      } catch (Exception e) {
+       log.error("Error", e);
       }
-      try {
+
+      log.info("Starting to work on the main queue : {}", LeaderElector.getNodeName(myId));
+
         ZkStateWriter zkStateWriter = null;
-        ClusterState clusterState = null;
-        boolean refreshClusterState = true; // let's refresh in the first iteration
+        ClusterState clusterState = reader.getClusterState();
+        assert clusterState != null;
+
        // we write updates in batches, but if an exception is thrown while writing the new cluster state,
        // we cannot be sure which message is the bad one, so we re-process the nodes one by one
         int fallbackQueueSize = Integer.MAX_VALUE;
         ZkDistributedQueue fallbackQueue = workQueue;
         while (!this.isClosed) {
-          isLeader = amILeader();
-          if (LeaderStatus.NO == isLeader) {
-            break;
-          }
-          else if (LeaderStatus.YES != isLeader) {
-            log.debug("am_i_leader unclear {}", isLeader);
-            continue; // not a no, not a yes, try ask again
-          }
-
-          //TODO consider removing 'refreshClusterState' and simply check if clusterState is null
-          if (refreshClusterState) {
+          if (zkStateWriter == null) {
             try {
-              reader.forciblyRefreshAllClusterStateSlow();
-              clusterState = reader.getClusterState();
               zkStateWriter = new ZkStateWriter(reader, stats);
-              refreshClusterState = false;
-
+            //  clusterState = reader.getClusterState();
               // if there were any errors while processing
               // the state queue, items would have been left in the
               // work queue so let's process those first
               byte[] data = fallbackQueue.peek();
-              while (fallbackQueueSize > 0 && data != null)  {
+              while (fallbackQueueSize > 0 && data != null) {
                 final ZkNodeProps message = ZkNodeProps.load(data);
-                if (log.isDebugEnabled()) {
-                  log.debug("processMessage: fallbackQueueSize: {}, message = {}", fallbackQueue.getZkStats().getQueueLength(), message);
-                }
+                log.debug("processMessage: fallbackQueueSize: {}, message = {}", fallbackQueue.getZkStats().getQueueLength(), message);
                 // force flush to ZK after each message because there is no fallback if workQueue items
                 // are removed from workQueue but fail to be written to ZK
                 try {
-                  clusterState = processQueueItem(message, clusterState, zkStateWriter, false, null);
+                  clusterState = processQueueItem(message, reader.getClusterState(), zkStateWriter, false, null);
+                  assert clusterState != null;
+                } catch (InterruptedException | AlreadyClosedException e) {
+                  ParWork.propegateInterrupt(e);
+                  return;
+                } catch (KeeperException.SessionExpiredException e) {
+                  log.error("run()", e);
+
+                  log.warn("Solr cannot talk to ZK, exiting Overseer work queue loop", e);
+                  return;
                 } catch (Exception e) {
-                  if (isBadMessage(e)) {
-                    log.warn("Exception when process message = {}, consider as bad message and poll out from the queue", message);
-                    fallbackQueue.poll();
+                  SolrException exp = new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+                  try {
+                    if (isBadMessage(e)) {
+                      log.warn(
+                              "Exception when process message = {}, consider as bad message and poll out from the queue",
+                              message);
+                      fallbackQueue.poll();
+                    }
+                  } catch (Exception e1) {
+                    ParWork.propegateInterrupt(e1);
+                    exp.addSuppressed(e1);
                   }
-                  throw e;
+
+                  throw exp;
                 }
                 fallbackQueue.poll(); // poll-ing removes the element we got by peek-ing
                 data = fallbackQueue.peek();
                 fallbackQueueSize--;
               }
               // force flush at the end of the loop, if there are no pending updates, this is a no op call
-              clusterState = zkStateWriter.writePendingUpdates();
+              //clusterState = zkStateWriter.writePendingUpdates(clusterState);
+              assert clusterState != null;
               // the workQueue is empty now, use stateUpdateQueue as fallback queue
               fallbackQueue = stateUpdateQueue;
               fallbackQueueSize = 0;
-            } catch (AlreadyClosedException e) {
-              return;
             } catch (KeeperException.SessionExpiredException e) {
+              log.error("run()", e);
+
               log.warn("Solr cannot talk to ZK, exiting Overseer work queue loop", e);
               return;
-            } catch (InterruptedException e) {
-              Thread.currentThread().interrupt();
+            } catch (InterruptedException | AlreadyClosedException e) {
+              ParWork.propegateInterrupt(e);
               return;
             } catch (Exception e) {
               log.error("Exception in Overseer when process message from work queue, retrying", e);
-              refreshClusterState = true;
-              continue;
+
+              throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
             }
           }
 
@@ -286,16 +320,17 @@ public class Overseer implements SolrCloseable {
           try {
             // We do not need to filter any nodes here cause all processed nodes are removed once we flush clusterstate
             queue = new LinkedList<>(stateUpdateQueue.peekElements(1000, 3000L, (x) -> true));
-          } catch (KeeperException.SessionExpiredException e) {
-            log.warn("Solr cannot talk to ZK, exiting Overseer main queue loop", e);
-            return;
-          } catch (InterruptedException e) {
+          } catch (InterruptedException | AlreadyClosedException e) {
             Thread.currentThread().interrupt();
             return;
-          } catch (AlreadyClosedException e) {
+          } catch (KeeperException.SessionExpiredException e) {
+            log.error("run()", e);
 
+            log.warn("Solr cannot talk to ZK, exiting Overseer work queue loop", e);
+            return;
           } catch (Exception e) {
-            log.error("Exception in Overseer main queue loop", e);
+            ParWork.propegateInterrupt(e);
+            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
           }
           try {
             Set<String> processedNodes = new HashSet<>();
@@ -303,9 +338,7 @@ public class Overseer implements SolrCloseable {
               for (Pair<String, byte[]> head : queue) {
                 byte[] data = head.second();
                 final ZkNodeProps message = ZkNodeProps.load(data);
-                if (log.isDebugEnabled()) {
-                  log.debug("processMessage: queueSize: {}, message = {} current state version: {}", stateUpdateQueue.getZkStats().getQueueLength(), message, clusterState.getZkClusterStateVersion());
-                }
+                // log.debug("processMessage: queueSize: {}, message = {} current state version: {}", stateUpdateQueue.getZkStats().getQueueLength(), message, clusterState.getZkClusterStateVersion());
 
                 processedNodes.add(head.first());
                 fallbackQueueSize = processedNodes.size();
@@ -322,118 +355,98 @@ public class Overseer implements SolrCloseable {
             fallbackQueueSize = processedNodes.size();
             // we should force write all pending updates because the next iteration might sleep until there
             // are more items in the main queue
-            clusterState = zkStateWriter.writePendingUpdates();
+           // clusterState = zkStateWriter.writePendingUpdates(clusterState);
             // clean work queue
             stateUpdateQueue.remove(processedNodes);
             processedNodes.clear();
-          } catch (KeeperException.SessionExpiredException e) {
-            log.warn("Solr cannot talk to ZK, exiting Overseer main queue loop", e);
-            return;
-          } catch (InterruptedException e) {
+          } catch (InterruptedException | AlreadyClosedException e) {
             Thread.currentThread().interrupt();
             return;
-          } catch (AlreadyClosedException e) {
-  
+          } catch (KeeperException.SessionExpiredException e) {
+            log.error("run()", e);
+
+            log.warn("Solr cannot talk to ZK, exiting Overseer work queue loop", e);
+            return;
           } catch (Exception e) {
-            log.error("Exception in Overseer main queue loop", e);
-            refreshClusterState = true; // it might have been a bad version error
+            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
           }
         }
       } finally {
-        if (log.isInfoEnabled()) {
-          log.info("Overseer Loop exiting : {}", LeaderElector.getNodeName(myId));
+        log.info("Overseer Loop exiting : {}", LeaderElector.getNodeName(myId));
+
+        if (!isClosed) {
+          Overseer.this.close();
         }
-        //do this in a separate thread because any wait is interrupted in this main thread
-        new Thread(this::checkIfIamStillLeader, "OverseerExitThread").start();
+      }
+
+      if (log.isDebugEnabled()) {
+        log.debug("run() - end");
       }
     }
 
    // Return true whenever the exception thrown by ZkStateWriter corresponds
    // to an invalid state or 'bad' message (in this case, we should remove that message from the queue)
     private boolean isBadMessage(Exception e) {
+      if (log.isDebugEnabled()) {
+        log.debug("isBadMessage(Exception e={}) - start", e);
+      }
+
       if (e instanceof KeeperException) {
         KeeperException ke = (KeeperException) e;
-        return ke.code() == KeeperException.Code.NONODE || ke.code() == KeeperException.Code.NODEEXISTS;
+        boolean isBadMessage = ke.code() == KeeperException.Code.NONODE || ke.code() == KeeperException.Code.NODEEXISTS;
+        if (log.isDebugEnabled()) {
+          log.debug("isBadMessage(Exception)={} - end", isBadMessage);
+        }
+        return isBadMessage;
       }
-      return !(e instanceof InterruptedException);
+      if (log.isDebugEnabled()) {
+        log.debug("isBadMessage(Exception)=false - end");
+      }
+      return false;
     }
 
-    private ClusterState processQueueItem(ZkNodeProps message, ClusterState clusterState, ZkStateWriter zkStateWriter, boolean enableBatching, ZkStateWriter.ZkWriteCallback callback) throws Exception {
+    private ClusterState processQueueItem(ZkNodeProps message, final ClusterState clusterState, ZkStateWriter zkStateWriter, boolean enableBatching, ZkStateWriter.ZkWriteCallback callback) throws Exception {
+      log.info("Consume state update from queue {}", message);
+      assert clusterState != null;
+      AtomicReference<ClusterState> state = new AtomicReference<>();
+
       final String operation = message.getStr(QUEUE_OPERATION);
       if (operation == null) {
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Message missing " + QUEUE_OPERATION + ":" + message);
       }
-      List<ZkWriteCommand> zkWriteCommands = null;
-      final Timer.Context timerContext = stats.time(operation);
-      try {
-        zkWriteCommands = processMessage(clusterState, message, operation);
-        stats.success(operation);
-      } catch (Exception e) {
-        // generally there is nothing we can do - in most cases, we have
-        // an issue that will fail again on retry or we cannot communicate with     a
-        // ZooKeeper in which case another Overseer should take over
-        // TODO: if ordering for the message is not important, we could
-        // track retries and put it back on the end of the queue
-        log.error("Overseer could not process the current clusterstate state update message, skipping the message: {}", message, e);
-        stats.error(operation);
-      } finally {
-        timerContext.stop();
-      }
-      if (zkWriteCommands != null) {
-        clusterState = zkStateWriter.enqueueUpdate(clusterState, zkWriteCommands, callback);
-        if (!enableBatching)  {
-          clusterState = zkStateWriter.writePendingUpdates();
-        }
-      }
-      return clusterState;
-    }
+      AtomicBoolean stop = new AtomicBoolean(false);
+      ParWork.getExecutor().invokeAll(Collections.singleton(new Callable<Object>() { // ### expert use
 
-    private void checkIfIamStillLeader() {
-      if (zkController != null && (zkController.getCoreContainer().isShutDown() || zkController.isClosed())) {
-        return;//shutting down no need to go further
-      }
-      org.apache.zookeeper.data.Stat stat = new org.apache.zookeeper.data.Stat();
-      final String path = OVERSEER_ELECT + "/leader";
-      byte[] data;
-      try {
-        data = zkClient.getData(path, null, stat, true);
-      } catch (AlreadyClosedException e) {
-        return;
-      } catch (Exception e) {
-        log.warn("Error communicating with ZooKeeper", e);
-        return;
-      }
-      try {
-        Map m = (Map) Utils.fromJSON(data);
-        String id = (String) m.get(ID);
-        if(overseerCollectionConfigSetProcessor.getId().equals(id)){
-          try {
-            log.warn("I (id={}) am exiting, but I'm still the leader",
-                overseerCollectionConfigSetProcessor.getId());
-            zkClient.delete(path,stat.getVersion(),true);
-          } catch (KeeperException.BadVersionException e) {
-            //no problem ignore it some other Overseer has already taken over
-          } catch (Exception e) {
-            log.error("Could not delete my leader node {}", path, e);
-          }
+          @Override
+          public Object call() throws Exception {
 
-        } else{
-          log.info("somebody else (id={}) has already taken up the overseer position", id);
-        }
-      } finally {
-        //if I am not shutting down, Then I need to rejoin election
-        try {
-          if (zkController != null && !zkController.getCoreContainer().isShutDown()) {
-            zkController.rejoinOverseerElection(null, false);
-          }
-        } catch (Exception e) {
-          log.warn("Unable to rejoinElection ",e);
-        }
-      }
+            List<ZkWriteCommand> zkWriteOps = processMessage(clusterState, message, operation);
+                ZkStateWriter zkStateWriter = new ZkStateWriter(zkController.getZkStateReader(), new Stats());
+                ClusterState cs = zkStateWriter.enqueueUpdate(clusterState, zkWriteOps,
+                        new ZkStateWriter.ZkWriteCallback() {
+
+                          @Override
+                          public void onWrite() throws Exception {
+                            // log.info("on write callback");
+                          }
+
+                        });
+                System.out.println("return cs:" + cs);
+                state.set(cs);
+                return null;
+
+
+          }}));
+
+      return (state.get() != null ? state.get() : clusterState);
     }
 
     private List<ZkWriteCommand> processMessage(ClusterState clusterState,
-        final ZkNodeProps message, final String operation) {
+                                                final ZkNodeProps message, final String operation) {
+      //if (log.isDebugEnabled()) {
+        log.info("processMessage(ClusterState clusterState={}, ZkNodeProps message={}, String operation={}) - start", clusterState, message, operation);
+     // }
+
       CollectionParams.CollectionAction collectionAction = CollectionParams.CollectionAction.get(operation);
       if (collectionAction != null) {
         switch (collectionAction) {
@@ -455,7 +468,11 @@ public class Overseer implements SolrCloseable {
             ExclusiveSliceProperty dProp = new ExclusiveSliceProperty(clusterState, message);
             if (dProp.balanceProperty()) {
               String collName = message.getStr(ZkStateReader.COLLECTION_PROP);
-              return Collections.singletonList(new ZkWriteCommand(collName, dProp.getDocCollection()));
+              List<ZkWriteCommand> returnList = Collections.singletonList(new ZkWriteCommand(collName, dProp.getDocCollection()));
+              if (log.isDebugEnabled()) {
+                log.debug("processMessage(ClusterState, ZkNodeProps, String) - end");
+              }
+              return returnList;
             }
             break;
           case MODIFYCOLLECTION:
@@ -465,7 +482,7 @@ public class Overseer implements SolrCloseable {
             return Collections.singletonList(new ClusterStateMutator(getSolrCloudManager()).migrateStateFormat(clusterState, message));
           default:
             throw new RuntimeException("unknown operation:" + operation
-                + " contents:" + message.getProperties());
+                    + " contents:" + message.getProperties());
         }
       } else {
         OverseerAction overseerAction = OverseerAction.get(operation);
@@ -487,9 +504,7 @@ public class Overseer implements SolrCloseable {
             return Collections.singletonList(new SliceMutator(getSolrCloudManager()).updateShardState(clusterState, message));
           case QUIT:
             if (myId.equals(message.get(ID))) {
-              if (log.isInfoEnabled()) {
-                log.info("Quit command received {} {}", message, LeaderElector.getNodeName(myId));
-              }
+              log.info("Quit command received {} {}", message, LeaderElector.getNodeName(myId));
               overseerCollectionConfigSetProcessor.close();
               close();
             } else {
@@ -503,61 +518,32 @@ public class Overseer implements SolrCloseable {
         }
       }
 
-      return Collections.singletonList(ZkStateWriter.NO_OP);
-    }
-
-    private LeaderStatus amILeader() {
-      Timer.Context timerContext = stats.time("am_i_leader");
-      boolean success = true;
-      String propsId = null;
-      try {
-        ZkNodeProps props = ZkNodeProps.load(zkClient.getData(
-            OVERSEER_ELECT + "/leader", null, null, true));
-        propsId = props.getStr(ID);
-        if (myId.equals(propsId)) {
-          return LeaderStatus.YES;
-        }
-      } catch (KeeperException e) {
-        success = false;
-        if (e.code() == KeeperException.Code.CONNECTIONLOSS) {
-          log.error("", e);
-          return LeaderStatus.DONT_KNOW;
-        } else if (e.code() != KeeperException.Code.SESSIONEXPIRED) {
-          log.warn("", e);
-        } else {
-          log.debug("", e);
-        }
-      } catch (InterruptedException e) {
-        success = false;
-        Thread.currentThread().interrupt();
-      } catch (AlreadyClosedException e) {
-        success = false;
-      } catch (Exception e) {
-        success = false;
-        log.warn("Unexpected exception", e);
-      } finally {
-        timerContext.stop();
-        if (success)  {
-          stats.success("am_i_leader");
-        } else  {
-          stats.error("am_i_leader");
-        }
+      List<ZkWriteCommand> returnList = Collections.singletonList(ZkStateWriter.NO_OP);
+      if (log.isDebugEnabled()) {
+        log.debug("processMessage(ClusterState, ZkNodeProps, String) - end");
       }
-      log.info("According to ZK I (id={}) am no longer a leader. propsId={}", myId, propsId);
-      return LeaderStatus.NO;
+      return returnList;
     }
 
     @Override
-      public void close() {
-        this.isClosed = true;
+    public void close() {
+      if (log.isDebugEnabled()) {
+        log.debug("close() - start");
+      }
+      //ExecutorUtil.shutdownAndAwaitTermination(executor);
+      this.isClosed = true;
+
+      if (log.isDebugEnabled()) {
+        log.debug("close() - end");
       }
+    }
 
   }
 
   public static class OverseerThread extends Thread implements Closeable {
 
     protected volatile boolean isClosed;
-    private Closeable thread;
+    private final Closeable thread;
 
     public OverseerThread(ThreadGroup tg, Closeable thread) {
       super(tg, (Runnable) thread);
@@ -622,18 +608,26 @@ public class Overseer implements SolrCloseable {
     this.zkController = zkController;
     this.stats = new Stats();
     this.config = config;
+
   }
 
-  public synchronized void start(String id) {
+  public synchronized void start(String id, ElectionContext context) {
     MDCLoggingContext.setNode(zkController == null ?
         null :
         zkController.getNodeName());
     this.id = id;
+    this.context = context;
     closed = false;
+
+    try {
+      if (context != null) context.close();
+    } catch (Exception e) {
+      log.error("", e);
+    }
     doClose();
     stats = new Stats();
     log.info("Overseer (id={}) starting", id);
-    createOverseerNode(reader.getZkClient());
+    //createOverseerNode(reader.getZkClient());
     //launch cluster state updater thread
     ThreadGroup tg = new ThreadGroup("Overseer state updater.");
     updaterThread = new OverseerThread(tg, new ClusterStateUpdater(reader, id, stats), "OverseerStateUpdate-" + id);
@@ -818,8 +812,13 @@ public class Overseer implements SolrCloseable {
       log.info("Overseer (id={}) closing", id);
     }
     this.closed = true;
-    doClose();
 
+    try {
+     if (context != null) context.close();
+    } catch (Exception e) {
+      log.error("", e);
+    }
+    doClose();
     assert ObjectReleaseTracker.release(this);
   }
 
@@ -828,38 +827,33 @@ public class Overseer implements SolrCloseable {
     return closed;
   }
 
-  private void doClose() {
-    
-    if (updaterThread != null) {
-      IOUtils.closeQuietly(updaterThread);
-      updaterThread.interrupt();
+  void doClose() {
+    if (log.isDebugEnabled()) {
+      log.debug("doClose() - start");
     }
-    if (ccThread != null) {
-      IOUtils.closeQuietly(ccThread);
-      ccThread.interrupt();
-    }
-    if (triggerThread != null)  {
-      IOUtils.closeQuietly(triggerThread);
-      triggerThread.interrupt();
-    }
-    if (updaterThread != null) {
-      try {
-        updaterThread.join();
-      } catch (InterruptedException e) {}
-    }
-    if (ccThread != null) {
-      try {
-        ccThread.join();
-      } catch (InterruptedException e) {}
+    try (ParWork closer = new ParWork(this, true)) {
+
+      closer.collect(() -> {
+        IOUtils.closeQuietly(ccThread);
+        ccThread.interrupt();
+      });
+
+      closer.collect(() -> {
+        IOUtils.closeQuietly(updaterThread);
+        updaterThread.interrupt();
+      });
+
+      closer.collect(() -> {
+        IOUtils.closeQuietly(triggerThread);
+        triggerThread.interrupt();
+      });
+
+      closer.addCollect("OverseerInternals");
     }
-    if (triggerThread != null)  {
-      try {
-        triggerThread.join();
-      } catch (InterruptedException e)  {}
+
+    if (log.isDebugEnabled()) {
+      log.debug("doClose() - end");
     }
-    updaterThread = null;
-    ccThread = null;
-    triggerThread = null;
   }
 
   /**
@@ -1049,9 +1043,6 @@ public class Overseer implements SolrCloseable {
   }
 
   public void offerStateUpdate(byte[] data) throws KeeperException, InterruptedException {
-    if (zkController.getZkClient().isClosed()) {
-      throw new AlreadyClosedException();
-    }
     getStateUpdateQueue().offer(data);
   }
 
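
The ClusterStateUpdater above replaces the old amILeader() polling with a
one-shot watch on the leader znode, closing the Overseer when that node
changes. The core of the pattern, sketched against the raw ZooKeeper
client (the leader path is whatever the caller supplies):

    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.Watcher;
    import org.apache.zookeeper.ZooKeeper;

    // Sketch: watch the leader znode and react once when it changes, instead
    // of re-reading it on every loop iteration.
    class LeaderWatchSketch {
      static void watchLeader(ZooKeeper zk, String leaderPath, Runnable onLeaderChange)
          throws KeeperException, InterruptedException {
        zk.exists(leaderPath, event -> {
          if (Watcher.Event.EventType.None == event.getType()) {
            return; // connection-state notification, not a znode change
          }
          onLeaderChange.run(); // leader changed: step down and close
        });
      }
    }
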
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
index e25befa..90e4d7e 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerElectionContext.java
@@ -17,94 +17,99 @@
 
 package org.apache.solr.cloud;
 
+import java.io.IOException;
 import java.lang.invoke.MethodHandles;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.SolrException.ErrorCode;
+
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.cloud.SolrZkClient;
-import org.apache.solr.common.cloud.ZkCmdExecutor;
 import org.apache.solr.common.cloud.ZkNodeProps;
-import org.apache.solr.common.util.Utils;
-import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import static org.apache.solr.common.params.CommonParams.ID;
 
-final class OverseerElectionContext extends ElectionContext {
+final class OverseerElectionContext extends ShardLeaderElectionContextBase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   private final SolrZkClient zkClient;
   private final Overseer overseer;
   private volatile boolean isClosed = false;
 
-  public OverseerElectionContext(SolrZkClient zkClient, Overseer overseer, final String zkNodeName) {
-    super(zkNodeName, Overseer.OVERSEER_ELECT, Overseer.OVERSEER_ELECT + "/leader", null, zkClient);
+  public OverseerElectionContext(final String zkNodeName, SolrZkClient zkClient, Overseer overseer) {
+    super(zkNodeName, Overseer.OVERSEER_ELECT, Overseer.OVERSEER_ELECT + "/leader", new ZkNodeProps(ID, zkNodeName), zkClient);
     this.overseer = overseer;
     this.zkClient = zkClient;
-    try {
-      new ZkCmdExecutor(zkClient.getZkClientTimeout()).ensureExists(Overseer.OVERSEER_ELECT, zkClient);
-    } catch (KeeperException e) {
-      throw new SolrException(ErrorCode.SERVER_ERROR, e);
-    } catch (InterruptedException e) {
-      Thread.currentThread().interrupt();
-      throw new SolrException(ErrorCode.SERVER_ERROR, e);
-    }
   }
 
   @Override
-  void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStartMs) throws KeeperException,
-      InterruptedException {
+  void runLeaderProcess(ElectionContext context, boolean weAreReplacement, int pauseBeforeStartMs) throws KeeperException,
+          InterruptedException, IOException {
     if (isClosed) {
       return;
     }
-    log.info("I am going to be the leader {}", id);
-    final String id = leaderSeqPath
-        .substring(leaderSeqPath.lastIndexOf("/") + 1);
-    ZkNodeProps myProps = new ZkNodeProps(ID, id);
-
-    zkClient.makePath(leaderPath, Utils.toJSON(myProps),
-        CreateMode.EPHEMERAL, true);
-    if (pauseBeforeStartMs > 0) {
-      try {
-        Thread.sleep(pauseBeforeStartMs);
-      } catch (InterruptedException e) {
-        Thread.interrupted();
-        log.warn("Wait interrupted ", e);
-      }
-    }
+
+    super.runLeaderProcess(context, weAreReplacement, pauseBeforeStartMs);
+
     synchronized (this) {
       if (!this.isClosed && !overseer.getZkController().getCoreContainer().isShutDown()) {
-        overseer.start(id);
+        overseer.start(id, context);
       }
     }
   }
 
+  public Overseer getOverseer() {
+    return  overseer;
+  }
+
   @Override
   public void cancelElection() throws InterruptedException, KeeperException {
-    super.cancelElection();
-    overseer.close();
+
+    try {
+      super.cancelElection();
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      log.error("Exception closing Overseer", e);
+    }
+    try {
+      overseer.doClose();
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      log.error("Exception closing Overseer", e);
+    }
   }
 
   @Override
-  public synchronized void close() {
-    this.isClosed = true;
-    overseer.close();
+  public void close() {
+    try {
+      super.close();
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      log.error("Exception canceling election", e);
+    }
+
+    try {
+      overseer.doClose();
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      log.error("Exception closing Overseer", e);
+    }
+    this.isClosed  = true;
+
   }
 
   @Override
   public ElectionContext copy() {
-    return new OverseerElectionContext(zkClient, overseer, id);
+    return new OverseerElectionContext(id, zkClient, overseer);
   }
 
   @Override
   public void joinedElectionFired() {
-    overseer.close();
+
   }
 
   @Override
   public void checkIfIamLeaderFired() {
-    // leader changed - close the overseer
-    overseer.close();
-  }
 
+  }
 }
+
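
Both cancelElection() and close() above isolate each shutdown step in its
own try/catch, so a failure in one step cannot skip the remaining cleanup.
The same idiom in isolation (all names illustrative):

    import java.io.Closeable;

    // Sketch: close each resource independently; log and continue rather
    // than letting the first failure abort the rest of the shutdown.
    final class CloseAllSketch {
      static void closeAll(Closeable... resources) {
        for (Closeable r : resources) {
          if (r == null) continue;
          try {
            r.close();
          } catch (Exception e) {
            System.err.println("Error closing " + r + ": " + e); // keep going
          }
        }
      }
    }
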
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/OverseerMessageHandler.java
index 1a40a0a..32c1968 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerMessageHandler.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerMessageHandler.java
@@ -29,7 +29,7 @@ public interface OverseerMessageHandler {
    *
    * @return response
    */
-  OverseerSolrResponse processMessage(ZkNodeProps message, String operation);
+  OverseerSolrResponse processMessage(ZkNodeProps message, String operation) throws InterruptedException;
 
   /**
    * @return the name of the OverseerMessageHandler
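
processMessage() now declares InterruptedException so handlers can let
interruption propagate instead of swallowing it; the
ParWork.propegateInterrupt() calls elsewhere in this commit serve the same
purpose. The usual restore-and-rethrow idiom, for reference (helper name
illustrative):

    // Sketch: restore the thread's interrupt flag before wrapping, so code
    // higher up the stack can still observe that the thread was interrupted.
    final class InterruptSketch {
      static RuntimeException propagateInterrupt(Throwable t) {
        if (t instanceof InterruptedException) {
          Thread.currentThread().interrupt();
        }
        return new RuntimeException(t);
      }
    }
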
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
index 786a718..8c13d30 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
@@ -26,18 +26,22 @@ import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Predicate;
 
 import com.codahale.metrics.Timer;
 import com.google.common.collect.ImmutableSet;
-import org.apache.commons.io.IOUtils;
-import org.apache.solr.cloud.Overseer.LeaderStatus;
 import org.apache.solr.cloud.OverseerTaskQueue.QueueEvent;
 import org.apache.solr.common.AlreadyClosedException;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
+import org.apache.solr.common.WorkException;
 import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
@@ -72,8 +76,6 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
   public static final int MAX_PARALLEL_TASKS = 100;
   public static final int MAX_BLOCKED_TASKS = 1000;
 
-  public ExecutorService tpe;
-
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   private OverseerTaskQueue workQueue;
@@ -82,28 +84,26 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
   private DistributedMap failureMap;
 
   // Set that maintains a list of all the tasks that are running. This is keyed on zk id of the task.
-  final private Set<String> runningTasks;
+  private final Set<String> runningTasks = ConcurrentHashMap.newKeySet(500);
 
   // List of completed tasks. This is used to clean up workQueue in zk.
-  final private HashMap<String, QueueEvent> completedTasks;
-
-  private volatile String myId;
+  private final Map<String, QueueEvent> completedTasks = new ConcurrentHashMap<>(132, 0.75f, 50);
 
-  private volatile ZkStateReader zkStateReader;
+  private final String myId;
 
-  private boolean isClosed;
+  private volatile boolean isClosed;
 
-  private volatile Stats stats;
+  private final Stats stats;
 
   // Set of tasks that have been picked up for processing but not cleaned up from zk work-queue.
   // It may contain tasks that have completed execution, have been entered into the completed/failed map in zk but not
   // deleted from the work-queue as that is a batched operation.
-  final private Set<String> runningZKTasks;
+  final private Set<String> runningZKTasks = ConcurrentHashMap.newKeySet(500);
   // This map may contain tasks which are read from work queue but could not
   // be executed because they are blocked or the execution queue is full
   // This is an optimization to ensure that we do not read the same tasks
   // again and again from ZK.
-  final private Map<String, QueueEvent> blockedTasks = Collections.synchronizedMap(new LinkedHashMap<>());
+  final private Map<String, QueueEvent> blockedTasks = new ConcurrentSkipListMap<>();
   final private Predicate<String> excludedTasks = new Predicate<String>() {
     @Override
     public boolean test(String s) {
@@ -117,13 +117,11 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
 
   };
 
-  private final Object waitLock = new Object();
-
-  protected OverseerMessageHandlerSelector selector;
+  protected final OverseerMessageHandlerSelector selector;
 
-  private OverseerNodePrioritizer prioritizer;
+  private final OverseerNodePrioritizer prioritizer;
 
-  private String thisNode;
+  private final String thisNode;
 
   public OverseerTaskProcessor(ZkStateReader zkStateReader, String myId,
                                         Stats stats,
@@ -133,7 +131,6 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
                                         DistributedMap runningMap,
                                         DistributedMap completedMap,
                                         DistributedMap failureMap) {
-    this.zkStateReader = zkStateReader;
     this.myId = myId;
     this.stats = stats;
     this.selector = selector;
@@ -142,9 +139,6 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
     this.runningMap = runningMap;
     this.completedMap = completedMap;
     this.failureMap = failureMap;
-    this.runningZKTasks = new HashSet<>();
-    this.runningTasks = new HashSet<>();
-    this.completedTasks = new HashMap<>();
     thisNode = Utils.getMDCNode();
   }
 
@@ -152,11 +146,6 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
   public void run() {
     MDCLoggingContext.setNode(thisNode);
     log.debug("Process current queue of overseer operations");
-    LeaderStatus isLeader = amILeader();
-    while (isLeader == LeaderStatus.DONT_KNOW) {
-      log.debug("am_i_leader unclear {}", isLeader);
-      isLeader = amILeader();  // not a no, not a yes, try ask again
-    }
 
     String oldestItemInWorkQueue = null;
     // hasLeftOverItems - used for avoiding re-execution of async tasks that were processed by a previous Overseer.
@@ -171,10 +160,15 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
       // We don't need to handle this. This is just a fail-safe which comes in handy in skipping already processed
       // async calls.
       SolrException.log(log, "", e);
-    } catch (AlreadyClosedException e) {
-      return;
-    } catch (InterruptedException e) {
-      Thread.currentThread().interrupt();
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      if (e instanceof KeeperException.SessionExpiredException) {
+        return;
+      }
+      if (e instanceof InterruptedException || e instanceof AlreadyClosedException) {
+        return;
+      }
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
     }
 
     if (oldestItemInWorkQueue == null)
@@ -184,50 +178,26 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
 
     try {
       prioritizer.prioritizeOverseerNodes(myId);
-    } catch (AlreadyClosedException e) {
-        return;
     } catch (Exception e) {
-      if (!zkStateReader.getZkClient().isClosed()) {
-        log.error("Unable to prioritize overseer ", e);
+      ParWork.propegateInterrupt(e);
+      if (e instanceof KeeperException.SessionExpiredException) {
+        return;
+      }
+      if (e instanceof InterruptedException || e instanceof AlreadyClosedException) {
+        return;
       }
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
     }
 
-    // TODO: Make maxThreads configurable.
-
-    this.tpe = new ExecutorUtil.MDCAwareThreadPoolExecutor(5, MAX_PARALLEL_TASKS, 0L, TimeUnit.MILLISECONDS,
-        new SynchronousQueue<Runnable>(),
-        new SolrNamedThreadFactory("OverseerThreadFactory"));
     try {
       while (!this.isClosed) {
         try {
-          isLeader = amILeader();
-          if (LeaderStatus.NO == isLeader) {
-            break;
-          } else if (LeaderStatus.YES != isLeader) {
-            log.debug("am_i_leader unclear {}", isLeader);
-            continue; // not a no, not a yes, try asking again
-          }
 
-          if (log.isDebugEnabled()) {
-            log.debug("Cleaning up work-queue. #Running tasks: {} #Completed tasks: {}", runningTasksSize(), completedTasks.size());
-          }
+          if (log.isDebugEnabled()) log.debug("Cleaning up work-queue. #Running tasks: {} #Completed tasks: {}",  runningTasksSize(), completedTasks.size());
           cleanUpWorkQueue();
 
           printTrackingMaps();
 
-          boolean waited = false;
-
-          while (runningTasksSize() > MAX_PARALLEL_TASKS) {
-            synchronized (waitLock) {
-              waitLock.wait(100);//wait for 100 ms or till a task is complete
-            }
-            waited = true;
-          }
-
-          if (waited)
-            cleanUpWorkQueue();
-
-
           ArrayList<QueueEvent> heads = new ArrayList<>(blockedTasks.size() + MAX_PARALLEL_TASKS);
           heads.addAll(blockedTasks.values());
 
@@ -238,157 +208,170 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
             //instead of reading MAX_PARALLEL_TASKS items always, we should only fetch as much as we can execute
             int toFetch = Math.min(MAX_BLOCKED_TASKS - heads.size(), MAX_PARALLEL_TASKS - runningTasksSize());
             List<QueueEvent> newTasks = workQueue.peekTopN(toFetch, excludedTasks, 2000L);
-            if (log.isDebugEnabled()) {
-              log.debug("Got {} tasks from work-queue : [{}]", newTasks.size(), newTasks);
-            }
+            log.debug("Got {} tasks from work-queue : [{}]", newTasks.size(), newTasks);
             heads.addAll(newTasks);
-          } else {
-            // Prevent free-spinning this loop.
-            Thread.sleep(1000);
           }
 
-          if (isClosed) break;
-
-          if (heads.isEmpty()) {
-            continue;
-          }
+//          if (heads.isEmpty()) {
+//            log.debug()
+//            continue;
+//          }
 
           blockedTasks.clear(); // clear it now; may get refilled below.
 
           taskBatch.batchId++;
           boolean tooManyTasks = false;
-          for (QueueEvent head : heads) {
-            if (!tooManyTasks) {
-              synchronized (runningTasks) {
+          try (ParWork worker = new ParWork(this)) {
+
+            for (QueueEvent head : heads) {
+              if (!tooManyTasks) {
                 tooManyTasks = runningTasksSize() >= MAX_PARALLEL_TASKS;
               }
-            }
-            if (tooManyTasks) {
-              // Too many tasks are running, just shove the rest into the "blocked" queue.
-              if(blockedTasks.size() < MAX_BLOCKED_TASKS)
-                blockedTasks.put(head.getId(), head);
-              continue;
-            }
-            synchronized (runningZKTasks) {
-              if (runningZKTasks.contains(head.getId())) continue;
-            }
-            final ZkNodeProps message = ZkNodeProps.load(head.getBytes());
-            final String asyncId = message.getStr(ASYNC);
-            if (hasLeftOverItems) {
-              if (head.getId().equals(oldestItemInWorkQueue))
-                hasLeftOverItems = false;
-              if (asyncId != null && (completedMap.contains(asyncId) || failureMap.contains(asyncId))) {
-                log.debug("Found already processed task in workQueue, cleaning up. AsyncId [{}]",asyncId );
+
+              if (runningZKTasks.contains(head.getId())) {
+                log.warn("Task found in running ZKTasks already, contining");
+                continue;
+              }
+
+              final ZkNodeProps message = ZkNodeProps.load(head.getBytes());
+              final String asyncId = message.getStr(ASYNC);
+              if (hasLeftOverItems) {
+                if (head.getId().equals(oldestItemInWorkQueue))
+                  hasLeftOverItems = false;
+                if (asyncId != null && (completedMap.contains(asyncId) || failureMap.contains(asyncId))) {
+                  log.debug("Found already processed task in workQueue, cleaning up. AsyncId [{}]", asyncId);
+                  workQueue.remove(head);
+                  continue;
+                }
+              }
+              String operation = message.getStr(Overseer.QUEUE_OPERATION);
+              if (operation == null) {
+                log.error("Msg does not have required " + Overseer.QUEUE_OPERATION + ": {}", message);
                 workQueue.remove(head);
                 continue;
               }
-            }
-            String operation = message.getStr(Overseer.QUEUE_OPERATION);
-            if (operation == null) {
-              log.error("Msg does not have required {} : {}", Overseer.QUEUE_OPERATION, message);
-              workQueue.remove(head);
-              continue;
-            }
-            OverseerMessageHandler messageHandler = selector.selectOverseerMessageHandler(message);
-            OverseerMessageHandler.Lock lock = messageHandler.lockTask(message, taskBatch);
-            if (lock == null) {
-              if (log.isDebugEnabled()) {
-                log.debug("Exclusivity check failed for [{}]", message);
+              OverseerMessageHandler messageHandler = selector.selectOverseerMessageHandler(message);
+              OverseerMessageHandler.Lock lock = messageHandler.lockTask(message, taskBatch);
+              if (lock == null) {
+                log.debug("Exclusivity check failed for [{}]", message.toString());
+                // we may end crossing the size of the MAX_BLOCKED_TASKS. They are fine
+                if (blockedTasks.size() < MAX_BLOCKED_TASKS)
+                  blockedTasks.put(head.getId(), head);
+                continue;
               }
-              //we may end crossing the size of the MAX_BLOCKED_TASKS. They are fine
-              if (blockedTasks.size() < MAX_BLOCKED_TASKS)
-                blockedTasks.put(head.getId(), head);
-              continue;
-            }
-            try {
-              markTaskAsRunning(head, asyncId);
-              if (log.isDebugEnabled()) {
+              try {
+                markTaskAsRunning(head, asyncId);
                 log.debug("Marked task [{}] as running", head.getId());
+              } catch (Exception e) {
+                if (e instanceof KeeperException.SessionExpiredException || e instanceof  InterruptedException) {
+                  ParWork.propegateInterrupt(e);
+                  log.error("ZooKeeper session has expired");
+                  return;
+                }
+
+                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
               }
-            } catch (KeeperException.NodeExistsException e) {
-              lock.unlock();
-              // This should never happen
-              log.error("Tried to pick up task [{}] when it was already running!", head.getId());
-              continue;
-            } catch (InterruptedException e) {
-              lock.unlock();
-              log.error("Thread interrupted while trying to pick task {} for execution.", head.getId());
-              Thread.currentThread().interrupt();
-              continue;
-            }
-            if (log.isDebugEnabled()) {
-              log.debug("{}: Get the message id: {} message: {}", messageHandler.getName(), head.getId(), message);
+              if (log.isDebugEnabled()) log.debug(
+                  messageHandler.getName() + ": Get the message id:" + head.getId() + " message:" + message.toString());
+              Runner runner = new Runner(messageHandler, message,
+                  operation, head, lock);
+              worker.add(runner);
             }
-            Runner runner = new Runner(messageHandler, message,
-                operation, head, lock);
-            tpe.execute(runner);
-          }
 
-        } catch (KeeperException e) {
-          if (e.code() == KeeperException.Code.SESSIONEXPIRED) {
-            log.warn("Overseer cannot talk to ZK");
-            return;
           }
-          SolrException.log(log, "", e);
-          
-          // Prevent free-spinning this loop.
-          try {
-            Thread.sleep(1000);
-          } catch (InterruptedException e1) {
-            Thread.currentThread().interrupt();
+
+        } catch (InterruptedException | AlreadyClosedException e) {
+          ParWork.propegateInterrupt(e);
+          return;
+        } catch (Exception e) {
+          SolrException.log(log, e);
+
+          if (e instanceof KeeperException.SessionExpiredException) {
             return;
           }
-          
-        } catch (InterruptedException e) {
-          Thread.currentThread().interrupt();
-          return;
-        } catch (AlreadyClosedException e) {
 
-        } catch (Exception e) {
-          SolrException.log(log, "", e);
         }
       }
     } finally {
       this.close();
     }
+
+    if (log.isDebugEnabled()) {
+      log.debug("run() - end");
+    }
   }
 
   private int runningTasksSize() {
-    synchronized (runningTasks) {
-      return runningTasks.size();
+    if (log.isDebugEnabled()) {
+      log.debug("runningTasksSize() - start");
+    }
+
+    int returnint = runningTasks.size();
+    if (log.isDebugEnabled()) {
+      log.debug("runningTasksSize() - end");
     }
+    return returnint;
+
   }
 
   private void cleanUpWorkQueue() throws KeeperException, InterruptedException {
-    synchronized (completedTasks) {
-      for (Map.Entry<String, QueueEvent> entry : completedTasks.entrySet()) {
-        workQueue.remove(entry.getValue());
-        synchronized (runningZKTasks) {
-          runningZKTasks.remove(entry.getKey());
-        }
+    if (log.isDebugEnabled()) {
+      log.debug("cleanUpWorkQueue() - start");
+    }
+
+    Set<Map.Entry<String, QueueEvent>> entrySet = completedTasks.entrySet();
+    AtomicBoolean sessionExpired = new AtomicBoolean();
+    AtomicBoolean interrupted = new AtomicBoolean();
+    try (ParWork work = new ParWork(this)) {
+      for (Map.Entry<String, QueueEvent> entry : entrySet) {
+        work.collect(()->{
+          if (interrupted.get() || sessionExpired.get()) {
+            return;
+          }
+          try {
+            workQueue.remove(entry.getValue());
+          } catch (KeeperException.SessionExpiredException e) {
+            sessionExpired.set(true);
+          } catch (InterruptedException e) {
+            interrupted.set(true);
+          } catch (KeeperException e) {
+           log.error("Exception removing item from workQueue", e);
+          }
+          runningTasks.remove(entry.getKey());
+        });
+        completedTasks.remove(entry.getKey());
       }
-      completedTasks.clear();
+    }
+
+
+    if (interrupted.get()) {
+      Thread.currentThread().interrupt();
+      throw new InterruptedException();
+    }
+
+    if (sessionExpired.get()) {
+      throw new KeeperException.SessionExpiredException();
+    }
+
+    if (log.isDebugEnabled()) {
+      log.debug("cleanUpWorkQueue() - end");
     }
   }
 
   public void close() {
+    if (log.isDebugEnabled()) {
+      log.debug("close() - start");
+    }
+
     isClosed = true;
-    if (tpe != null) {
-      if (!tpe.isShutdown()) {
-        ExecutorUtil.shutdownAndAwaitTermination(tpe);
-      }
+
+    try (ParWork closer = new ParWork(this)) {
+      closer.add("OTP", selector);
     }
-    IOUtils.closeQuietly(selector);
   }
 
   public static List<String> getSortedOverseerNodeNames(SolrZkClient zk) throws KeeperException, InterruptedException {
-    List<String> children = null;
-    try {
-      children = zk.getChildren(Overseer.OVERSEER_ELECT + LeaderElector.ELECTION_NODE, null, true);
-    } catch (Exception e) {
-      log.warn("error ", e);
-      return new ArrayList<>();
-    }
+    List<String> children = zk.getChildren(Overseer.OVERSEER_ELECT + LeaderElector.ELECTION_NODE, null, true);
+
     LeaderElector.sortSeqs(children);
     ArrayList<String> nodeNames = new ArrayList<>(children.size());
     for (String c : children) nodeNames.add(LeaderElector.getNodeName(c));
@@ -396,15 +379,9 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
   }
 
   public static List<String> getSortedElectionNodes(SolrZkClient zk, String path) throws KeeperException, InterruptedException {
-    List<String> children = null;
-    try {
-      children = zk.getChildren(path, null, true);
+    List<String> children = zk.getChildren(path, null, true);
       LeaderElector.sortSeqs(children);
       return children;
-    } catch (Exception e) {
-      throw e;
-    }
-
   }
 
   public static String getLeaderNode(SolrZkClient zkClient) throws KeeperException, InterruptedException {
@@ -425,43 +402,6 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
     return  (String) m.get(ID);
   }
 
-  protected LeaderStatus amILeader() {
-    String statsName = "collection_am_i_leader";
-    Timer.Context timerContext = stats.time(statsName);
-    boolean success = true;
-    String propsId = null;
-    try {
-      ZkNodeProps props = ZkNodeProps.load(zkStateReader.getZkClient().getData(
-          Overseer.OVERSEER_ELECT + "/leader", null, null, true));
-      propsId = props.getStr(ID);
-      if (myId.equals(propsId)) {
-        return LeaderStatus.YES;
-      }
-    } catch (KeeperException e) {
-      success = false;
-      if (e.code() == KeeperException.Code.CONNECTIONLOSS) {
-        log.error("", e);
-        return LeaderStatus.DONT_KNOW;
-      } else if (e.code() != KeeperException.Code.SESSIONEXPIRED) {
-        log.warn("", e);
-      } else {
-        log.debug("", e);
-      }
-    } catch (InterruptedException e) {
-      success = false;
-      Thread.currentThread().interrupt();
-    } finally {
-      timerContext.stop();
-      if (success)  {
-        stats.success(statsName);
-      } else  {
-        stats.error(statsName);
-      }
-    }
-    log.info("According to ZK I (id={}) am no longer a leader. propsId={}", myId, propsId);
-    return LeaderStatus.NO;
-  }
-
   public boolean isClosed() {
     return isClosed;
   }
@@ -469,34 +409,26 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
   @SuppressWarnings("unchecked")
   private void markTaskAsRunning(QueueEvent head, String asyncId)
       throws KeeperException, InterruptedException {
-    synchronized (runningZKTasks) {
-      runningZKTasks.add(head.getId());
-    }
-
-    synchronized (runningTasks) {
-      runningTasks.add(head.getId());
-    }
+    runningZKTasks.add(head.getId());
 
+    runningTasks.add(head.getId());
 
     if (asyncId != null)
       runningMap.put(asyncId, null);
   }
   
   protected class Runner implements Runnable {
-    ZkNodeProps message;
-    String operation;
-    OverseerSolrResponse response;
-    QueueEvent head;
-    OverseerMessageHandler messageHandler;
-    private final OverseerMessageHandler.Lock lock;
+    final ZkNodeProps message;
+    final String operation;
+    volatile OverseerSolrResponse response;
+    final QueueEvent head;
+    final OverseerMessageHandler messageHandler;
 
     public Runner(OverseerMessageHandler messageHandler, ZkNodeProps message, String operation, QueueEvent head, OverseerMessageHandler.Lock lock) {
       this.message = message;
       this.operation = operation;
       this.head = head;
       this.messageHandler = messageHandler;
-      this.lock = lock;
-      response = null;
     }
 
 
@@ -528,49 +460,33 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
             }
           } else {
             completedMap.put(asyncId, OverseerSolrResponseSerializer.serialize(response));
-            if (log.isDebugEnabled()) {
-              log.debug("Updated completed map for task with zkid:[{}]", head.getId());
-            }
+            log.debug("Updated completed map for task with zkid:[{}]", head.getId());
           }
         } else {
           head.setBytes(OverseerSolrResponseSerializer.serialize(response));
-          if (log.isDebugEnabled()) {
-            log.debug("Completed task:[{}]", head.getId());
-          }
+          log.debug("Completed task:[{}]", head.getId());
         }
 
         markTaskComplete(head.getId(), asyncId);
-        if (log.isDebugEnabled()) {
-          log.debug("Marked task [{}] as completed.", head.getId());
-        }
+        log.debug("Marked task [{}] as completed.", head.getId());
         printTrackingMaps();
 
-        if (log.isDebugEnabled()) {
-          log.debug("{}: Message id: {} complete, response: {}", messageHandler.getName(), head.getId(), response.getResponse());
-        }
+        log.debug(messageHandler.getName() + ": Message id:" + head.getId() +
+            " complete, response:" + response.getResponse().toString());
         success = true;
-      } catch (AlreadyClosedException e) {
-
-      } catch (KeeperException e) {
-        SolrException.log(log, "", e);
-      } catch (InterruptedException e) {
-        // Reset task from tracking data structures so that it can be retried.
-        resetTaskWithException(messageHandler, head.getId(), asyncId, taskKey, message);
-        log.warn("Resetting task {} as the thread was interrupted.", head.getId());
-        Thread.currentThread().interrupt();
-      } finally {
-        lock.unlock();
-        if (!success) {
-          // Reset task from tracking data structures so that it can be retried.
-          try {
-            resetTaskWithException(messageHandler, head.getId(), asyncId, taskKey, message);
-          } catch(AlreadyClosedException e) {
-            
-          }
-        }
-        synchronized (waitLock){
-          waitLock.notifyAll();
+      } catch (InterruptedException | AlreadyClosedException e) {
+        ParWork.propegateInterrupt(e);
+        return;
+      } catch (Exception e) {
+        if (e instanceof KeeperException.SessionExpiredException) {
+          log.warn("Session expired, exiting...", e);
+          return;
         }
+        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+      }
+
+      if (log.isDebugEnabled()) {
+        log.debug("run() - end");
       }
     }
 
@@ -593,9 +509,8 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
       workQueue.remove(head);
     }
 
-    private void resetTaskWithException(OverseerMessageHandler messageHandler, String id, String asyncId, String taskKey, ZkNodeProps message) {
+    private void resetTaskWithException(OverseerMessageHandler messageHandler, String id, String asyncId, String taskKey, ZkNodeProps message) throws KeeperException, InterruptedException {
       log.warn("Resetting task: {}, requestid: {}, taskKey: {}", id, asyncId, taskKey);
-      try {
         if (asyncId != null) {
           if (!runningMap.remove(asyncId)) {
             log.warn("Could not find and remove async call [{}] from the running map.", asyncId);
@@ -606,12 +521,6 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
           runningTasks.remove(id);
         }
 
-      } catch (KeeperException e) {
-        SolrException.log(log, "", e);
-      } catch (InterruptedException e) {
-        Thread.currentThread().interrupt();
-      }
-
     }
 
     private void updateStats(String statsName) {
@@ -632,20 +541,17 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
 
   private void printTrackingMaps() {
     if (log.isDebugEnabled()) {
-      synchronized (runningTasks) {
-        log.debug("RunningTasks: {}", runningTasks);
-      }
+      log.debug("RunningTasks: {}", runningTasks);
+
       if (log.isDebugEnabled()) {
         log.debug("BlockedTasks: {}", blockedTasks.keySet());
       }
-      synchronized (completedTasks) {
-        if (log.isDebugEnabled()) {
-          log.debug("CompletedTasks: {}", completedTasks.keySet());
-        }
-      }
-      synchronized (runningZKTasks) {
-        log.info("RunningZKTasks: {}", runningZKTasks);
+      if (log.isDebugEnabled()) {
+        log.debug("CompletedTasks: {}", completedTasks.keySet());
       }
+
+      log.info("RunningZKTasks: {}", runningZKTasks);
+
     }
   }
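
Note on the dropped synchronization above: the synchronized blocks around
runningTasks, completedTasks and runningZKTasks disappear throughout this
file. A hedged reading is that this branch swaps those fields to concurrent
collections elsewhere, making external locking redundant; the declarations
below are an assumption to illustrate that, not something shown in this diff:

    // assumed concurrent replacements (java.util.concurrent.ConcurrentHashMap);
    // the real field declarations are not part of this hunk
    private final Set<String> runningTasks = ConcurrentHashMap.newKeySet();
    private final Set<String> runningZKTasks = ConcurrentHashMap.newKeySet();
    private final Map<String, QueueEvent> completedTasks = new ConcurrentHashMap<>();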
 
@@ -676,9 +582,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
     }
 
     public int getRunningTasks() {
-      synchronized (runningTasks) {
-        return runningTasks.size();
-      }
+      return runningTasks.size();
     }
   }
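
The hunks above replace the dedicated MDCAwareThreadPoolExecutor (and the
waitLock throttling loop) with the try-with-resources ParWork helper, which
is also what close() now uses via its add(label, resource) overload. A
minimal sketch of the pattern, with the API shape inferred from the calls in
this diff (constructor takes an owner object, add()/collect() submit work,
close() at the end of the try block waits for everything submitted):

    // 'tasks' is illustrative, not a field from this class
    try (ParWork worker = new ParWork(this)) {
      for (Runnable task : tasks) {
        worker.add(task);   // submitted immediately, runs in parallel
      }
    } // close() returns only after all submitted work has completed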
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java
index 1572f00..9e5a74c 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskQueue.java
@@ -117,8 +117,8 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
       try {
         zookeeper.setData(responsePath, event.getBytes(), true);
       } catch (KeeperException.NoNodeException ignored) {
-        // we must handle the race case where the node no longer exists
-        log.info("Response ZK path: {} doesn't exist. Requestor may have disconnected from ZooKeeper", responsePath);
+        // this will often not exist or have been removed
+        if (log.isDebugEnabled()) log.debug("Response ZK path: {} doesn't exist.", responsePath);
       }
       try {
         zookeeper.delete(path, -1, true);
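
The log-level change above treats a missing response node as a routine race
rather than a noteworthy event: the requester that created the node may have
disconnected, taking the response path with it, before the reply is written.
A sketch of the tolerated race, following the same calls as this hunk:

    try {
      zookeeper.setData(responsePath, event.getBytes(), true); // write the reply
    } catch (KeeperException.NoNodeException ignored) {
      // frequently gone by now; not an error worth INFO-level noise
      if (log.isDebugEnabled()) log.debug("Response ZK path {} already removed", responsePath);
    }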
diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
index 35296a6..cc7addf 100644
--- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java
@@ -25,7 +25,9 @@ import java.util.List;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
 
+import net.sf.saxon.trans.Err;
 import org.apache.http.client.methods.HttpUriRequest;
 import org.apache.lucene.index.IndexCommit;
 import org.apache.lucene.search.MatchAllDocsQuery;
@@ -37,11 +39,14 @@ import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
 import org.apache.solr.client.solrj.request.CoreAdminRequest.WaitForState;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.SolrPingResponse;
+import org.apache.solr.common.AlreadyClosedException;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkCoreNodeProps;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
@@ -102,10 +107,11 @@ public class RecoveryStrategy implements Runnable, Closeable {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
-  private int waitForUpdatesWithStaleStatePauseMilliSeconds = Integer
+  private volatile int waitForUpdatesWithStaleStatePauseMilliSeconds = Integer
       .getInteger("solr.cloud.wait-for-updates-with-stale-state-pause", 2500);
-  private int maxRetries = 500;
-  private int startingRecoveryDelayMilliSeconds = 2000;
+  private volatile int maxRetries = 500;
+  private volatile int startingRecoveryDelayMilliSeconds = Integer
+          .getInteger("solr.cloud.starting-recovery-delay-milli-seconds", 2000);
 
   public static interface RecoveryListener {
     public void recovered();
@@ -114,20 +120,19 @@ public class RecoveryStrategy implements Runnable, Closeable {
   }
 
   private volatile boolean close = false;
-
-  private RecoveryListener recoveryListener;
-  private ZkController zkController;
-  private String baseUrl;
-  private String coreZkNodeName;
-  private ZkStateReader zkStateReader;
+  private volatile RecoveryListener recoveryListener;
+  private final ZkController zkController;
+  private final String baseUrl;
+  private volatile String coreZkNodeName;
+  private final ZkStateReader zkStateReader;
   private volatile String coreName;
-  private int retries;
+  private AtomicInteger retries = new AtomicInteger(0);
   private boolean recoveringAfterStartup;
-  private CoreContainer cc;
   private volatile HttpUriRequest prevSendPreRecoveryHttpUriRequest;
-  private final Replica.Type replicaType;
+  private volatile Replica.Type replicaType;
+  private volatile CoreDescriptor coreDescriptor;
 
-  private CoreDescriptor coreDescriptor;
+  private CoreContainer cc;
 
   protected RecoveryStrategy(CoreContainer cc, CoreDescriptor cd, RecoveryListener recoveryListener) {
     this.cc = cc;
@@ -182,6 +187,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
             .withConnectionTimeout(cfg.getDistributedConnectionTimeout())
             .withSocketTimeout(cfg.getDistributedSocketTimeout())
             .withHttpClient(cc.getUpdateShardHandler().getRecoveryOnlyHttpClient())
+            .markInternalRequest()
             ).build();
   }
   
@@ -189,9 +195,28 @@ public class RecoveryStrategy implements Runnable, Closeable {
   @Override
   final public void close() {
     close = true;
-    if (prevSendPreRecoveryHttpUriRequest != null) {
+    try {
       prevSendPreRecoveryHttpUriRequest.abort();
+    } catch (NullPointerException e) {
+      // expected when no pre-recovery request is in flight (field still null)
+    }
+
+    try (SolrCore core = cc.getCore(coreName)) {
+
+      if (core == null) {
+        SolrException.log(log, "SolrCore not found - cannot recover:" + coreName);
+        return;
+      }
+      SolrRequestHandler handler = core.getRequestHandler(ReplicationHandler.PATH);
+      ReplicationHandler replicationHandler = (ReplicationHandler) handler;
+
+      if (replicationHandler == null) {
+        throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE,
+                "Skipping recovery, no " + ReplicationHandler.PATH + " handler found");
+      }
+      replicationHandler.abortFetch();
     }
+
     log.warn("Stopping recovery for core=[{}] coreNodeName=[{}]", coreName, coreZkNodeName);
   }
 
@@ -278,6 +303,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
           searchHolder.decref();
         }
       } catch (Exception e) {
+        ParWork.propegateInterrupt(e);
         log.debug("Error in solrcloud_debug block", e);
       }
     }
@@ -440,8 +466,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
 
           log.error("Recovery failed - trying again... ({})", retries);
 
-          retries++;
-          if (retries >= maxRetries) {
+
+          if (retries.incrementAndGet() >= maxRetries) {
             SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + ").");
             try {
               recoveryFailed(core, zkController, baseUrl, coreZkNodeName, this.coreDescriptor);
@@ -459,11 +485,9 @@ public class RecoveryStrategy implements Runnable, Closeable {
           // If we're at attempt >= 4, there's no point computing pow(2, retries) because the result
           // will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in
           // order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m).
-          int loopCount = retries < 4 ? (int) Math.min(Math.pow(2, retries), 12) : 12;
-          if (log.isInfoEnabled()) {
-            log.info("Wait [{}] seconds before trying to recover again (attempt={})",
-                TimeUnit.MILLISECONDS.toSeconds(loopCount * startingRecoveryDelayMilliSeconds), retries);
-          }
+          int loopCount =  retries.get() < 4 ? (int) Math.min(Math.pow(2, retries.get()), 12) : 12;
+          log.info("Wait [{}] seconds before trying to recover again (attempt={})",
+              TimeUnit.MILLISECONDS.toSeconds(loopCount * startingRecoveryDelayMilliSeconds), retries);
           for (int i = 0; i < loopCount; i++) {
             if (isClosed()) {
               if (log.isInfoEnabled()) {
@@ -505,6 +529,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
     try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
       recentVersions = recentUpdates.getVersions(ulog.getNumRecordsToKeep());
     } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
       SolrException.log(log, "Corrupt tlog - ignoring.", e);
       recentVersions = new ArrayList<>(0);
     }
@@ -537,6 +562,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
           }
         }
       } catch (Exception e) {
+        ParWork.propegateInterrupt(e);
         SolrException.log(log, "Error getting recent versions.", e);
         recentVersions = new ArrayList<>(0);
       }
@@ -555,6 +581,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
           firstTime = false; // skip peersync
         }
       } catch (Exception e) {
+        ParWork.propegateInterrupt(e);
         SolrException.log(log, "Error trying to get ulog starting operation.", e);
         firstTime = false; // skip peersync
       }
@@ -578,16 +605,9 @@ public class RecoveryStrategy implements Runnable, Closeable {
         }
 
         boolean isLeader = leader.getCoreUrl().equals(ourUrl);
-        if (isLeader && !cloudDesc.isLeader()) {
+        if (isLeader && !cloudDesc.isLeader() && leader.getState().equals(Replica.State.ACTIVE)) {
           throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader.");
         }
-        if (cloudDesc.isLeader()) {
-          // we are now the leader - no one else must have been suitable
-          log.warn("We have not yet recovered - but we are now the leader!");
-          log.info("Finished recovery process.");
-          zkController.publish(this.coreDescriptor, Replica.State.ACTIVE);
-          return;
-        }
 
         log.info("Begin buffering updates. core=[{}]", coreName);
         // recalling buffer updates will drop the old buffer tlog
@@ -736,8 +756,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
 
           log.error("Recovery failed - trying again... ({})", retries);
 
-          retries++;
-          if (retries >= maxRetries) {
+          if (retries.incrementAndGet() >= maxRetries) {
             SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + ").");
             try {
               recoveryFailed(core, zkController, baseUrl, coreZkNodeName, this.coreDescriptor);
@@ -754,7 +773,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
           // Wait an exponential interval between retries, start at 2 seconds and work up to a minute.
           // Since we sleep at 2 seconds sub-intervals in
           // order to check if we were closed, 30 is chosen as the maximum loopCount (2s * 30 = 1m).
-          double loopCount = Math.min(Math.pow(2, retries - 1), 30);
+          double loopCount = Math.min(Math.pow(2, retries.get() - 1), 30);
           log.info("Wait [{}] seconds before trying to recover again (attempt={})",
               loopCount * startingRecoveryDelayMilliSeconds, retries);
           for (int i = 0; i < loopCount; i++) {
@@ -793,20 +812,28 @@ public class RecoveryStrategy implements Runnable, Closeable {
           docCollection.getReplica(coreDesc.getCloudDescriptor().getCoreNodeName())
               .getState() == Replica.State.ACTIVE) {
         // this operation may take a long time, by putting replica into DOWN state, client won't query this replica
-        zkController.publish(coreDesc, Replica.State.DOWN);
+        //zkController.publish(coreDesc, Replica.State.DOWN);
+        // We should be in recovery and ignored by queries
       }
       numTried++;
+
+      if (numTried > 5) {
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Could not ping leader");
+        // instead of hammering on the leader,
+        // let recovery process continue normally
+      }
+
       Replica leaderReplica = null;
 
       if (isClosed()) {
-        return leaderReplica;
+        throw new AlreadyClosedException();
       }
 
       try {
         leaderReplica = zkStateReader.getLeaderRetry(
             cloudDesc.getCollectionName(), cloudDesc.getShardId());
       } catch (SolrException e) {
-        Thread.sleep(500);
+        Thread.sleep(250);
         continue;
       }
 
@@ -819,13 +846,13 @@ public class RecoveryStrategy implements Runnable, Closeable {
         return leaderReplica;
       } catch (IOException e) {
         log.error("Failed to connect leader {} on recovery, try again", leaderReplica.getBaseUrl());
-        Thread.sleep(500);
+        Thread.sleep(250);
       } catch (Exception e) {
         if (e.getCause() instanceof IOException) {
           log.error("Failed to connect leader {} on recovery, try again", leaderReplica.getBaseUrl());
-          Thread.sleep(500);
+          Thread.sleep(250);
         } else {
-          return leaderReplica;
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
       }
     }
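
Two recurring themes in the RecoveryStrategy hunks above: the retry counter
becomes an AtomicInteger so concurrent paths can advance it safely, and the
retry waits keep their capped exponential shape. A sketch of that backoff
under the assumptions stated in the surrounding comments (5-second
sub-intervals, capped so the total wait is about a minute, checking for
close between sub-intervals; assumes a surrounding method that declares
InterruptedException):

    int attempt = retries.incrementAndGet();        // thread-safe bump
    // doubles per attempt, clamped at 12 iterations (12 * 5s = 1m)
    int loopCount = attempt < 4 ? (int) Math.min(Math.pow(2, attempt), 12) : 12;
    for (int i = 0; i < loopCount; i++) {
      if (isClosed()) break;                        // notice close() promptly
      Thread.sleep(5000L);                          // assumed sub-interval
    }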
diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java
index 17a6ec3..229cefa 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ReplicateFromLeader.java
@@ -17,6 +17,8 @@
 
 package org.apache.solr.cloud;
 
+import java.io.Closeable;
+import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 
 import org.apache.lucene.index.IndexCommit;
@@ -36,7 +38,7 @@ import org.apache.solr.update.UpdateLog;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public class ReplicateFromLeader {
+public class ReplicateFromLeader implements Closeable {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   private final CoreContainer cc;
@@ -133,7 +135,12 @@ public class ReplicateFromLeader {
 
   public void stopReplication() {
     if (replicationProcess != null) {
-      replicationProcess.shutdown();
+      replicationProcess.close();
     }
   }
+
+  @Override
+  public void close() throws IOException {
+    stopReplication();
+  }
 }
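
Implementing Closeable lets ReplicateFromLeader be shut down through the same
generic resource-closing paths (ParWork, IOUtils-style helpers) as everything
else in this branch; close() simply delegates to stopReplication(). A
hypothetical caller (startReplication() here is illustrative, not a method
shown in this diff):

    try (ReplicateFromLeader replicator = startReplication()) {
      // ... pull index updates from the shard leader ...
    } // close() -> stopReplication() -> replicationProcess.close()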
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
index f6c96ca..6705cb0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
@@ -18,12 +18,15 @@ package org.apache.solr.cloud;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
-import java.util.EnumSet;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
 
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.solr.cloud.overseer.OverseerAction;
+import org.apache.solr.common.AlreadyClosedException;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.ClusterState;
@@ -35,6 +38,7 @@ import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.CoreDescriptor;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.search.SolrIndexSearcher;
@@ -53,34 +57,64 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
   private final CoreContainer cc;
   private final SyncStrategy syncStrategy;
 
+  protected final String shardId;
+
+  protected final String collection;
+  protected final LeaderElector leaderElector;
+
   private volatile boolean isClosed = false;
 
+  private final ZkController zkController;
+
   public ShardLeaderElectionContext(LeaderElector leaderElector,
                                     final String shardId, final String collection,
                                     final String coreNodeName, ZkNodeProps props, ZkController zkController, CoreContainer cc) {
-    super(leaderElector, shardId, collection, coreNodeName, props,
-        zkController);
+    super(coreNodeName, ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection
+                    + "/leader_elect/" + shardId,  ZkStateReader.getShardLeadersPath(
+            collection, shardId), props,
+            zkController.getZkClient());
+    System.out.println("MAKE SHARD LEADER ECONTEXT: " + cc.isShutDown());
     this.cc = cc;
-    syncStrategy = new SyncStrategy(cc);
+    this.syncStrategy = new SyncStrategy(cc);
+    this.shardId = shardId;
+    this.leaderElector = leaderElector;
+    this.zkController = zkController;
+    this.collection = collection;
   }
 
   @Override
   public void close() {
+    System.out.println("CLOSE SHARD LEADER CONTEXT");
     super.close();
+    try {
+      cancelElection();
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      log.error("Exception canceling election", e);
+    }
+    try {
+      syncStrategy.close();
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      log.error("Exception closing SyncStrategy", e);
+    }
+
     this.isClosed = true;
-    syncStrategy.close();
   }
 
   @Override
   public void cancelElection() throws InterruptedException, KeeperException {
+    super.cancelElection();
     String coreName = leaderProps.getStr(ZkStateReader.CORE_NAME_PROP);
-    try (SolrCore core = cc.getCore(coreName)) {
-      if (core != null) {
-        core.getCoreDescriptor().getCloudDescriptor().setLeader(false);
+    try {
+      try (SolrCore core = cc.getCore(coreName)) {
+        if (core != null) {
+          core.getCoreDescriptor().getCloudDescriptor().setLeader(false);
+        }
       }
+    } catch (AlreadyClosedException e) {
+      // okay
     }
-
-    super.cancelElection();
   }
 
   @Override
@@ -88,12 +122,18 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
     return new ShardLeaderElectionContext(leaderElector, shardId, collection, id, leaderProps, zkController, cc);
   }
 
+
+
+  public LeaderElector getLeaderElector() {
+    return leaderElector;
+  }
+
   /*
    * weAreReplacement: has someone else been the leader already?
    */
   @Override
-  void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStart) throws KeeperException,
-      InterruptedException, IOException {
+  void runLeaderProcess(ElectionContext context, boolean weAreReplacement, int pauseBeforeStart) throws KeeperException,
+          InterruptedException, IOException {
     String coreName = leaderProps.getStr(ZkStateReader.CORE_NAME_PROP);
     ActionThrottle lt;
     try (SolrCore core = cc.getCore(coreName)) {
@@ -109,23 +149,19 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
       lt.minimumWaitBetweenActions();
       lt.markAttemptingAction();
 
-
       int leaderVoteWait = cc.getZkController().getLeaderVoteWait();
 
-      log.debug("Running the leader process for shard={} and weAreReplacement={} and leaderVoteWait={}", shardId, weAreReplacement, leaderVoteWait);
-      if (zkController.getClusterState().getCollection(collection).getSlice(shardId).getReplicas().size() > 1) {
-        // Clear the leader in clusterstate. We only need to worry about this if there is actually more than one replica.
-        ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.LEADER.toLower(),
-            ZkStateReader.SHARD_ID_PROP, shardId, ZkStateReader.COLLECTION_PROP, collection);
-        zkController.getOverseer().getStateUpdateQueue().offer(Utils.toJSON(m));
-      }
+      log.debug("Running the leader process for shard={} and weAreReplacement={} and leaderVoteWait={}", shardId,
+              weAreReplacement, leaderVoteWait);
 
-      boolean allReplicasInLine = false;
-      if (!weAreReplacement) {
-        allReplicasInLine = waitForReplicasToComeUp(leaderVoteWait);
-      } else {
-        allReplicasInLine = areAllReplicasParticipating();
-      }
+//      ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.LEADER.toLower(),
+//              ZkStateReader.SHARD_ID_PROP, shardId, ZkStateReader.COLLECTION_PROP, collection);
+//      try {
+//        zkController.getOverseer().offerStateUpdate(Utils.toJSON(m));
+//      } catch (Exception e1) {
+//        ParWork.propegateInterrupt(e1);
+//        throw new SolrException(ErrorCode.SERVER_ERROR, e1);
+//      }
 
       if (isClosed) {
         // Solr is shutting down or the ZooKeeper session expired while waiting for replicas. If the later,
@@ -142,13 +178,14 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         if (core == null) {
           return;
         }
-
-        replicaType = core.getCoreDescriptor().getCloudDescriptor().getReplicaType();
-        coreNodeName = core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName();
+        CoreDescriptor cd = core.getCoreDescriptor();
+        CloudDescriptor cloudCd = cd.getCloudDescriptor();
+        replicaType = cloudCd.getReplicaType();
+        coreNodeName = cloudCd.getCoreNodeName();
         // should I be leader?
         ZkShardTerms zkShardTerms = zkController.getShardTerms(collection, shardId);
         if (zkShardTerms.registered(coreNodeName) && !zkShardTerms.canBecomeLeader(coreNodeName)) {
-          if (!waitForEligibleBecomeLeaderAfterTimeout(zkShardTerms, coreNodeName, leaderVoteWait)) {
+          if (!waitForEligibleBecomeLeaderAfterTimeout(zkShardTerms, cd, leaderVoteWait)) {
             rejoinLeaderElection(core);
             return;
           } else {
@@ -163,28 +200,19 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
         log.info("I may be the new leader - try and sync");
 
+        // nocommit
         // we are going to attempt to be the leader
         // first cancel any current recovery
         core.getUpdateHandler().getSolrCoreState().cancelRecovery();
 
-        if (weAreReplacement) {
-          // wait a moment for any floating updates to finish
-          try {
-            Thread.sleep(2500);
-          } catch (InterruptedException e) {
-            Thread.currentThread().interrupt();
-            throw new SolrException(SolrException.ErrorCode.SERVICE_UNAVAILABLE, e);
-          }
-        }
-
         PeerSync.PeerSyncResult result = null;
         boolean success = false;
         try {
           result = syncStrategy.sync(zkController, core, leaderProps, weAreReplacement);
           success = result.isSuccess();
         } catch (Exception e) {
-          SolrException.log(log, "Exception while trying to sync", e);
-          result = PeerSync.PeerSyncResult.failure();
+          ParWork.propegateInterrupt(e);
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
 
         UpdateLog ulog = core.getUpdateHandler().getUpdateLog();
@@ -203,11 +231,12 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
             // - we were active
             // before, so become leader anyway if no one else has any versions either
             if (result.getOtherHasVersions().orElse(false)) {
-              log.info("We failed sync, but we have no versions - we can't sync in that case. But others have some versions, so we should not become leader");
+              log.info(
+                      "We failed sync, but we have no versions - we can't sync in that case. But others have some versions, so we should not become leader");
               success = false;
             } else {
               log.info(
-                  "We failed sync, but we have no versions - we can't sync in that case - we were active before, so become leader anyway");
+                      "We failed sync, but we have no versions - we can't sync in that case - we were active before, so become leader anyway");
               success = true;
             }
           }
@@ -219,15 +248,14 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
             RefCounted<SolrIndexSearcher> searchHolder = core.getNewestSearcher(false);
             SolrIndexSearcher searcher = searchHolder.get();
             try {
-              if (log.isDebugEnabled()) {
-                log.debug("{} synched {}", core.getCoreContainer().getZkController().getNodeName()
-                    , searcher.count(new MatchAllDocsQuery()));
-              }
+              log.debug(core.getCoreContainer().getZkController().getNodeName() + " synched "
+                      + searcher.count(new MatchAllDocsQuery()));
             } finally {
               searchHolder.decref();
             }
           } catch (Exception e) {
-            log.error("Error in solrcloud_debug block", e);
+            ParWork.propegateInterrupt(e);
+            throw new SolrException(ErrorCode.SERVER_ERROR, e);
           }
         }
         if (!success) {
@@ -236,8 +264,6 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
         }
 
       }
-
-      boolean isLeader = true;
       if (!isClosed) {
         try {
           if (replicaType == Replica.Type.TLOG) {
@@ -257,11 +283,26 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
           }
           // in case of leaderVoteWait timeout, a replica with lower term can win the election
           if (setTermToMax) {
-            log.error("WARNING: Potential data loss -- Replica {} became leader after timeout (leaderVoteWait) {}"
-                , "without being up-to-date with the previous leader", coreNodeName);
+            log.error("WARNING: Potential data loss -- Replica {} became leader after timeout (leaderVoteWait) " +
+                    "without being up-to-date with the previous leader", coreNodeName);
             zkController.getShardTerms(collection, shardId).setTermEqualsToLeader(coreNodeName);
           }
-          super.runLeaderProcess(weAreReplacement, 0);
+          super.runLeaderProcess(context, weAreReplacement, 0);
+
+          assert shardId != null;
+
+          ZkNodeProps zkNodes = ZkNodeProps.fromKeyVals(Overseer.QUEUE_OPERATION, OverseerAction.LEADER.toLower(),
+                  ZkStateReader.SHARD_ID_PROP, shardId,
+                  ZkStateReader.COLLECTION_PROP, collection,
+                  ZkStateReader.BASE_URL_PROP, leaderProps.get(ZkStateReader.BASE_URL_PROP),
+                  ZkStateReader.NODE_NAME_PROP, leaderProps.get(ZkStateReader.NODE_NAME_PROP),
+                  ZkStateReader.CORE_NAME_PROP, leaderProps.get(ZkStateReader.CORE_NAME_PROP),
+                  ZkStateReader.CORE_NODE_NAME_PROP, leaderProps.get(ZkStateReader.CORE_NODE_NAME_PROP),
+                  ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString());
+          assert zkController != null;
+          assert zkController.getOverseer() != null;
+          zkController.getOverseer().offerStateUpdate(Utils.toJSON(zkNodes));
+
           try (SolrCore core = cc.getCore(coreName)) {
             if (core != null) {
               core.getCoreDescriptor().getCloudDescriptor().setLeader(true);
@@ -270,43 +311,37 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
               return;
             }
           }
-          if (log.isInfoEnabled()) {
-            log.info("I am the new leader: {} {}", ZkCoreNodeProps.getCoreUrl(leaderProps), shardId);
-          }
-
-          // we made it as leader - send any recovery requests we need to
-          syncStrategy.requestRecoveries();
+          log.info("I am the new leader: " + ZkCoreNodeProps.getCoreUrl(leaderProps) + " " + shardId);
 
-        } catch (SessionExpiredException e) {
-          throw new SolrException(ErrorCode.SERVER_ERROR,
-              "ZK session expired - cancelling election for " + collection + " " + shardId);
         } catch (Exception e) {
-          isLeader = false;
           SolrException.log(log, "There was a problem trying to register as the leader", e);
+          ParWork.propegateInterrupt(e);
+          if(e instanceof IOException
+                  || (e instanceof KeeperException && (!(e instanceof SessionExpiredException)))) {
 
-          try (SolrCore core = cc.getCore(coreName)) {
+            try (SolrCore core = cc.getCore(coreName)) {
 
-            if (core == null) {
-              if (log.isDebugEnabled()) {
-                log.debug("SolrCore not found: {} in {}", coreName, cc.getLoadedCoreNames());
+              if (core == null) {
+                if (log.isDebugEnabled())
+                  log.debug("SolrCore not found:" + coreName + " in " + cc.getLoadedCoreNames());
+                return;
+              }
+              core.getCoreDescriptor().getCloudDescriptor().setLeader(false);
+
+              // we could not publish ourselves as leader - try and rejoin election
+              try {
+                rejoinLeaderElection(core);
+              } catch (Exception exc) {
+                ParWork.propegateInterrupt(exc);
+                throw new SolrException(ErrorCode.SERVER_ERROR, exc);
               }
-              return;
-            }
-
-            core.getCoreDescriptor().getCloudDescriptor().setLeader(false);
-
-            // we could not publish ourselves as leader - try and rejoin election
-            try {
-              rejoinLeaderElection(core);
-            } catch (SessionExpiredException exc) {
-              throw new SolrException(ErrorCode.SERVER_ERROR,
-                  "ZK session expired - cancelling election for " + collection + " " + shardId);
             }
+          } else {
+            throw new SolrException(ErrorCode.SERVER_ERROR, e);
           }
         }
-      } else {
-        cancelElection();
       }
+
     } finally {
       MDCLoggingContext.clear();
     }
@@ -314,30 +349,37 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
   /**
    * Wait for other replicas with higher terms to participate in the election
-   *
    * @return true if, after {@code timeout}, no other replica with a higher term has participated in the election;
    * false otherwise
    */
-  private boolean waitForEligibleBecomeLeaderAfterTimeout(ZkShardTerms zkShardTerms, String coreNodeName, int timeout) throws InterruptedException {
-    long timeoutAt = System.nanoTime() + TimeUnit.NANOSECONDS.convert(timeout, TimeUnit.MILLISECONDS);
-    while (!isClosed && !cc.isShutDown()) {
-      if (System.nanoTime() > timeoutAt) {
-        log.warn("After waiting for {}ms, no other potential leader was found, {} try to become leader anyway (core_term:{}, highest_term:{})",
-            timeout, coreNodeName, zkShardTerms.getTerm(coreNodeName), zkShardTerms.getHighestTerm());
-        return true;
-      }
-      if (replicasWithHigherTermParticipated(zkShardTerms, coreNodeName)) {
-        log.info("Can't become leader, other replicas with higher term participated in leader election");
-        return false;
-      }
-      Thread.sleep(500L);
+  private boolean waitForEligibleBecomeLeaderAfterTimeout(ZkShardTerms zkShardTerms, CoreDescriptor cd, int timeout) throws InterruptedException {
+    String coreNodeName = cd.getCloudDescriptor().getCoreNodeName();
+    AtomicReference<Boolean> foundHigherTerm = new AtomicReference<>();
+    try {
+      zkController.getZkStateReader().waitForState(cd.getCollectionName(), timeout, TimeUnit.MILLISECONDS, (n,c) -> foundForHigherTermReplica(zkShardTerms, cd, foundHigherTerm));
+    } catch (TimeoutException e) {
+      log.warn("After waiting for {}ms, no other potential leader was found, {} try to become leader anyway (" +
+                      "core_term:{}, highest_term:{})",
+              timeout, cd, zkShardTerms.getTerm(coreNodeName), zkShardTerms.getHighestTerm());
+      return true;
     }
+
+    return false;
+  }
+
+  private boolean foundForHigherTermReplica(ZkShardTerms zkShardTerms, CoreDescriptor cd, AtomicReference<Boolean> foundHigherTerm) {
+    String coreNodeName = cd.getCloudDescriptor().getCoreNodeName();
+    if (replicasWithHigherTermParticipated(zkShardTerms, coreNodeName)) {
+      log.info("Can't become leader, other replicas with higher term participated in leader election");
+      foundHigherTerm.set(true);
+      return true;
+    }
+
     return false;
   }
 
   /**
    * Whether other replicas with a higher term participated in the election
-   *
    * @return true if other replicas with a higher term participated in the election, false otherwise
    */
   private boolean replicasWithHigherTermParticipated(ZkShardTerms zkShardTerms, String coreNodeName) {
@@ -364,117 +406,12 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
   }
 
   public void publishActiveIfRegisteredAndNotActive(SolrCore core) throws Exception {
-    if (core.getCoreDescriptor().getCloudDescriptor().hasRegistered()) {
-      ZkStateReader zkStateReader = zkController.getZkStateReader();
-      zkStateReader.forceUpdateCollection(collection);
-      ClusterState clusterState = zkStateReader.getClusterState();
-      Replica rep = getReplica(clusterState, collection, leaderProps.getStr(ZkStateReader.CORE_NODE_NAME_PROP));
-      if (rep == null) return;
-      if (rep.getState() != Replica.State.ACTIVE || core.getCoreDescriptor().getCloudDescriptor().getLastPublished() != Replica.State.ACTIVE) {
-        log.debug("We have become the leader after core registration but are not in an ACTIVE state - publishing ACTIVE");
-        zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
-      }
-    }
-  }
-
-  private Replica getReplica(ClusterState clusterState, String collectionName, String replicaName) {
-    if (clusterState == null) return null;
-    final DocCollection docCollection = clusterState.getCollectionOrNull(collectionName);
-    if (docCollection == null) return null;
-    return docCollection.getReplica(replicaName);
-  }
-
-  // returns true if all replicas are found to be up, false if not
-  private boolean waitForReplicasToComeUp(int timeoutms) throws InterruptedException {
-    long timeoutAt = System.nanoTime() + TimeUnit.NANOSECONDS.convert(timeoutms, TimeUnit.MILLISECONDS);
-    final String shardsElectZkPath = electionPath + LeaderElector.ELECTION_NODE;
-
-    DocCollection docCollection = zkController.getClusterState().getCollectionOrNull(collection);
-    Slice slices = (docCollection == null) ? null : docCollection.getSlice(shardId);
-    int cnt = 0;
-    while (!isClosed && !cc.isShutDown()) {
-      // wait for everyone to be up
-      if (slices != null) {
-        int found = 0;
-        try {
-          found = zkClient.getChildren(shardsElectZkPath, null, true).size();
-        } catch (KeeperException e) {
-          if (e instanceof KeeperException.SessionExpiredException) {
-            // if the session has expired, then another election will be launched, so
-            // quit here
-            throw new SolrException(ErrorCode.SERVER_ERROR,
-                "ZK session expired - cancelling election for " + collection + " " + shardId);
-          }
-          SolrException.log(log,
-              "Error checking for the number of election participants", e);
-        }
-
-        // on startup and after connection timeout, wait for all known shards
-        if (found >= slices.getReplicas(EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT)).size()) {
-          log.info("Enough replicas found to continue.");
-          return true;
-        } else {
-          if (cnt % 40 == 0) {
-            if (log.isInfoEnabled()) {
-              log.info("Waiting until we see more replicas up for shard {}: total={} found={} timeoute in={}ms"
-                  , shardId, slices.getReplicas(EnumSet.of(Replica.Type.TLOG, Replica.Type.NRT)).size(), found,
-                  TimeUnit.MILLISECONDS.convert(timeoutAt - System.nanoTime(), TimeUnit.NANOSECONDS));
-            }
-          }
-        }
-
-        if (System.nanoTime() > timeoutAt) {
-          log.info("Was waiting for replicas to come up, but they are taking too long - assuming they won't come back till later");
-          return false;
-        }
-      } else {
-        log.warn("Shard not found: {} for collection {}", shardId, collection);
-
-        return false;
-
-      }
-
-      Thread.sleep(500);
-      docCollection = zkController.getClusterState().getCollectionOrNull(collection);
-      slices = (docCollection == null) ? null : docCollection.getSlice(shardId);
-      cnt++;
-    }
-    return false;
-  }
-
-  // returns true if all replicas are found to be up, false if not
-  private boolean areAllReplicasParticipating() throws InterruptedException {
-    final String shardsElectZkPath = electionPath + LeaderElector.ELECTION_NODE;
-    final DocCollection docCollection = zkController.getClusterState().getCollectionOrNull(collection);
-
-    if (docCollection != null && docCollection.getSlice(shardId) != null) {
-      final Slice slices = docCollection.getSlice(shardId);
-      int found = 0;
-      try {
-        found = zkClient.getChildren(shardsElectZkPath, null, true).size();
-      } catch (KeeperException e) {
-        if (e instanceof KeeperException.SessionExpiredException) {
-          // if the session has expired, then another election will be launched, so
-          // quit here
-          throw new SolrException(ErrorCode.SERVER_ERROR,
-              "ZK session expired - cancelling election for " + collection + " " + shardId);
-        }
-        SolrException.log(log, "Error checking for the number of election participants", e);
-      }
-
-      if (found >= slices.getReplicasMap().size()) {
-        log.debug("All replicas are ready to participate in election.");
-        return true;
-      }
-    } else {
-      log.warn("Shard not found: {} for collection {}", shardId, collection);
-      return false;
-    }
-    return false;
+    if (log.isDebugEnabled()) log.debug("We have become the leader after core registration but are not in an ACTIVE state - publishing ACTIVE");
+    zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE, true, false);
   }
 
   private void rejoinLeaderElection(SolrCore core)
-      throws InterruptedException, KeeperException, IOException {
+          throws InterruptedException, KeeperException, IOException {
     // remove our ephemeral and re join the election
     if (cc.isShutDown()) {
       log.debug("Not rejoining election because CoreContainer is closed");
@@ -485,9 +422,18 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
     cancelElection();
 
-    core.getUpdateHandler().getSolrCoreState().doRecovery(cc, core.getCoreDescriptor());
+    core.getUpdateHandler().getSolrCoreState().doRecovery(zkController.getCoreContainer(), core.getCoreDescriptor());
 
     leaderElector.joinElection(this, true);
   }
 
+  public String getShardId() {
+    return shardId;
+  }
+
+  public String getCollection() {
+    return collection;
+  }
+
 }
+
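
For readers tracing the flow above: rejoinLeaderElection deliberately cancels the current registration first, kicks off recovery, and only then re-enters the election so the core comes back with a fresh sequence node. A minimal sketch of that ordering, using hypothetical Elector and Recovery interfaces rather than the real Solr classes:

    import java.io.IOException;

    // Hypothetical stand-ins for LeaderElector and SolrCoreState.
    interface Elector {
      void cancelElection() throws IOException;
      void joinElection(boolean rejoiningAfterLoss) throws IOException;
    }

    interface Recovery {
      void doRecovery();
    }

    final class RejoinSketch {
      // Cancel first so the old ephemeral election node is gone, recover,
      // then rejoin so we re-enter the queue behind the other candidates.
      static void rejoin(Elector elector, Recovery recovery) throws IOException {
        elector.cancelElection();
        recovery.doRecovery();
        elector.joinElection(true);
      }
    }
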
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
index a9afc8d..6054b35 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
@@ -19,24 +19,21 @@ package org.apache.solr.cloud;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.nio.file.Paths;
+import java.util.Iterator;
 import java.util.List;
 import java.util.ArrayList;
 
-import org.apache.hadoop.fs.Path;
-import org.apache.solr.cloud.overseer.OverseerAction;
+import org.apache.solr.common.AlreadyClosedException;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.SolrZkClient;
-import org.apache.solr.common.cloud.ZkCmdExecutor;
 import org.apache.solr.common.cloud.ZkNodeProps;
-import org.apache.solr.common.cloud.ZkStateReader;
-import org.apache.solr.common.util.RetryUtil;
 import org.apache.solr.common.util.Utils;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.NoNodeException;
-import org.apache.zookeeper.KeeperException.NodeExistsException;
 import org.apache.zookeeper.Op;
 import org.apache.zookeeper.OpResult;
 import org.apache.zookeeper.OpResult.SetDataResult;
@@ -48,66 +45,79 @@ import org.slf4j.LoggerFactory;
 class ShardLeaderElectionContextBase extends ElectionContext {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   protected final SolrZkClient zkClient;
-  protected String shardId;
-  protected String collection;
-  protected LeaderElector leaderElector;
-  protected ZkStateReader zkStateReader;
-  protected ZkController zkController;
-  private Integer leaderZkNodeParentVersion;
+
+  private volatile Integer leaderZkNodeParentVersion;
 
   // Prevents a race between cancelling and becoming leader.
   private final Object lock = new Object();
 
-  public ShardLeaderElectionContextBase(LeaderElector leaderElector,
-                                        final String shardId, final String collection, final String coreNodeName,
-                                        ZkNodeProps props, ZkController zkController) {
-    super(coreNodeName, ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection
-        + "/leader_elect/" + shardId, ZkStateReader.getShardLeadersPath(
-        collection, shardId), props, zkController.getZkClient());
-    this.leaderElector = leaderElector;
-    this.zkStateReader = zkController.getZkStateReader();
-    this.zkClient = zkStateReader.getZkClient();
-    this.zkController = zkController;
-    this.shardId = shardId;
-    this.collection = collection;
-
-    String parent = new Path(leaderPath).getParent().toString();
-    ZkCmdExecutor zcmd = new ZkCmdExecutor(30000);
-    // only if /collections/{collection} exists already do we succeed in creating this path
-    log.info("make sure parent is created {}", parent);
+  public ShardLeaderElectionContextBase(final String coreNodeName, String electionPath, String leaderPath,
+                                        ZkNodeProps props, SolrZkClient zkClient) {
+    super(coreNodeName, electionPath, leaderPath, props);
+    this.zkClient = zkClient;
+  }
+
+  @Override
+  public void close() {
     try {
-      zcmd.ensureExists(parent, (byte[]) null, CreateMode.PERSISTENT, zkClient, 2);
-    } catch (KeeperException e) {
-      throw new RuntimeException(e);
-    } catch (InterruptedException e) {
-      Thread.currentThread().interrupt();
-      throw new RuntimeException(e);
+      super.close();
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      log.error("Exception canceling election", e);
+    }
+    try {
+      cancelElection();
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      log.error("Exception canceling election", e);
     }
   }
 
   @Override
   public void cancelElection() throws InterruptedException, KeeperException {
-    super.cancelElection();
     synchronized (lock) {
-      if (leaderZkNodeParentVersion != null) {
-        // no problem
-        // no problem
+      super.cancelElection();
+
+      Integer version = leaderZkNodeParentVersion;
+      if (version != null) {
         try {
           // We need to be careful and make sure we *only* delete our own leader registration node.
           // We do this by using a multi and ensuring the parent znode of the leader registration node
           // matches the version we expect - there is a setData call that increments the parent's znode
           // version whenever a leader registers.
-          log.debug("Removing leader registration node on cancel: {} {}", leaderPath, leaderZkNodeParentVersion);
+          log.debug("Removing leader registration node on cancel: {} {}", leaderPath, version);
           List<Op> ops = new ArrayList<>(2);
-          ops.add(Op.check(new Path(leaderPath).getParent().toString(), leaderZkNodeParentVersion));
+          ops.add(Op.check(Paths.get(leaderPath).getParent().toString(), version));
+          ops.add(Op.check(electionPath, -1));
           ops.add(Op.delete(leaderPath, -1));
           zkClient.multi(ops, true);
-        } catch (InterruptedException e) {
-          throw e;
-        } catch (IllegalArgumentException e) {
-          SolrException.log(log, e);
+        } catch (KeeperException e) {
+          if (e instanceof NoNodeException) {
+            // okay
+            return;
+          }
+          if (e instanceof KeeperException.SessionExpiredException) {
+            log.warn("ZooKeeper session expired");
+            throw e;
+          }
+
+          List<OpResult> results = e.getResults();
+          for (OpResult result : results) {
+            if (((OpResult.ErrorResult) result).getErr() == -101) {
+              // no node, fine
+            } else {
+              throw new SolrException(ErrorCode.SERVER_ERROR, "Exception canceling election", e);
+            }
+          }
+
+        } catch (InterruptedException | AlreadyClosedException e) {
+          ParWork.propegateInterrupt(e);
+          return;
+        } catch (Exception e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, "Exception canceling election", e);
+        } finally {
+          leaderZkNodeParentVersion = null;
         }
-        leaderZkNodeParentVersion = null;
       } else {
         log.info("No version found for ephemeral leader parent node, won't remove previous leader registration.");
       }
@@ -115,80 +125,54 @@ class ShardLeaderElectionContextBase extends ElectionContext {
   }
 
   @Override
-  void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStartMs)
-      throws KeeperException, InterruptedException, IOException {
+  void runLeaderProcess(ElectionContext context, boolean weAreReplacement, int pauseBeforeStartMs)
+          throws KeeperException, InterruptedException, IOException {
     // register as leader - if an ephemeral is already there, wait to see if it goes away
 
-    String parent = new Path(leaderPath).getParent().toString();
+    String parent = Paths.get(leaderPath).getParent().toString();
+    List<String> errors = new ArrayList<>();
     try {
-      RetryUtil.retryOnThrowable(NodeExistsException.class, 60000, 5000, () -> {
-        synchronized (lock) {
-          log.info("Creating leader registration node {} after winning as {}", leaderPath, leaderSeqPath);
-          List<Op> ops = new ArrayList<>(2);
-
-          // We use a multi operation to get the parent nodes version, which will
-          // be used to make sure we only remove our own leader registration node.
-          // The setData call used to get the parent version is also the trigger to
-          // increment the version. We also do a sanity check that our leaderSeqPath exists.
-
-          ops.add(Op.check(leaderSeqPath, -1));
-          ops.add(Op.create(leaderPath, Utils.toJSON(leaderProps), zkClient.getZkACLProvider().getACLsToAdd(leaderPath), CreateMode.EPHEMERAL));
-          ops.add(Op.setData(parent, null, -1));
-          List<OpResult> results;
-
-          results = zkClient.multi(ops, true);
-          for (OpResult result : results) {
-            if (result.getType() == ZooDefs.OpCode.setData) {
-              SetDataResult dresult = (SetDataResult) result;
-              Stat stat = dresult.getStat();
-              leaderZkNodeParentVersion = stat.getVersion();
-              return;
-            }
+      synchronized (lock) {
+      log.info("Creating leader registration node {} after winning as {}", leaderPath, leaderSeqPath);
+      //zkClient.printLayout();
+      List<Op> ops = new ArrayList<>(3);
+
+      // We use a multi operation to get the parent nodes version, which will
+      // be used to make sure we only remove our own leader registration node.
+      // The setData call used to get the parent version is also the trigger to
+      // increment the version. We also do a sanity check that our leaderSeqPath exists.
+
+      ops.add(Op.check(leaderSeqPath, -1));
+      ops.add(Op.create(leaderPath, Utils.toJSON(leaderProps), zkClient.getZkACLProvider().getACLsToAdd(leaderPath), CreateMode.EPHEMERAL));
+      ops.add(Op.setData(parent, null, -1));
+      List<OpResult> results;
+
+      results = zkClient.multi(ops, true);
+      Iterator<Op> it = ops.iterator();
+      for (OpResult result : results) {
+        Op op = it.next(); // keep ops and results in lockstep
+        if (result.getType() == ZooDefs.OpCode.setData) {
+          SetDataResult dresult = (SetDataResult) result;
+          Stat stat = dresult.getStat();
+          leaderZkNodeParentVersion = stat.getVersion();
+        }
+        if (result.getType() == ZooDefs.OpCode.error) {
+          OpResult.ErrorResult dresult = (OpResult.ErrorResult) result;
+          if (dresult.getErr() > 0) {
+            errors.add(op.getPath());
           }
-          assert leaderZkNodeParentVersion != null;
         }
-      });
-    } catch (NoNodeException e) {
-      log.info("Will not register as leader because it seems the election is no longer taking place.");
-      return;
-    } catch (Throwable t) {
-      if (t instanceof OutOfMemoryError) {
-        throw (OutOfMemoryError) t;
-      }
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Could not register as the leader because creating the ephemeral registration node in ZooKeeper failed", t);
-    }
 
-    assert shardId != null;
-    boolean isAlreadyLeader = false;
-    if (zkStateReader.getClusterState() != null &&
-        zkStateReader.getClusterState().getCollection(collection).getSlice(shardId).getReplicas().size() < 2) {
-      Replica leader = zkStateReader.getLeader(collection, shardId);
-      if (leader != null
-          && leader.getBaseUrl().equals(leaderProps.get(ZkStateReader.BASE_URL_PROP))
-          && leader.getCoreName().equals(leaderProps.get(ZkStateReader.CORE_NAME_PROP))) {
-        isAlreadyLeader = true;
       }
+      assert leaderZkNodeParentVersion != null;
     }
-    if (!isAlreadyLeader) {
-      ZkNodeProps m = ZkNodeProps.fromKeyVals(Overseer.QUEUE_OPERATION, OverseerAction.LEADER.toLower(),
-          ZkStateReader.SHARD_ID_PROP, shardId,
-          ZkStateReader.COLLECTION_PROP, collection,
-          ZkStateReader.BASE_URL_PROP, leaderProps.get(ZkStateReader.BASE_URL_PROP),
-          ZkStateReader.CORE_NAME_PROP, leaderProps.get(ZkStateReader.CORE_NAME_PROP),
-          ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString());
-      assert zkController != null;
-      assert zkController.getOverseer() != null;
-      zkController.getOverseer().offerStateUpdate(Utils.toJSON(m));
-    }
-  }
 
-  public LeaderElector getLeaderElector() {
-    return leaderElector;
+    } catch (Throwable t) {
+      ParWork.propegateInterrupt(t);
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Could not register as the leader because creating the ephemeral registration node in ZooKeeper failed: " + errors, t);
+    }
   }
 
   Integer getLeaderZkNodeParentVersion() {
-    synchronized (lock) {
-      return leaderZkNodeParentVersion;
-    }
+    return leaderZkNodeParentVersion;
   }
-}
\ No newline at end of file
+}
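
The register/cancel pair in ShardLeaderElectionContextBase leans on ZooKeeper multi transactions: registration creates the ephemeral leader node and bumps the parent znode version atomically, and cancellation deletes the leader node only if the parent version still matches. The same pattern, stripped down to the plain ZooKeeper client (the paths and a connected ZooKeeper handle are assumed):

    import java.util.Arrays;
    import java.util.List;
    import org.apache.zookeeper.CreateMode;
    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.Op;
    import org.apache.zookeeper.OpResult;
    import org.apache.zookeeper.ZooDefs;
    import org.apache.zookeeper.ZooKeeper;

    final class LeaderNodeSketch {
      // Create the ephemeral leader node and bump the parent version in one
      // atomic multi; the setData stat tells us which parent version belongs
      // to our registration.
      static int register(ZooKeeper zk, String seqPath, String leaderPath,
                          String parent, byte[] data)
          throws KeeperException, InterruptedException {
        List<OpResult> results = zk.multi(Arrays.asList(
            Op.check(seqPath, -1), // our election node must still exist
            Op.create(leaderPath, data, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL),
            Op.setData(parent, null, -1))); // increments the parent version
        return ((OpResult.SetDataResult) results.get(2)).getStat().getVersion();
      }

      // Delete only our own registration: if another leader has registered
      // since, the version check fails and the whole multi is a no-op.
      static void cancel(ZooKeeper zk, String leaderPath, String parent, int parentVersion)
          throws KeeperException, InterruptedException {
        zk.multi(Arrays.asList(
            Op.check(parent, parentVersion),
            Op.delete(leaderPath, -1)));
      }
    }
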
diff --git a/solr/core/src/java/org/apache/solr/cloud/SolrZkServer.java b/solr/core/src/java/org/apache/solr/cloud/SolrZkServer.java
index ca75183..965f80b 100644
--- a/solr/core/src/java/org/apache/solr/cloud/SolrZkServer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/SolrZkServer.java
@@ -25,6 +25,7 @@ import org.apache.zookeeper.server.quorum.QuorumPeerMain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.Closeable;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -39,7 +40,7 @@ import java.util.Properties;
 import java.util.regex.Pattern;
 
 
-public class SolrZkServer {
+public class SolrZkServer implements Closeable {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   public static final String ZK_WHITELIST_PROPERTY = "zookeeper.4lw.commands.whitelist";
@@ -142,14 +143,9 @@ public class SolrZkServer {
 
     zkThread.setDaemon(true);
     zkThread.start();
-    try {
-      Thread.sleep(500); // pause for ZooKeeper to start
-    } catch (Exception e) {
-      log.error("STARTING ZOOKEEPER", e);
-    }
   }
 
-  public void stop() {
+  public void close() {
     if (zkRun == null) return;
     zkThread.interrupt();
   }
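
With SolrZkServer now Closeable and the blind Thread.sleep(500) after startup removed, a caller that really must block until the embedded server is reachable can poll the port instead of sleeping. A sketch under that assumption (host, port, and timeout are illustrative):

    import java.io.IOException;
    import java.net.InetSocketAddress;
    import java.net.Socket;

    final class WaitForPort {
      // Poll until something accepts a TCP connection on host:port, or give up.
      static boolean await(String host, int port, long timeoutMs) throws InterruptedException {
        long deadline = System.nanoTime() + timeoutMs * 1_000_000L;
        while (System.nanoTime() < deadline) {
          try (Socket s = new Socket()) {
            s.connect(new InetSocketAddress(host, port), 250);
            return true; // server is accepting connections
          } catch (IOException e) {
            Thread.sleep(50); // not listening yet; retry
          }
        }
        return false;
      }
    }
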
diff --git a/solr/core/src/java/org/apache/solr/cloud/SyncStrategy.java b/solr/core/src/java/org/apache/solr/cloud/SyncStrategy.java
index 5a1b8da..6d8974a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/SyncStrategy.java
+++ b/solr/core/src/java/org/apache/solr/cloud/SyncStrategy.java
@@ -16,6 +16,7 @@
  */
 package org.apache.solr.cloud;
 
+import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
@@ -46,7 +47,7 @@ import org.slf4j.LoggerFactory;
 
 import static org.apache.solr.common.params.CommonParams.DISTRIB;
 
-public class SyncStrategy {
+public class SyncStrategy implements Closeable {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
@@ -59,8 +60,6 @@ public class SyncStrategy {
 
   private final ExecutorService updateExecutor;
   
-  private final List<RecoveryRequest> recoveryRequests = new ArrayList<>();
-  
   private static class RecoveryRequest {
     ZkNodeProps leaderProps;
     String baseUrl;
@@ -94,8 +93,6 @@ public class SyncStrategy {
       return PeerSync.PeerSyncResult.failure();
     }
 
-    recoveryRequests.clear();
-
     if (log.isInfoEnabled()) {
       log.info("Sync replicas to {}", ZkCoreNodeProps.getCoreUrl(leaderProps));
     }
@@ -231,24 +228,14 @@ public class SyncStrategy {
       
       if (!success) {
         if (log.isInfoEnabled()) {
-          log.info("{}: Sync failed - we will ask replica ({}) to recover."
+          log.info("{}: Sync failed - replica ({}) should try to recover."
               , ZkCoreNodeProps.getCoreUrl(leaderProps), srsp.getShardAddress());
         }
-        if (isClosed) {
-          log.info("We have been closed, don't request that a replica recover");
-        } else {
-          RecoveryRequest rr = new RecoveryRequest();
-          rr.leaderProps = leaderProps;
-          rr.baseUrl = ((ShardCoreRequest) srsp.getShardRequest()).baseUrl;
-          rr.coreName = ((ShardCoreRequest) srsp.getShardRequest()).coreName;
-          recoveryRequests.add(rr);
-        }
       } else {
         if (log.isInfoEnabled()) {
           log.info("{}: sync completed with {}", ZkCoreNodeProps.getCoreUrl(leaderProps), srsp.getShardAddress());
         }
       }
-      
     }
 
   }
@@ -289,49 +276,6 @@ public class SyncStrategy {
     this.isClosed = true;
   }
   
-  public void requestRecoveries() {
-    for (RecoveryRequest rr : recoveryRequests) {
-      try {
-        requestRecovery(rr.leaderProps, rr.baseUrl, rr.coreName);
-      } catch (SolrServerException | IOException e) {
-        log.error("Problem requesting that a replica recover", e);
-      }
-    }
-  }
-  
-  private void requestRecovery(final ZkNodeProps leaderProps, final String baseUrl, final String coreName) throws SolrServerException, IOException {
-    Thread thread = new Thread() {
-      {
-        setDaemon(true);
-      }
-      @Override
-      public void run() {
-        
-        if (isClosed) {
-          log.info("We have been closed, won't request recovery");
-          return;
-        }
-        RequestRecovery recoverRequestCmd = new RequestRecovery();
-        recoverRequestCmd.setAction(CoreAdminAction.REQUESTRECOVERY);
-        recoverRequestCmd.setCoreName(coreName);
-        
-        try (HttpSolrClient client = new HttpSolrClient.Builder(baseUrl)
-            .withHttpClient(SyncStrategy.this.client)
-            .withConnectionTimeout(30000)
-            .withSocketTimeout(120000)
-            .build()) {
-          client.request(recoverRequestCmd);
-        } catch (Throwable t) {
-          SolrException.log(log, ZkCoreNodeProps.getCoreUrl(leaderProps) + ": Could not tell a replica to recover", t);
-          if (t instanceof Error) {
-            throw (Error) t;
-          }
-        }
-      }
-    };
-    updateExecutor.execute(thread);
-  }
-  
   public static ModifiableSolrParams params(String... params) {
     ModifiableSolrParams msp = new ModifiableSolrParams();
     for (int i = 0; i < params.length; i += 2) {
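
The params(String...) helper retained at the bottom of SyncStrategy packs alternating key/value strings into a ModifiableSolrParams. A self-contained sketch of the helper and its use (the parameter names are only examples):

    import org.apache.solr.common.params.ModifiableSolrParams;

    final class ParamsSketch {
      static ModifiableSolrParams params(String... params) {
        ModifiableSolrParams msp = new ModifiableSolrParams();
        for (int i = 0; i < params.length; i += 2) {
          msp.add(params[i], params[i + 1]); // key at i, value at i + 1
        }
        return msp;
      }

      public static void main(String[] args) {
        ModifiableSolrParams p = params("distrib", "false", "getVersions", "100");
        System.out.println(p); // distrib=false&getVersions=100
      }
    }
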
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkCLI.java b/solr/core/src/java/org/apache/solr/cloud/ZkCLI.java
index 5acd63b..3178f04 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkCLI.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkCLI.java
@@ -195,6 +195,7 @@ public class ZkCLI implements CLIO {
         zkServer.start();
       }
       SolrZkClient zkClient = null;
+      CoreContainer cc = null;
       try {
         zkClient = new SolrZkClient(zkServerAddress, 30000, 30000,
             () -> {
@@ -207,7 +208,7 @@ public class ZkCLI implements CLIO {
             System.exit(1);
           }
 
-          CoreContainer cc = new CoreContainer(Paths.get(solrHome), new Properties());
+          cc = new CoreContainer(Paths.get(solrHome), new Properties());
 
           if(!ZkController.checkChrootPath(zkServerAddress, true)) {
             stdout.println("A chroot was specified in zkHost but the znode doesn't exist. ");
@@ -361,11 +362,14 @@ public class ZkCLI implements CLIO {
         }
       } finally {
         if (solrPort != null) {
-          zkServer.stop();
+          zkServer.close();
         }
         if (zkClient != null) {
           zkClient.close();
         }
+        if (cc != null) {
+          cc.shutdown();
+        }
       }
     } catch (ParseException exp) {
       stdout.println("Unexpected exception:" + exp.getMessage());
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index 2cd376c..51aba45 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -19,6 +19,7 @@ package org.apache.solr.cloud;
 import java.io.Closeable;
 import java.io.File;
 import java.io.IOException;
+import java.io.PrintWriter;
 import java.io.UnsupportedEncodingException;
 import java.lang.invoke.MethodHandles;
 import java.net.InetAddress;
@@ -29,17 +30,21 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
+import java.util.SortedSet;
 import java.util.concurrent.Callable;
+import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutionException;
@@ -51,7 +56,10 @@ import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.Supplier;
 
 import com.google.common.base.Strings;
+import org.apache.commons.io.output.StringBuilderWriter;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.curator.framework.api.transaction.CuratorTransactionResult;
+import org.apache.solr.client.solrj.cloud.DistributedLock;
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.impl.HttpSolrClient;
@@ -63,6 +71,7 @@ import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
 import org.apache.solr.cloud.overseer.OverseerAction;
 import org.apache.solr.cloud.overseer.SliceMutator;
 import org.apache.solr.common.AlreadyClosedException;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.BeforeReconnect;
@@ -98,6 +107,7 @@ import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.ObjectReleaseTracker;
 import org.apache.solr.common.util.SolrNamedThreadFactory;
 import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.common.util.TimeOut;
 import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.common.util.URLUtil;
 import org.apache.solr.common.util.Utils;
@@ -120,13 +130,16 @@ import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.NoNodeException;
 import org.apache.zookeeper.KeeperException.SessionExpiredException;
 import org.apache.zookeeper.Op;
+import org.apache.zookeeper.OpResult;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
+import org.apache.zookeeper.ZooDefs;
 import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import static org.apache.solr.common.cloud.ZkStateReader.BASE_URL_PROP;
+import static org.apache.solr.common.cloud.ZkStateReader.COLLECTIONS_ZKNODE;
 import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.CORE_NODE_NAME_PROP;
@@ -148,20 +161,26 @@ public class ZkController implements Closeable {
 
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   static final int WAIT_DOWN_STATES_TIMEOUT_SECONDS = 60;
+  public final int WAIT_FOR_STATE = Integer.getInteger("solr.waitForState", 10);
 
   private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
+  private final DefaultConnectionStrategy strat;
+  private final int zkClientConnectTimeout;
+  private final Supplier<List<CoreDescriptor>> descriptorsSupplier;
+  private final ZkACLProvider zkACLProvider;
 
-  private final ZkDistributedQueue overseerJobQueue;
-  private final OverseerTaskQueue overseerCollectionQueue;
-  private final OverseerTaskQueue overseerConfigSetQueue;
+  private volatile ZkDistributedQueue overseerJobQueue;
+  private volatile OverseerTaskQueue overseerCollectionQueue;
+  private volatile OverseerTaskQueue overseerConfigSetQueue;
 
-  private final DistributedMap overseerRunningMap;
-  private final DistributedMap overseerCompletedMap;
-  private final DistributedMap overseerFailureMap;
-  private final DistributedMap asyncIdsMap;
+  private volatile DistributedMap overseerRunningMap;
+  private volatile DistributedMap overseerCompletedMap;
+  private volatile DistributedMap overseerFailureMap;
+  private volatile DistributedMap asyncIdsMap;
 
   public final static String COLLECTION_PARAM_PREFIX = "collection.";
   public final static String CONFIGNAME_PROP = "configName";
+  private volatile String closeStack;
 
   static class ContextKey {
 
@@ -200,44 +219,64 @@ public class ZkController implements Closeable {
     }
   }
 
-  private final Map<ContextKey, ElectionContext> electionContexts = Collections.synchronizedMap(new HashMap<>());
+  private static final byte[] emptyJson = "{}".getBytes(StandardCharsets.UTF_8);
 
-  private final SolrZkClient zkClient;
-  public final ZkStateReader zkStateReader;
-  private SolrCloudManager cloudManager;
-  private CloudSolrClient cloudSolrClient;
+  private final Map<ContextKey, ElectionContext> electionContexts = new ConcurrentHashMap<>(132, 0.75f, 50) {
+    @Override
+    public ElectionContext put(ContextKey key, ElectionContext value) {
+      if (ZkController.this.isClosed || cc.isShutDown()) {
+        throw new AlreadyClosedException();
+      }
+      return super.put(key, value);
+    }
+  };
+
+  private final Map<ContextKey, ElectionContext> overseerContexts = new ConcurrentHashMap<>(132, 0.75f, 50) {
+    @Override
+    public ElectionContext put(ContextKey key, ElectionContext value) {
+      if (ZkController.this.isClosed || cc.isShutDown()) {
+        throw new AlreadyClosedException();
+      }
+      return super.put(key, value);
+    }
+  };
+
+  private volatile SolrZkClient zkClient;
+  public volatile ZkStateReader zkStateReader;
+  private volatile SolrCloudManager cloudManager;
+  private volatile CloudSolrClient cloudSolrClient;
 
   private final String zkServerAddress;          // example: 127.0.0.1:54062/solr
 
   private final int localHostPort;      // example: 54065
   private final String hostName;           // example: 127.0.0.1
   private final String nodeName;           // example: 127.0.0.1:54065_solr
-  private String baseURL;            // example: http://127.0.0.1:54065/solr
+  private volatile String baseURL;            // example: http://127.0.0.1:54065/solr
 
   private final CloudConfig cloudConfig;
-  private final NodesSysPropsCacher sysPropsCacher;
+  private volatile NodesSysPropsCacher sysPropsCacher;
 
-  private LeaderElector overseerElector;
+  private volatile LeaderElector overseerElector;
 
-  private Map<String, ReplicateFromLeader> replicateFromLeaders = new ConcurrentHashMap<>();
-  private final Map<String, ZkCollectionTerms> collectionToTerms = new HashMap<>();
+  private final Map<String, ReplicateFromLeader> replicateFromLeaders = new ConcurrentHashMap<>(132, 0.75f, 50);
+  private final Map<String, ZkCollectionTerms> collectionToTerms = new ConcurrentHashMap<>(132, 0.75f, 50);
 
   // for now, this can be null in tests, in which case recovery will be inactive, and other features
   // may accept defaults or use mocks rather than pulling things from a CoreContainer
-  private CoreContainer cc;
+  private volatile CoreContainer cc;
 
   protected volatile Overseer overseer;
 
   private int leaderVoteWait;
   private int leaderConflictResolveWait;
 
-  private boolean genericCoreNodeNames;
+  private volatile boolean genericCoreNodeNames;
 
-  private int clientTimeout;
+  private volatile int clientTimeout;
 
   private volatile boolean isClosed;
 
-  private final ConcurrentHashMap<String, Throwable> replicasMetTragicEvent = new ConcurrentHashMap<>();
+  private final ConcurrentHashMap<String, Throwable> replicasMetTragicEvent = new ConcurrentHashMap<>(132, 0.75f, 50);
 
   @Deprecated
   // keeps track of replicas that have been asked to recover by leaders running on this node
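
The electionContexts and overseerContexts maps above are ConcurrentHashMap subclasses whose put refuses new entries once shutdown has begun, turning late registrations into an AlreadyClosedException rather than a leaked context. The same guard in isolation (the shared closed flag is an assumption, and an IllegalStateException stands in for Solr's AlreadyClosedException):

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.atomic.AtomicBoolean;

    final class CloseGuardedMap<K, V> extends ConcurrentHashMap<K, V> {
      private final AtomicBoolean closed;

      CloseGuardedMap(AtomicBoolean closed) {
        this.closed = closed;
      }

      @Override
      public V put(K key, V value) {
        if (closed.get()) {
          throw new IllegalStateException("already closed"); // reject late entries
        }
        return super.put(key, value);
      }
    }

Note that putIfAbsent and compute do not route through put, so callers have to stick to put for the guard to hold.
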
@@ -251,7 +290,7 @@ public class ZkController implements Closeable {
 
   // keeps track of a list of objects that need to know a new ZooKeeper session was created after expiration occurred
   // ref is held as a HashSet since we clone the set before notifying to avoid synchronizing too long
-  private HashSet<OnReconnect> reconnectListeners = new HashSet<OnReconnect>();
+  private final Set<OnReconnect> reconnectListeners = ConcurrentHashMap.newKeySet();
 
   private class RegisterCoreAsync implements Callable<Object> {
 
@@ -299,38 +338,56 @@ public class ZkController implements Closeable {
    */
   public ZkController(final CoreContainer cc, String zkServerAddress, int zkClientConnectTimeout, CloudConfig cloudConfig, final Supplier<List<CoreDescriptor>> descriptorsSupplier)
       throws InterruptedException, TimeoutException, IOException {
-
+    if (cc == null) log.error("CoreContainer is null");
     if (cc == null) throw new IllegalArgumentException("CoreContainer cannot be null.");
-    this.cc = cc;
-
-    this.cloudConfig = cloudConfig;
-
-    this.genericCoreNodeNames = cloudConfig.getGenericCoreNodeNames();
-
-    // be forgiving and strip this off leading/trailing slashes
-    // this allows us to support users specifying hostContext="/" in
-    // solr.xml to indicate the root context, instead of hostContext=""
-    // which means the default of "solr"
-    String localHostContext = trimLeadingAndTrailingSlashes(cloudConfig.getSolrHostContext());
-
-    this.zkServerAddress = zkServerAddress;
-    this.localHostPort = cloudConfig.getSolrHostPort();
-    this.hostName = normalizeHostName(cloudConfig.getHost());
-    this.nodeName = generateNodeName(this.hostName, Integer.toString(this.localHostPort), localHostContext);
-    MDCLoggingContext.setNode(nodeName);
-    this.leaderVoteWait = cloudConfig.getLeaderVoteWait();
-    this.leaderConflictResolveWait = cloudConfig.getLeaderConflictResolveWait();
-
-    this.clientTimeout = cloudConfig.getZkClientTimeout();
-    DefaultConnectionStrategy strat = new DefaultConnectionStrategy();
-    String zkACLProviderClass = cloudConfig.getZkACLProviderClass();
-    ZkACLProvider zkACLProvider = null;
-    if (zkACLProviderClass != null && zkACLProviderClass.trim().length() > 0) {
-      zkACLProvider = cc.getResourceLoader().newInstance(zkACLProviderClass, ZkACLProvider.class);
-    } else {
-      zkACLProvider = new DefaultZkACLProvider();
+    try {
+      this.cc = cc;
+      this.descriptorsSupplier = descriptorsSupplier;
+      this.cloudConfig = cloudConfig;
+      this.zkClientConnectTimeout = zkClientConnectTimeout;
+      this.genericCoreNodeNames = cloudConfig.getGenericCoreNodeNames();
+
+      // be forgiving and strip this off leading/trailing slashes
+      // this allows us to support users specifying hostContext="/" in
+      // solr.xml to indicate the root context, instead of hostContext=""
+      // which means the default of "solr"
+      String localHostContext = trimLeadingAndTrailingSlashes(cloudConfig.getSolrHostContext());
+
+      this.zkServerAddress = zkServerAddress;
+      this.localHostPort = cloudConfig.getSolrHostPort();
+      log.info("normalize hostname {}", cloudConfig.getHost());
+      this.hostName = normalizeHostName(cloudConfig.getHost());
+      log.info("generate node name");
+      this.nodeName = generateNodeName(this.hostName, Integer.toString(this.localHostPort), localHostContext);
+      log.info("node name={}", nodeName);
+      MDCLoggingContext.setNode(nodeName);
+
+      log.info("leaderVoteWait get");
+      this.leaderVoteWait = cloudConfig.getLeaderVoteWait();
+      log.info("leaderConflictWait get");
+      this.leaderConflictResolveWait = cloudConfig.getLeaderConflictResolveWait();
+
+      log.info("clientTimeout get");
+      this.clientTimeout = cloudConfig.getZkClientTimeout();
+      log.info("create connection strat");
+      this.strat = new DefaultConnectionStrategy();
+      String zkACLProviderClass = cloudConfig.getZkACLProviderClass();
+
+      if (zkACLProviderClass != null && zkACLProviderClass.trim().length() > 0) {
+        zkACLProvider = cc.getResourceLoader().newInstance(zkACLProviderClass, ZkACLProvider.class);
+      } else {
+        zkACLProvider = new DefaultZkACLProvider();
+      }
+    } catch (Exception e) {
+      log.error("Exception during ZkController init", e);
+      throw e;
     }
 
+    assert ObjectReleaseTracker.track(this);
+  }
+
+  public void start() {
+
     String zkCredentialsProviderClass = cloudConfig.getZkCredentialsProviderClass();
     if (zkCredentialsProviderClass != null && zkCredentialsProviderClass.trim().length() > 0) {
       strat.setZkCredentialsToAddAutomatically(cc.getResourceLoader().newInstance(zkCredentialsProviderClass, ZkCredentialsProvider.class));
@@ -339,119 +396,109 @@ public class ZkController implements Closeable {
     }
     addOnReconnectListener(getConfigDirListener());
 
+
     zkClient = new SolrZkClient(zkServerAddress, clientTimeout, zkClientConnectTimeout, strat,
-        // on reconnect, reload cloud info
-        new OnReconnect() {
+            // on reconnect, reload cloud info
+            new OnReconnect() {
 
-          @Override
-          public void command() throws SessionExpiredException {
-            log.info("ZooKeeper session re-connected ... refreshing core states after session expiration.");
-            clearZkCollectionTerms();
-            try {
-              // recreate our watchers first so that they exist even on any problems below
-              zkStateReader.createClusterStateWatchersAndUpdate();
-
-              // this is troublesome - we dont want to kill anything the old
-              // leader accepted
-              // though I guess sync will likely get those updates back? But
-              // only if
-              // he is involved in the sync, and he certainly may not be
-              // ExecutorUtil.shutdownAndAwaitTermination(cc.getCmdDistribExecutor());
-              // we need to create all of our lost watches
-
-              // seems we dont need to do this again...
-              // Overseer.createClientNodes(zkClient, getNodeName());
-
-              // start the overseer first as following code may need it's processing
-              if (!zkRunOnly) {
-                ElectionContext context = new OverseerElectionContext(zkClient,
-                    overseer, getNodeName());
-
-                ElectionContext prevContext = overseerElector.getContext();
-                if (prevContext != null) {
-                  prevContext.cancelElection();
-                  prevContext.close();
-                }
+              @Override
+              public void command() throws SessionExpiredException {
+                log.info("ZooKeeper session re-connected ... refreshing core states after session expiration.");
+
+                try {
+                  // recreate our watchers first so that they exist even on any problems below
+                  zkStateReader.createClusterStateWatchersAndUpdate();
 
-                overseerElector.setup(context);
+                  // this is troublesome - we dont want to kill anything the old
+                  // leader accepted
+                  // though I guess sync will likely get those updates back? But
+                  // only if
+                  // he is involved in the sync, and he certainly may not be
+                  // ExecutorUtil.shutdownAndAwaitTermination(cc.getCmdDistribExecutor());
+                  // we need to create all of our lost watches
 
-                overseerElector.joinElection(context, true);
-              }
+                  // seems we dont need to do this again...
+                  // Overseer.createClientNodes(zkClient, getNodeName());
 
-              cc.cancelCoreRecoveries();
-              
-              try {
-                registerAllCoresAsDown(descriptorsSupplier, false);
-              } catch (SessionExpiredException e) {
-                // zk has to reconnect and this will all be tried again
-                throw e;
-              } catch (Exception e) {
-                // this is really best effort - in case of races or failure cases where we now need to be the leader, if anything fails,
-                // just continue
-                log.warn("Exception while trying to register all cores as DOWN", e);
-              } 
-
-              // we have to register as live first to pick up docs in the buffer
-              createEphemeralLiveNode();
-
-              List<CoreDescriptor> descriptors = descriptorsSupplier.get();
-              // re register all descriptors
-              ExecutorService executorService = (cc != null) ? cc.getCoreZkRegisterExecutorService() : null;
-              if (descriptors != null) {
-                for (CoreDescriptor descriptor : descriptors) {
-                  // TODO: we need to think carefully about what happens when it
-                  // was
-                  // a leader that was expired - as well as what to do about
-                  // leaders/overseers
-                  // with connection loss
-                  try {
-                    // unload solrcores that have been 'failed over'
-                    throwErrorIfReplicaReplaced(descriptor);
-
-                    if (executorService != null) {
-                      executorService.submit(new RegisterCoreAsync(descriptor, true, true));
-                    } else {
-                      register(descriptor.getName(), descriptor, true, true, false);
+
+
+                  // start the overseer first as following code may need it's processing
+                  if (!zkRunOnly) {
+                    ElectionContext context = new OverseerElectionContext(getNodeName(), zkClient, overseer);
+                    ElectionContext prevContext = overseerContexts.put(new ContextKey("overseer", "overseer"), context);
+                    if (prevContext != null) {
+                      prevContext.close();
+                    }
+                    if (overseerElector != null) {
+                      ParWork.close(overseerElector.getContext());
                     }
-                  } catch (Exception e) {
-                    SolrException.log(log, "Error registering SolrCore", e);
+                    overseerElector = new LeaderElector(zkClient, new ContextKey("overseer", "overseer"), overseerContexts); // assign the field, not a shadowing local
+                    ZkController.this.overseer = new Overseer((HttpShardHandler) cc.getShardHandlerFactory().getShardHandler(), cc.getUpdateShardHandler(),
+                            CommonParams.CORES_HANDLER_PATH, zkStateReader,  ZkController.this, cloudConfig);
+                    overseerElector.setup(context);
+                    overseerElector.joinElection(context, true);
                   }
-                }
-              }
 
-              // notify any other objects that need to know when the session was re-connected
-              HashSet<OnReconnect> clonedListeners;
-              synchronized (reconnectListeners) {
-                clonedListeners = (HashSet<OnReconnect>)reconnectListeners.clone();
-              }
-              // the OnReconnect operation can be expensive per listener, so do that async in the background
-              for (OnReconnect listener : clonedListeners) {
-                try {
-                  if (executorService != null) {
-                    executorService.submit(new OnReconnectNotifyAsync(listener));
-                  } else {
-                    listener.command();
+                  // we have to register as live first to pick up docs in the buffer
+                  createEphemeralLiveNode();
+
+                  List<CoreDescriptor> descriptors = descriptorsSupplier.get();
+                  // re register all descriptors
+                  try (ParWork parWork = new ParWork(this)) {
+                    if (descriptors != null) {
+                      for (CoreDescriptor descriptor : descriptors) {
+                        // TODO: we need to think carefully about what happens when it
+                        // was
+                        // a leader that was expired - as well as what to do about
+                        // leaders/overseers
+                        // with connection loss
+                        try {
+                          // unload solrcores that have been 'failed over'
+                          throwErrorIfReplicaReplaced(descriptor);
+
+                          parWork.collect(new RegisterCoreAsync(descriptor, true, true));
+
+                        } catch (Exception e) {
+                          ParWork.propegateInterrupt(e);
+                          SolrException.log(log, "Error registering SolrCore", e);
+                        }
+                      }
+                    }
+                    parWork.addCollect("registerCores");
+                  }
+
+                  // notify any other objects that need to know when the session was re-connected
+
+                  try (ParWork parWork = new ParWork(this)) {
+                    // the OnReconnect operation can be expensive per listener, so do that async in the background
+                    for (OnReconnect listener : reconnectListeners) {
+                      try {
+                        parWork.collect(new OnReconnectNotifyAsync(listener));
+                      } catch (Exception exc) {
+                        SolrZkClient.checkInterrupted(exc);
+                        // not much we can do here other than warn in the log
+                        log.warn("Error when notifying OnReconnect listener {} after session re-connected.", listener, exc);
+                      }
+                    }
+                    parWork.addCollect("reconnectListeners");
                   }
-                } catch (Exception exc) {
-                  // not much we can do here other than warn in the log
-                  log.warn("Error when notifying OnReconnect listener {} after session re-connected.", listener, exc);
+                } catch (InterruptedException e) {
+                  log.warn("ConnectionManager interrupted", e);
+                  // Restore the interrupted status
+                  Thread.currentThread().interrupt();
+                  close();
+                  throw new ZooKeeperException(
+                          SolrException.ErrorCode.SERVER_ERROR, "", e);
+                } catch (SessionExpiredException e) {
+                  throw e;
+                } catch (Exception e) {
+                  SolrException.log(log, "", e);
+                  throw new ZooKeeperException(
+                          SolrException.ErrorCode.SERVER_ERROR, "", e);
                 }
               }
-            } catch (InterruptedException e) {
-              // Restore the interrupted status
-              Thread.currentThread().interrupt();
-              throw new ZooKeeperException(
-                  SolrException.ErrorCode.SERVER_ERROR, "", e);
-            } catch (SessionExpiredException e) {
-              throw e;
-            } catch (Exception e) {
-              SolrException.log(log, "", e);
-              throw new ZooKeeperException(
-                  SolrException.ErrorCode.SERVER_ERROR, "", e);
-            }
-          }
 
-        }, new BeforeReconnect() {
+            }, new BeforeReconnect() {
 
       @Override
       public void command() {
@@ -460,7 +507,11 @@ public class ZkController implements Closeable {
         } catch (Exception e) {
           log.error("Error trying to stop any Overseer threads", e);
         }
-        closeOutstandingElections(descriptorsSupplier);
+        cc.cancelCoreRecoveries();
+        clearZkCollectionTerms();
+        try (ParWork closer = new ParWork(electionContexts)) {
+          closer.add("election_contexts", electionContexts.values());
+        }
         markAllAsNotLeader(descriptorsSupplier);
       }
     }, zkACLProvider, new ConnectionManager.IsClosed() {
@@ -470,25 +521,18 @@ public class ZkController implements Closeable {
         return cc.isShutDown();
       }});
 
+    init();
 
     this.overseerRunningMap = Overseer.getRunningMap(zkClient);
     this.overseerCompletedMap = Overseer.getCompletedMap(zkClient);
     this.overseerFailureMap = Overseer.getFailureMap(zkClient);
     this.asyncIdsMap = Overseer.getAsyncIdsMap(zkClient);
-
-    zkStateReader = new ZkStateReader(zkClient, () -> {
-      if (cc != null) cc.securityNodeChanged();
-    });
-
-    init();
-
     this.overseerJobQueue = overseer.getStateUpdateQueue();
     this.overseerCollectionQueue = overseer.getCollectionQueue(zkClient);
     this.overseerConfigSetQueue = overseer.getConfigSetQueue(zkClient);
     this.sysPropsCacher = new NodesSysPropsCacher(getSolrCloudManager().getNodeStateProvider(),
-        getNodeName(), zkStateReader);
+            getNodeName(), zkStateReader);
 
-    assert ObjectReleaseTracker.track(this);
   }
 
   public int getLeaderVoteWait() {
@@ -499,88 +543,10 @@ public class ZkController implements Closeable {
     return leaderConflictResolveWait;
   }
 
-  private void registerAllCoresAsDown(
-      final Supplier<List<CoreDescriptor>> registerOnReconnect, boolean updateLastPublished) throws SessionExpiredException {
-    List<CoreDescriptor> descriptors = registerOnReconnect.get();
-    if (isClosed) return;
-    if (descriptors != null) {
-      // before registering as live, make sure everyone is in a
-      // down state
-      publishNodeAsDown(getNodeName());
-      for (CoreDescriptor descriptor : descriptors) {
-        // if it looks like we are going to be the leader, we don't
-        // want to wait for the following stuff
-        CloudDescriptor cloudDesc = descriptor.getCloudDescriptor();
-        String collection = cloudDesc.getCollectionName();
-        String slice = cloudDesc.getShardId();
-        try {
-
-          int children = zkStateReader
-              .getZkClient()
-              .getChildren(
-                  ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection
-                      + "/leader_elect/" + slice + "/election", null, true).size();
-          if (children == 0) {
-            log.debug("looks like we are going to be the leader for collection {} shard {}", collection, slice);
-            continue;
-          }
-
-        } catch (NoNodeException e) {
-          log.debug("looks like we are going to be the leader for collection {} shard {}", collection, slice);
-          continue;
-        } catch (InterruptedException e2) {
-          Thread.currentThread().interrupt();
-        } catch (SessionExpiredException e) {
-          // zk has to reconnect
-          throw e;
-        } catch (KeeperException e) {
-          log.warn("", e);
-          Thread.currentThread().interrupt();
-        }
-
-        final String coreZkNodeName = descriptor.getCloudDescriptor().getCoreNodeName();
-        try {
-          log.debug("calling waitForLeaderToSeeDownState for coreZkNodeName={} collection={} shard={}", new Object[]{coreZkNodeName, collection, slice});
-          waitForLeaderToSeeDownState(descriptor, coreZkNodeName);
-        } catch (Exception e) {
-          log.warn("There was a problem while making a best effort to ensure the leader has seen us as down, this is not unexpected as Zookeeper has just reconnected after a session expiration", e);
-          if (isClosed) {
-            return;
-          }
-        }
-      }
-    }
-  }
-
   public NodesSysPropsCacher getSysPropsCacher() {
     return sysPropsCacher;
   }
 
-  private void closeOutstandingElections(final Supplier<List<CoreDescriptor>> registerOnReconnect) {
-    List<CoreDescriptor> descriptors = registerOnReconnect.get();
-    if (descriptors != null) {
-      for (CoreDescriptor descriptor : descriptors) {
-        closeExistingElectionContext(descriptor);
-      }
-    }
-  }
-
-  private ContextKey closeExistingElectionContext(CoreDescriptor cd) {
-    // look for old context - if we find it, cancel it
-    String collection = cd.getCloudDescriptor().getCollectionName();
-    final String coreNodeName = cd.getCloudDescriptor().getCoreNodeName();
-
-    ContextKey contextKey = new ContextKey(collection, coreNodeName);
-    ElectionContext prevContext = electionContexts.get(contextKey);
-
-    if (prevContext != null) {
-      prevContext.close();
-      electionContexts.remove(contextKey);
-    }
-
-    return contextKey;
-  }
-
   private void markAllAsNotLeader(final Supplier<List<CoreDescriptor>> registerOnReconnect) {
     List<CoreDescriptor> descriptors = registerOnReconnect.get();
     if (descriptors != null) {
@@ -591,36 +557,29 @@ public class ZkController implements Closeable {
     }
   }
 
-  public void preClose() {
-    this.isClosed = true;
-
-    try {
-      this.removeEphemeralLiveNode();
-    } catch (AlreadyClosedException | SessionExpiredException | KeeperException.ConnectionLossException e) {
-
-    } catch (Exception e) {
-      log.warn("Error removing live node. Continuing to close CoreContainer", e);
-    }
-
-    try {
+  public void disconnect() {
+    try (ParWork closer = new ParWork(this, true)) {
       if (getZkClient().getConnectionManager().isConnected()) {
-        log.info("Publish this node as DOWN...");
-        publishNodeAsDown(getNodeName());
-      }
-    } catch (Exception e) {
-      log.warn("Error publishing nodes as down. Continuing to close CoreContainer", e);
-    }
+        closer.add("PublishNodeAsDown&RepFromLeadersClose&RemoveEmphem", replicateFromLeaders.values(), () -> {
 
-    ExecutorService customThreadPool = ExecutorUtil.newMDCAwareCachedThreadPool(new SolrNamedThreadFactory("preCloseThreadPool"));
+          try {
+            log.info("Publish this node as DOWN...");
+            publishNodeAsDown(getNodeName());
+          } catch (Exception e) {
+            ParWork.propegateInterrupt("Error publishing nodes as down. Continuing to close CoreContainer", e);
+          }
+          return "PublishDown";
 
-    try {
-      synchronized (collectionToTerms) {
-        customThreadPool.submit(() -> collectionToTerms.values().parallelStream().forEach(ZkCollectionTerms::close));
-      }
+        }, () -> {
+          try {
+            removeEphemeralLiveNode();
+          } catch (Exception e) {
+            ParWork.propegateInterrupt("Error Removing ephemeral live node. Continuing to close CoreContainer", e);
+          }
+          return "RemoveEphemNode";
 
-      customThreadPool.submit(() -> replicateFromLeaders.values().parallelStream().forEach(ReplicateFromLeader::stopReplication));
-    } finally {
-      ExecutorUtil.shutdownAndAwaitTermination(customThreadPool);
+        });
+      }
     }
   }
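
disconnect() above fans the shutdown steps out through ParWork. Its exact semantics are specific to this branch, but the effect is roughly a bounded parallel close; a sketch of that shape with a plain ExecutorService:

    import java.util.List;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;

    final class ParallelCloseSketch {
      static void closeAll(List<AutoCloseable> resources) throws InterruptedException {
        ExecutorService pool =
            Executors.newFixedThreadPool(Math.max(1, Math.min(4, resources.size())));
        try {
          for (AutoCloseable r : resources) {
            pool.execute(() -> {
              try {
                r.close(); // each close runs independently
              } catch (Exception e) {
                // log and continue; shutdown must not stop on one failure
              }
            });
          }
        } finally {
          pool.shutdown();
          pool.awaitTermination(30, TimeUnit.SECONDS);
        }
      }
    }
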
 
@@ -628,47 +587,28 @@ public class ZkController implements Closeable {
    * Closes the underlying ZooKeeper client.
    */
   public void close() {
-    if (!this.isClosed)
-      preClose();
-
-    ExecutorService customThreadPool = ExecutorUtil.newMDCAwareCachedThreadPool(new SolrNamedThreadFactory("closeThreadPool"));
-
-    customThreadPool.submit(() -> Collections.singleton(overseerElector.getContext()).parallelStream().forEach(IOUtils::closeQuietly));
-
-    customThreadPool.submit(() -> Collections.singleton(overseer).parallelStream().forEach(IOUtils::closeQuietly));
-
-    try {
-      customThreadPool.submit(() -> electionContexts.values().parallelStream().forEach(IOUtils::closeQuietly));
-
+    if (this.isClosed) {
+      throw new AssertionError(closeStack);
+    }
+    this.isClosed = true;
+    StringBuilderWriter sw = new StringBuilderWriter(1000);
+    PrintWriter pw = new PrintWriter(sw);
+    new ObjectReleaseTracker.ObjectTrackerException(this.getClass().getName()).printStackTrace(pw);
+    this.closeStack = sw.toString();
+
+    try (ParWork closer = new ParWork(this, true)) {
+      // nocommit
+      closer.add("Cleanup&Terms", collectionToTerms.values());
+      closer.add("ZkController Internals",
+              electionContexts.values(), cloudManager, sysPropsCacher, cloudSolrClient, zkStateReader, zkClient);
+      ElectionContext context = null;
+      if (overseerElector != null) {
+        context = overseerElector.getContext();
+      }
+      closer.add("ZkController Internals", context, overseerContexts.values() , overseer);
     } finally {
-
-      sysPropsCacher.close();
-      customThreadPool.submit(() -> Collections.singleton(cloudSolrClient).parallelStream().forEach(IOUtils::closeQuietly));
-      customThreadPool.submit(() -> Collections.singleton(cloudManager).parallelStream().forEach(IOUtils::closeQuietly));
-
-      try {
-        try {
-          zkStateReader.close();
-        } catch (Exception e) {
-          log.error("Error closing zkStateReader", e);
-        }
-      } finally {
-        try {
-          zkClient.close();
-        } catch (Exception e) {
-          log.error("Error closing zkClient", e);
-        } finally {
-
-          // just in case the OverseerElectionContext managed to start another Overseer
-          IOUtils.closeQuietly(overseer);
-
-          ExecutorUtil.shutdownAndAwaitTermination(customThreadPool);
-        }
-
-      }
-
+      assert ObjectReleaseTracker.release(this);
     }
-    assert ObjectReleaseTracker.release(this);
   }
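
close() now records the stack trace of the first close and raises an AssertionError on a second call, which surfaces double-close bugs in tests instead of hiding them. The same idea in isolation (names here are illustrative, not the Solr API):

    import java.io.PrintWriter;
    import java.io.StringWriter;
    import java.util.concurrent.atomic.AtomicBoolean;

    final class CloseOnceSketch implements AutoCloseable {
      private final AtomicBoolean closed = new AtomicBoolean();
      private volatile String firstCloseStack;

      @Override
      public void close() {
        if (!closed.compareAndSet(false, true)) {
          throw new AssertionError("closed twice; first close was at:\n" + firstCloseStack);
        }
        StringWriter sw = new StringWriter();
        new Exception("first close").printStackTrace(new PrintWriter(sw, true));
        firstCloseStack = sw.toString();
        // ... release resources ...
      }
    }
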
 
   /**
@@ -752,9 +692,9 @@ public class ZkController implements Closeable {
       if (cloudManager != null) {
         return cloudManager;
       }
-      cloudSolrClient = new CloudSolrClient.Builder(new ZkClientClusterStateProvider(zkStateReader)).withSocketTimeout(30000).withConnectionTimeout(15000)
+      cloudSolrClient = new CloudSolrClient.Builder(new ZkClientClusterStateProvider(zkStateReader))
           .withHttpClient(cc.getUpdateShardHandler().getDefaultHttpClient())
-          .withConnectionTimeout(15000).withSocketTimeout(30000).build();
+          .build();
       cloudManager = new SolrClientCloudManager(
           new ZkDistributedQueueFactory(zkClient),
           cloudSolrClient,
@@ -782,7 +722,7 @@ public class ZkController implements Closeable {
 
   // normalize host removing any url scheme.
   // input can be null, host, or url_prefix://host
-  private String normalizeHostName(String host) {
+  public static String normalizeHostName(String host) {
 
     if (host == null || host.length() == 0) {
       String hostaddress;
@@ -814,11 +754,12 @@ public class ZkController implements Closeable {
       }
       host = hostaddress;
     } else {
+      log.info("remove host scheme");
       if (URLUtil.hasScheme(host)) {
         host = URLUtil.removeScheme(host);
       }
     }
-
+    log.info("return host {}", host);
     return host;
   }
 
@@ -854,23 +795,96 @@ public class ZkController implements Closeable {
    */
   public static void createClusterZkNodes(SolrZkClient zkClient)
       throws KeeperException, InterruptedException, IOException {
-    ZkCmdExecutor cmdExecutor = new ZkCmdExecutor(zkClient.getZkClientTimeout());
-    cmdExecutor.ensureExists(ZkStateReader.LIVE_NODES_ZKNODE, zkClient);
-    cmdExecutor.ensureExists(ZkStateReader.COLLECTIONS_ZKNODE, zkClient);
-    cmdExecutor.ensureExists(ZkStateReader.ALIASES, zkClient);
-    cmdExecutor.ensureExists(ZkStateReader.SOLR_AUTOSCALING_EVENTS_PATH, zkClient);
-    cmdExecutor.ensureExists(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH, zkClient);
-    cmdExecutor.ensureExists(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH, zkClient);
-    cmdExecutor.ensureExists(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH, zkClient);
-    byte[] emptyJson = "{}".getBytes(StandardCharsets.UTF_8);
-    cmdExecutor.ensureExists(ZkStateReader.CLUSTER_STATE, emptyJson, CreateMode.PERSISTENT, zkClient);
-    cmdExecutor.ensureExists(ZkStateReader.SOLR_SECURITY_CONF_PATH, emptyJson, CreateMode.PERSISTENT, zkClient);
-    cmdExecutor.ensureExists(ZkStateReader.SOLR_AUTOSCALING_CONF_PATH, emptyJson, CreateMode.PERSISTENT, zkClient);
-    bootstrapDefaultConfigSet(zkClient);
+    log.info("Creating cluster zk nodes");
+    // we want to have a full zk layout at the start
+    // this is especially important so that we don't miss creating
+    // any watchers with ZkStateReader on startup
+
+    List<Op> operations = new ArrayList<>(30);
+
+    operations.add(zkClient.createPathOp(ZkStateReader.LIVE_NODES_ZKNODE));
+    operations.add(zkClient.createPathOp(ZkStateReader.CONFIGS_ZKNODE));
+    operations.add(zkClient.createPathOp(ZkStateReader.ALIASES, emptyJson));
+
+    operations.add(zkClient.createPathOp("/overseer"));
+    operations.add(zkClient.createPathOp(Overseer.OVERSEER_ELECT));
+    operations.add(zkClient.createPathOp(Overseer.OVERSEER_ELECT + LeaderElector.ELECTION_NODE));
+
+    operations.add(zkClient.createPathOp(Overseer.OVERSEER_QUEUE));
+    operations.add(zkClient.createPathOp(Overseer.OVERSEER_QUEUE_WORK));
+    operations.add(zkClient.createPathOp(Overseer.OVERSEER_COLLECTION_QUEUE_WORK));
+    operations.add(zkClient.createPathOp(Overseer.OVERSEER_COLLECTION_MAP_RUNNING));
+    operations.add(zkClient.createPathOp(Overseer.OVERSEER_COLLECTION_MAP_COMPLETED));
+
+    operations.add(zkClient.createPathOp(Overseer.OVERSEER_COLLECTION_MAP_FAILURE));
+    operations.add(zkClient.createPathOp(Overseer.OVERSEER_ASYNC_IDS));
+
+    operations.add(zkClient.createPathOp("/autoscaling"));
+    operations.add(zkClient.createPathOp(ZkStateReader.SOLR_AUTOSCALING_CONF_PATH, emptyJson));
+    operations.add(zkClient.createPathOp(ZkStateReader.SOLR_AUTOSCALING_EVENTS_PATH));
+    operations.add(zkClient.createPathOp(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH));
+    operations.add(zkClient.createPathOp(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH));
+    operations.add(zkClient.createPathOp(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH));
+    operations.add(zkClient.createPathOp("/autoscaling/events/.scheduled_maintenance"));
+    operations.add(zkClient.createPathOp("/autoscaling/events/.auto_add_replicas"));
+
+    operations.add(zkClient.createPathOp(ZkStateReader.CLUSTER_STATE, emptyJson));
+    //   operations.add(zkClient.createPathOp(ZkStateReader.CLUSTER_PROPS, emptyJson));
+    operations.add(zkClient.createPathOp(ZkStateReader.SOLR_PKGS_PATH, emptyJson));
+    operations.add(zkClient.createPathOp(ZkStateReader.ROLES, emptyJson));
+
+    // we create the collections znode last to indicate successful cluster init
+    // operations.add(zkClient.createPathOp(ZkStateReader.COLLECTIONS_ZKNODE));
+
+    try {
+      log.info("Create new base SolrCloud znodes in ZooKeeper ({})", operations.size());
+      zkClient.multi(operations, true);
+    } catch (KeeperException e) {
+      log.error("Failed creating cluster zk nodes: " + e.getPath(), e);
+
+      List<OpResult> results = e.getResults();
+      Iterator<Op> it = operations.iterator();
+      for (OpResult result : results) {
+        Op op = it.next();
+        if (result.getType() == ZooDefs.OpCode.error) {
+          OpResult.ErrorResult errorResult = (OpResult.ErrorResult) result;
+          log.error("Failed create op: path={} err={}", op.getPath(), errorResult.getErr());
+        }
+      }
+      zkClient.printLayout();
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Failed creating cluster zk nodes", e);
+    }
+
+    try {
+      zkClient.mkDirs(ZkStateReader.SOLR_SECURITY_CONF_PATH, emptyJson);
+    } catch (KeeperException.NodeExistsException e) {
+      // okay, can be prepopulated
+    }
+    try {
+      zkClient.mkDirs(ZkStateReader.CLUSTER_PROPS, emptyJson);
+    } catch (KeeperException.NodeExistsException e) {
+      // okay, can be prepopulated
+    }
+
+    if (!Boolean.getBoolean("solr.suppressDefaultConfigBootstrap")) {
+      bootstrapDefaultConfigSet(zkClient);
+    } else {
+      log.info("Supressing upload of default config set");
+    }
+
+    log.info("Creating final {} node", COLLECTIONS_ZKNODE);
+    Map<String,byte[]> dataMap = new HashMap<>();
+    dataMap.put(COLLECTIONS_ZKNODE, null);
+    zkClient.mkDirs(dataMap);
+
   }
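
The batched layout creation above leans on ZooKeeper's atomic multi() API:
either every znode in the list is created or none are, which is why the
collections znode is created last, as the marker of a finished layout. A
minimal sketch of the pattern in isolation (paths and ACLs are illustrative,
and zooKeeper is assumed to be a connected org.apache.zookeeper.ZooKeeper
handle, not the exact calls above):

    List<Op> ops = new ArrayList<>();
    ops.add(Op.create("/live_nodes", null, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT));
    ops.add(Op.create("/collections", null, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT));
    try {
      zooKeeper.multi(ops); // all-or-nothing: every znode is created, or none are
    } catch (KeeperException e) {
      // e.getResults() lines up 1:1 with ops, so the failing op's path can be reported
    }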
 
   private static void bootstrapDefaultConfigSet(SolrZkClient zkClient) throws KeeperException, InterruptedException, IOException {
-    if (zkClient.exists("/configs/_default", true) == false) {
+    if (!zkClient.exists("/configs/_default", true)) {
       String configDirPath = getDefaultConfigDirPath();
       if (configDirPath == null) {
         log.warn("The _default configset could not be uploaded. Please provide 'solr.default.confdir' parameter that points to a configset {} {}"
@@ -907,25 +921,123 @@ public class ZkController implements Closeable {
   }
 
   private void init() {
+    log.info("do init");
     try {
-      createClusterZkNodes(zkClient);
-      zkStateReader.createClusterStateWatchersAndUpdate();
+      zkClient.mkDirs("/cluster_lock");
+    } catch (KeeperException.NodeExistsException e) {
+      // okay - another node created the lock znode first
+    } catch (KeeperException e) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+    }
+    boolean createdClusterNodes = false;
+    try {
+      DistributedLock lock = new DistributedLock(zkClient, "/cluster_lock", zkClient.getZkACLProvider().getACLsToAdd("/cluster_lock"));
+      log.info("get cluster lock");
+      while (!lock.lock()) {
+        Thread.sleep(250);
+      }
+      try {
+
+        log.info("got cluster lock");
+        CountDownLatch latch = new CountDownLatch(1);
+        zkClient.getSolrZooKeeper().sync(COLLECTIONS_ZKNODE, (rc, path, ctx) -> latch.countDown(), new Object());
+        boolean success = latch.await(10, TimeUnit.SECONDS);
+        if (!success) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, "Timeout calling sync on collection zknode");
+        }
+        zkClient.printLayout();
+        if (!zkClient.exists(COLLECTIONS_ZKNODE, true)) {
+          try {
+            createClusterZkNodes(zkClient);
+          } catch (Exception e) {
+            log.error("Failed creating initial zk layout", e);
+            throw new SolrException(ErrorCode.SERVER_ERROR, e);
+          }
+          createdClusterNodes = true;
+        } else {
+          log.info("Cluster zk nodes already exist");
+          int currentLiveNodes = zkClient.getChildren(ZkStateReader.LIVE_NODES_ZKNODE, null, true).size();
+          log.info("Current live nodes {}", currentLiveNodes);
+//          if (currentLiveNodes == 0) {
+//            log.info("Delete Overseer queues");
+//            // cluster is in a startup state, clear zk queues
+//            List<String> pathsToDelete = Arrays.asList(new String[]{Overseer.OVERSEER_QUEUE, Overseer.OVERSEER_QUEUE_WORK,
+//                    Overseer.OVERSEER_COLLECTION_QUEUE_WORK, Overseer.OVERSEER_COLLECTION_MAP_RUNNING,
+//                    Overseer.OVERSEER_COLLECTION_MAP_COMPLETED, Overseer.OVERSEER_COLLECTION_MAP_FAILURE, Overseer.OVERSEER_ASYNC_IDS});
+//            CountDownLatch latch = new CountDownLatch(pathsToDelete.size());
+//            int[] code = new int[1];
+//            String[] path = new String[1];
+//            boolean[] failed = new boolean[1];
+//
+//            for (String delPath : pathsToDelete) {
+//              zkClient.getSolrZooKeeper().delete(delPath, -1,
+//                      (resultCode, zkpath, context) -> {
+//                        code[0] = resultCode;
+//                        if (resultCode != 0) {
+//                          failed[0] = true;
+//                          path[0] = "" + zkpath;
+//                        }
+//
+//                        latch.countDown();
+//                      }, "");
+//            }
+//            boolean success = false;
+//            log.info("Wait for delete Overseer queues");
+//            try {
+//              success = latch.await(15, TimeUnit.SECONDS);
+//            } catch (InterruptedException e) {
+//              ParWork.propegateInterrupt(e);
+//
+//              throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+//            }
+//
+//            // nocommit, still hacky, handle failures properly
+//            if (code[0] != 0) {
+//              System.out.println("fail code: "+ code[0]);
+//              KeeperException e = KeeperException.create(KeeperException.Code.get(code[0]), path[0]);
+//              if (e instanceof  NoNodeException) {
+//                // okay
+//              } else {
+//                throw e;
+//              }
+//
+//            }
+//
+//            if (!success) {
+//              throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Timeout waiting for operations to complete");
+//            }
+//          }
+        }
+
+      } finally {
+        log.info("release cluster lock");
+        lock.unlock();
+      }
+      if (!createdClusterNodes) {
+        // TODO: wait for the creating node to finish the layout?
+      }
+      zkStateReader = new ZkStateReader(zkClient, () -> {
+        if (cc != null) cc.securityNodeChanged();
+      });
       this.baseURL = zkStateReader.getBaseUrlForNodeName(this.nodeName);
 
-      checkForExistingEphemeralNode();
-      registerLiveNodesListener();
+      log.info("create watchers");
+      zkStateReader.createClusterStateWatchersAndUpdate();
 
       // start the overseer first as following code may need it's processing
       if (!zkRunOnly) {
-        overseerElector = new LeaderElector(zkClient);
+        overseerElector = new LeaderElector(zkClient, new ContextKey("overseer", "overseer"), electionContexts);
         this.overseer = new Overseer((HttpShardHandler) cc.getShardHandlerFactory().getShardHandler(), cc.getUpdateShardHandler(),
             CommonParams.CORES_HANDLER_PATH, zkStateReader, this, cloudConfig);
-        ElectionContext context = new OverseerElectionContext(zkClient,
-            overseer, getNodeName());
+        ElectionContext context = new OverseerElectionContext(getNodeName(), zkClient, overseer);
+        ElectionContext prevContext = electionContexts.put(new ContextKey("overseer", "overser"), context);
+        if (prevContext != null) {
+          prevContext.close();
+        }
         overseerElector.setup(context);
         overseerElector.joinElection(context, false);
       }
-
+      registerLiveNodesListener();
       Stat stat = zkClient.exists(ZkStateReader.LIVE_NODES_ZKNODE, null, true);
       if (stat != null && stat.getNumChildren() > 0) {
         publishAndWaitForDownStates();
@@ -933,6 +1045,10 @@ public class ZkController implements Closeable {
 
       // Do this last to signal we're up.
       createEphemeralLiveNode();
+
+    //  publishAndWaitForDownStates();
     } catch (IOException e) {
       log.error("", e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
@@ -951,85 +1067,58 @@ public class ZkController implements Closeable {
 
   }
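
A condensed sketch of the bootstrap guard used in init() above: one node at a
time takes /cluster_lock, sync()s its view of ZooKeeper, and creates the
layout only if the collections znode is still missing (DistributedLock is the
branch's own helper, used exactly as shown above; error handling trimmed):

    DistributedLock lock = new DistributedLock(zkClient, "/cluster_lock",
        zkClient.getZkACLProvider().getACLsToAdd("/cluster_lock"));
    while (!lock.lock()) {  // lock() returns false while another node holds it
      Thread.sleep(250);
    }
    try {
      if (!zkClient.exists(COLLECTIONS_ZKNODE, true)) {
        createClusterZkNodes(zkClient); // runs at most once across the cluster
      }
    } finally {
      lock.unlock();
    }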
 
-  private void checkForExistingEphemeralNode() throws KeeperException, InterruptedException {
-    if (zkRunOnly) {
-      return;
-    }
-    String nodeName = getNodeName();
-    String nodePath = ZkStateReader.LIVE_NODES_ZKNODE + "/" + nodeName;
-
-    if (!zkClient.exists(nodePath, true)) {
-      return;
-    }
-
-    final CountDownLatch deletedLatch = new CountDownLatch(1);
-    Stat stat = zkClient.exists(nodePath, event -> {
-      if (Watcher.Event.EventType.None.equals(event.getType())) {
-        return;
-      }
-      if (Watcher.Event.EventType.NodeDeleted.equals(event.getType())) {
-        deletedLatch.countDown();
-      }
-    }, true);
-
-    if (stat == null) {
-      // znode suddenly disappeared but that's okay
-      return;
-    }
-
-    boolean deleted = deletedLatch.await(zkClient.getSolrZooKeeper().getSessionTimeout() * 2, TimeUnit.MILLISECONDS);
-    if (!deleted) {
-      throw new SolrException(ErrorCode.SERVER_ERROR, "A previous ephemeral live node still exists. " +
-          "Solr cannot continue. Please ensure that no other Solr process using the same port is running already.");
-    }
-  }
-
   private void registerLiveNodesListener() {
+    log.info("register live nodes listener");
     // this listener is used for generating nodeLost events, so we check only if
     // some nodes went missing compared to last state
-    LiveNodesListener listener = (oldNodes, newNodes) -> {
-      oldNodes.removeAll(newNodes);
-      if (oldNodes.isEmpty()) { // only added nodes
-        return false;
-      }
-      if (isClosed) {
-        return true;
-      }
-      // if this node is in the top three then attempt to create nodeLost message
-      int i = 0;
-      for (String n : newNodes) {
-        if (n.equals(getNodeName())) {
-          break;
-        }
-        if (i > 2) {
-          return false; // this node is not in the top three
-        }
-        i++;
-      }
-
-      // retrieve current trigger config - if there are no nodeLost triggers
-      // then don't create markers
-      boolean createNodes = false;
-      try {
-        createNodes = zkStateReader.getAutoScalingConfig().hasTriggerForEvents(TriggerEventType.NODELOST);
-      } catch (KeeperException | InterruptedException e1) {
-        log.warn("Unable to read autoscaling.json", e1);
-      }
-      if (createNodes) {
-        byte[] json = Utils.toJSON(Collections.singletonMap("timestamp", getSolrCloudManager().getTimeSource().getEpochTimeNs()));
-        for (String n : oldNodes) {
-          String path = ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH + "/" + n;
+    LiveNodesListener listener = new LiveNodesListener() {
+      @Override
+      public boolean onChange(SortedSet<String> oldNodes, SortedSet<String> newNodes) {
+        {
+          oldNodes.removeAll(newNodes);
+          if (oldNodes.isEmpty()) { // only added nodes
+            return false;
+          }
+          if (isClosed) {
+            return true;
+          }
+          // if this node is in the top three then attempt to create nodeLost message
+          int i = 0;
+          for (String n : newNodes) {
+            if (n.equals(getNodeName())) {
+              break;
+            }
+            if (i > 2) {
+              return false; // this node is not in the top three
+            }
+            i++;
+          }
 
+          // retrieve current trigger config - if there are no nodeLost triggers
+          // then don't create markers
+          boolean createNodes = false;
           try {
-            zkClient.create(path, json, CreateMode.PERSISTENT, true);
-          } catch (KeeperException.NodeExistsException e) {
-            // someone else already created this node - ignore
+            createNodes = zkStateReader.getAutoScalingConfig().hasTriggerForEvents(TriggerEventType.NODELOST);
           } catch (KeeperException | InterruptedException e1) {
-            log.warn("Unable to register nodeLost path for {}", n, e1);
+            log.warn("Unable to read autoscaling.json", e1);
           }
+          if (createNodes) {
+            byte[] json = Utils.toJSON(Collections.singletonMap("timestamp", getSolrCloudManager().getTimeSource().getEpochTimeNs()));
+            for (String n : oldNodes) {
+              String path = ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH + "/" + n;
+
+              try {
+                zkClient.create(path, json, CreateMode.PERSISTENT, true);
+              } catch (KeeperException.NodeExistsException e) {
+                // someone else already created this node - ignore
+              } catch (KeeperException | InterruptedException e1) {
+                log.warn("Unable to register nodeLost path for {}", n, e1);
+              }
+            }
+          }
+          return false;
         }
       }
-      return false;
     };
     zkStateReader.registerLiveNodesListener(listener);
   }
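
The listener above derives lost nodes by mutating oldNodes in place with
removeAll(). The same set difference in isolation, copying first so the
caller's snapshot stays intact (a sketch, not the committed code):

    SortedSet<String> lost = new TreeSet<>(oldNodes);
    lost.removeAll(newNodes);               // in the old snapshot, absent from the new one
    boolean onlyAdditions = lost.isEmpty(); // nothing disappeared, no nodeLost markers needed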
@@ -1109,24 +1198,55 @@ public class ZkController implements Closeable {
 
   private void createEphemeralLiveNode() throws KeeperException,
       InterruptedException {
-    if (zkRunOnly) {
-      return;
-    }
+
     String nodeName = getNodeName();
     String nodePath = ZkStateReader.LIVE_NODES_ZKNODE + "/" + nodeName;
     String nodeAddedPath = ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH + "/" + nodeName;
-    log.info("Register node as live in ZooKeeper:{}", nodePath);
-    List<Op> ops = new ArrayList<>(2);
-    ops.add(Op.create(nodePath, null, zkClient.getZkACLProvider().getACLsToAdd(nodePath), CreateMode.EPHEMERAL));
-    // if there are nodeAdded triggers don't create nodeAdded markers
-    boolean createMarkerNode = zkStateReader.getAutoScalingConfig().hasTriggerForEvents(TriggerEventType.NODEADDED);
-    if (createMarkerNode && !zkClient.exists(nodeAddedPath, true)) {
-      // use EPHEMERAL so that it disappears if this node goes down
-      // and no other action is taken
-      byte[] json = Utils.toJSON(Collections.singletonMap("timestamp", TimeSource.NANO_TIME.getEpochTimeNs()));
-      ops.add(Op.create(nodeAddedPath, json, zkClient.getZkACLProvider().getACLsToAdd(nodeAddedPath), CreateMode.EPHEMERAL));
+    log.info("Register node as live in ZooKeeper:" + nodePath);
+
+   // if (zkStateReader.getClusterState().getLiveNodes().size() == 0) {
+   //   DistributedLock lock = new DistributedLock(zkClient.getSolrZooKeeper(), "/cluster_lock", zkClient.getZkACLProvider().getACLsToAdd("/cluster_lock"));
+   //   try {
+        log.info("get lock for creating ephem live node");
+ //       lock.lock();
+        log.info("do create ephem live node");
+        createLiveNodeImpl(nodePath, nodeAddedPath);
+//      } finally {
+//        log.info("unlock");
+//        lock.unlock();
+//      }
+   // } else {
+   //   createLiveNodeImpl(nodePath, nodeAddedPath);
+   // }
+  }
+
+  private void createLiveNodeImpl(String nodePath, String nodeAddedPath) {
+    Map<String,byte[]> dataMap = new HashMap<>(2);
+    Map<String, CreateMode> createModeMap = new HashMap<>(2);
+    dataMap.put(nodePath, null);
+    createModeMap.put(nodePath, CreateMode.EPHEMERAL);
+    try {
+      // if there are nodeAdded triggers don't create nodeAdded markers
+      boolean createMarkerNode = zkStateReader.getAutoScalingConfig().hasTriggerForEvents(TriggerEventType.NODEADDED);
+
+      // TODO, do this optimistically
+//      if (createMarkerNode && !zkClient.exists(nodeAddedPath, true)) {
+//        // use EPHEMERAL so that it disappears if this node goes down
+//        // and no other action is taken
+//        byte[] json = Utils.toJSON(Collections.singletonMap("timestamp", TimeSource.NANO_TIME.getEpochTimeNs()));
+//        dataMap.put(nodeAddedPath, json);
+//        createModeMap.put(nodeAddedPath, CreateMode.EPHEMERAL);
+//      }
+
+   //   zkClient.mkDirs(dataMap, createModeMap);
+      zkClient.getSolrZooKeeper().create(nodePath, null, zkClient.getZkACLProvider().getACLsToAdd(nodePath), CreateMode.EPHEMERAL);
+
+    } catch (Exception e) {
+      ParWork.propegateInterrupt(e);
+      throw new SolrException(ErrorCode.SERVER_ERROR, e);
     }
-    zkClient.multi(ops, true);
   }
 
   public void removeEphemeralLiveNode() throws KeeperException, InterruptedException {
@@ -1136,15 +1256,16 @@ public class ZkController implements Closeable {
     String nodeName = getNodeName();
     String nodePath = ZkStateReader.LIVE_NODES_ZKNODE + "/" + nodeName;
     String nodeAddedPath = ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH + "/" + nodeName;
-    log.info("Remove node as live in ZooKeeper:{}", nodePath);
-    List<Op> ops = new ArrayList<>(2);
-    ops.add(Op.delete(nodePath, -1));
-    ops.add(Op.delete(nodeAddedPath, -1));
 
     try {
-      zkClient.multi(ops, true);
+      zkClient.delete(nodePath, -1, true);
     } catch (NoNodeException e) {
-
+      // okay
+    }
+    try {
+      zkClient.delete(nodeAddedPath, -1, true);
+    } catch (NoNodeException e) {
+      // okay
     }
   }
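
removeEphemeralLiveNode() now tolerates a missing znode instead of failing a
multi() transaction. The idempotent-delete idiom it relies on, sketched on its
own (path is illustrative):

    try {
      zkClient.delete(path, -1, true); // version -1: delete whatever version is there
    } catch (KeeperException.NoNodeException e) {
      // already gone - that is the desired end state, so not an error
    }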
 
@@ -1180,6 +1301,9 @@ public class ZkController implements Closeable {
                          boolean afterExpiration, boolean skipRecovery) throws Exception {
     MDCLoggingContext.setCoreDescriptor(cc, desc);
     try {
+      if (cc.isShutDown()) {
+        throw new AlreadyClosedException();
+      }
       // pre register has published our down state
       final String baseUrl = getBaseUrl();
       final CloudDescriptor cloudDesc = desc.getCloudDescriptor();
@@ -1187,10 +1311,10 @@ public class ZkController implements Closeable {
       final String shardId = cloudDesc.getShardId();
       final String coreZkNodeName = cloudDesc.getCoreNodeName();
       assert coreZkNodeName != null : "we should have a coreNodeName by now";
-
+      log.info("Register SolrCore, baseUrl={} collection={}, shard={} coreNodeName={}", baseUrl, collection, shardId, coreZkNodeName);
       // check replica's existence in clusterstate first
       try {
-        zkStateReader.waitForState(collection, Overseer.isLegacy(zkStateReader) ? 60000 : 100,
+        zkStateReader.waitForState(collection, 10000,
             TimeUnit.MILLISECONDS, (collectionState) -> getReplicaOrNull(collectionState, shardId, coreZkNodeName) != null);
       } catch (TimeoutException e) {
         throw new SolrException(ErrorCode.SERVER_ERROR, "Error registering SolrCore, timeout waiting for replica present in clusterstate");
@@ -1207,7 +1331,7 @@ public class ZkController implements Closeable {
 
       ZkShardTerms shardTerms = getShardTerms(collection, cloudDesc.getShardId());
 
-      log.debug("Register replica - core:{} address:{} collection:{} shard:{}",
+      log.info("Register replica - core:{} address:{} collection:{} shard:{}",
           coreName, baseUrl, collection, shardId);
 
       try {
@@ -1230,10 +1354,16 @@ public class ZkController implements Closeable {
         throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
       }
 
-      // in this case, we want to wait for the leader as long as the leader might
-      // wait for a vote, at least - but also long enough that a large cluster has
-      // time to get its act together
-      String leaderUrl = getLeader(cloudDesc, leaderVoteWait + 600000);
+
+      // don't wait if we have closed
+      if (cc.isShutDown()) {
+        throw new AlreadyClosedException();
+      }
+
+      getZkStateReader().waitForState(collection, 10, TimeUnit.SECONDS, (n,c) -> c != null && c.getLeader(shardId) != null);
+
+      // there should be no stale leader state at this point, don't hit zk directly
+      String leaderUrl = zkStateReader.getLeaderUrl(collection, shardId, 10000);
 
       String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
       log.debug("We are {} and leader is {}", ourUrl, leaderUrl);
@@ -1285,7 +1415,9 @@ public class ZkController implements Closeable {
           if (isTlogReplicaAndNotLeader) {
             startReplicationFromLeader(coreName, true);
           }
-          publish(desc, Replica.State.ACTIVE);
+          if (!isLeader) {
+            publish(desc, Replica.State.ACTIVE, true, false);
+          }
         }
 
         if (replica.getType() != Type.PULL) {
@@ -1294,12 +1426,11 @@ public class ZkController implements Closeable {
         }
         core.getCoreDescriptor().getCloudDescriptor().setHasRegistered(true);
       } catch (Exception e) {
+        SolrZkClient.checkInterrupted(e);
         unregister(coreName, desc, false);
         throw e;
       }
 
-      // make sure we have an update cluster state right away
-      zkStateReader.forceUpdateCollection(collection);
       // the watcher is added to a set so multiple calls of this method will left only one watcher
       zkStateReader.registerDocCollectionWatcher(cloudDesc.getCollectionName(),
           new UnloadCoreOnDeletedWatcher(coreZkNodeName, shardId, desc.getName()));
@@ -1356,47 +1487,36 @@ public class ZkController implements Closeable {
     String leaderUrl;
     try {
       leaderUrl = getLeaderProps(collection, cloudDesc.getShardId(), timeoutms)
-          .getCoreUrl();
-
-      // now wait until our currently cloud state contains the latest leader
-      String clusterStateLeaderUrl = zkStateReader.getLeaderUrl(collection,
-          shardId, timeoutms * 2); // since we found it in zk, we are willing to
-      // wait a while to find it in state
-      int tries = 0;
-      final long msInSec = 1000L;
-      int maxTries = (int) Math.floor(leaderConflictResolveWait / msInSec);
-      while (!leaderUrl.equals(clusterStateLeaderUrl)) {
-        if (cc.isShutDown()) throw new AlreadyClosedException();
-        if (tries > maxTries) {
-          throw new SolrException(ErrorCode.SERVER_ERROR,
-              "There is conflicting information about the leader of shard: "
-                  + cloudDesc.getShardId() + " our state says:"
-                  + clusterStateLeaderUrl + " but zookeeper says:" + leaderUrl);
-        }
-        tries++;
-        if (tries % 30 == 0) {
-          String warnMsg = String.format(Locale.ENGLISH, "Still seeing conflicting information about the leader "
-                  + "of shard %s for collection %s after %d seconds; our state says %s, but ZooKeeper says %s",
-              cloudDesc.getShardId(), collection, tries, clusterStateLeaderUrl, leaderUrl);
-          log.warn(warnMsg);
-        }
-        Thread.sleep(msInSec);
-        clusterStateLeaderUrl = zkStateReader.getLeaderUrl(collection, shardId,
-            timeoutms);
-        leaderUrl = getLeaderProps(collection, cloudDesc.getShardId(), timeoutms)
-            .getCoreUrl();
-      }
+              .getCoreUrl();
+
+      zkStateReader.waitForState(collection, timeoutms * 2, TimeUnit.MILLISECONDS, (n, c) -> checkLeaderUrl(cloudDesc, leaderUrl, collection, shardId, leaderConflictResolveWait));
 
-    } catch (AlreadyClosedException e) {
-      throw e;
     } catch (Exception e) {
-      log.error("Error getting leader from zk", e);
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
-          "Error getting leader from zk for shard " + shardId, e);
+      if (e instanceof InterruptedException) {
+        Thread.currentThread().interrupt();
+      }
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Error getting leader from zk", e);
     }
     return leaderUrl;
   }
 
+  private boolean checkLeaderUrl(CloudDescriptor cloudDesc, String leaderUrl, String collection, String shardId,
+                                 int timeoutms) {
+    // now wait until our currently cloud state contains the latest leader
+    String clusterStateLeaderUrl;
+    try {
+      clusterStateLeaderUrl = zkStateReader.getLeaderUrl(collection, shardId, 10000);
+
+      // leaderUrl = getLeaderProps(collection, cloudDesc.getShardId(), timeoutms).getCoreUrl();
+    } catch (Exception e) {
+      if (e instanceof InterruptedException) {
+        Thread.currentThread().interrupt();
+      }
+      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+    }
+    return clusterStateLeaderUrl != null;
+  }
+
   /**
    * Get leader props directly from zk nodes.
    * @throws SessionExpiredException on zk session expiration.
@@ -1414,38 +1534,32 @@ public class ZkController implements Closeable {
    */
   public ZkCoreNodeProps getLeaderProps(final String collection,
                                         final String slice, int timeoutms, boolean failImmediatelyOnExpiration) throws InterruptedException, SessionExpiredException {
-    int iterCount = timeoutms / 1000;
+    TimeOut timeout = new TimeOut(timeoutms, TimeUnit.MILLISECONDS, TimeSource.NANO_TIME);
     Exception exp = null;
-    while (iterCount-- > 0) {
+    while (!timeout.hasTimedOut()) {
       try {
-        byte[] data = zkClient.getData(
-            ZkStateReader.getShardLeadersPath(collection, slice), null, null,
-            true);
-        ZkCoreNodeProps leaderProps = new ZkCoreNodeProps(
-            ZkNodeProps.load(data));
+        getZkStateReader().waitForState(collection, 10, TimeUnit.SECONDS, (n,c) -> c != null && c.getLeader(slice) != null);
+
+        byte[] data = zkClient.getData(ZkStateReader.getShardLeadersPath(collection, slice), null, null, true);
+        ZkCoreNodeProps leaderProps = new ZkCoreNodeProps(ZkNodeProps.load(data));
         return leaderProps;
-      } catch (InterruptedException e) {
-        throw e;
-      } catch (SessionExpiredException e) {
-        if (failImmediatelyOnExpiration) {
-          throw e;
-        }
-        exp = e;
-        Thread.sleep(1000);
+
       } catch (Exception e) {
-        exp = e;
-        Thread.sleep(1000);
-      }
-      if (cc.isShutDown()) {
-        throw new AlreadyClosedException();
+        SolrZkClient.checkInterrupted(e);
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
       }
     }
-    throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Could not get leader props", exp);
+
+    return null;
   }
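
The rewritten getLeaderProps() swaps the old decrementing-counter loop for a
TimeOut from org.apache.solr.common.util.TimeOut, which bounds retries by wall
clock rather than iteration count. The skeleton of that loop, with the body
reduced to a hypothetical tryOnce():

    TimeOut timeout = new TimeOut(timeoutms, TimeUnit.MILLISECONDS, TimeSource.NANO_TIME);
    while (!timeout.hasTimedOut()) {
      if (tryOnce()) {   // hypothetical single attempt; returns true on success
        return;
      }
    }
    // fell through: the deadline passed, caller decides between null and an exception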
 
 
   private void joinElection(CoreDescriptor cd, boolean afterExpiration, boolean joinAtHead)
       throws InterruptedException, KeeperException, IOException {
+    if (this.isClosed || cc.isShutDown()) {
+      log.warn("cannot join election, closed");
+      return;
+    }
     // look for old context - if we find it, cancel it
     String collection = cd.getCloudDescriptor().getCollectionName();
     final String coreNodeName = cd.getCloudDescriptor().getCoreNodeName();
@@ -1455,7 +1569,7 @@ public class ZkController implements Closeable {
     ElectionContext prevContext = electionContexts.get(contextKey);
 
     if (prevContext != null) {
-      prevContext.cancelElection();
+      prevContext.close();
     }
 
     String shardId = cd.getCloudDescriptor().getShardId();
@@ -1467,15 +1581,24 @@ public class ZkController implements Closeable {
     props.put(ZkStateReader.NODE_NAME_PROP, getNodeName());
     props.put(ZkStateReader.CORE_NODE_NAME_PROP, coreNodeName);
 
-
     ZkNodeProps ourProps = new ZkNodeProps(props);
 
     LeaderElector leaderElector = new LeaderElector(zkClient, contextKey, electionContexts);
     ElectionContext context = new ShardLeaderElectionContext(leaderElector, shardId,
         collection, coreNodeName, ourProps, this, cc);
 
+    if (this.isClosed || cc.isShutDown()) {
+      context.close();
+      return;
+    }
+    System.out.println("add context to map");
+    prevContext = electionContexts.put(contextKey, context);
+    if (prevContext != null) {
+      prevContext.close();
+    }
+
     leaderElector.setup(context);
-    electionContexts.put(contextKey, context);
+
     leaderElector.joinElection(context, false, joinAtHead);
   }
 
@@ -1544,7 +1667,7 @@ public class ZkController implements Closeable {
     try {
       String collection = cd.getCloudDescriptor().getCollectionName();
 
-      log.debug("publishing state={}", state);
+      log.info("publishing state={}", state);
       // System.out.println(Thread.currentThread().getStackTrace()[3]);
       Integer numShards = cd.getCloudDescriptor().getNumShards();
       if (numShards == null) { // XXX sys prop hack
@@ -1568,6 +1691,7 @@ public class ZkController implements Closeable {
       props.put(ZkStateReader.SHARD_ID_PROP, cd.getCloudDescriptor().getShardId());
       props.put(ZkStateReader.COLLECTION_PROP, collection);
       props.put(ZkStateReader.REPLICA_TYPE, cd.getCloudDescriptor().getReplicaType().toString());
+
       if (!Overseer.isLegacy(zkStateReader)) {
         props.put(ZkStateReader.FORCE_SET_STATE_PROP, "false");
       }
@@ -1626,15 +1750,15 @@ public class ZkController implements Closeable {
   private ZkCollectionTerms getCollectionTerms(String collection) {
     synchronized (collectionToTerms) {
       if (!collectionToTerms.containsKey(collection)) collectionToTerms.put(collection, new ZkCollectionTerms(collection, zkClient));
-      return collectionToTerms.get(collection);
     }
+    return collectionToTerms.get(collection);
   }
 
   public void clearZkCollectionTerms() {
-    synchronized (collectionToTerms) {
-      collectionToTerms.values().forEach(ZkCollectionTerms::close);
+      try (ParWork closer = new ParWork(this)) {
+        closer.add("zkCollectionTerms", collectionToTerms.values());
+      }
       collectionToTerms.clear();
-    }
   }
 
   public void unregister(String coreName, CoreDescriptor cd) throws Exception {
@@ -1644,7 +1768,11 @@ public class ZkController implements Closeable {
   public void unregister(String coreName, CoreDescriptor cd, boolean removeCoreFromZk) throws Exception {
     final String coreNodeName = cd.getCloudDescriptor().getCoreNodeName();
     final String collection = cd.getCloudDescriptor().getCollectionName();
-    getCollectionTerms(collection).remove(cd.getCloudDescriptor().getShardId(), cd);
+    ZkCollectionTerms ct = collectionToTerms.get(collection);
+    if (ct != null) {
+      ct.close();
+      ct.remove(cd.getCloudDescriptor().getShardId(), cd);
+    }
     replicasMetTragicEvent.remove(collection+":"+coreNodeName);
 
     if (Strings.isNullOrEmpty(collection)) {
@@ -1659,7 +1787,7 @@ public class ZkController implements Closeable {
       ElectionContext context = electionContexts.remove(new ContextKey(collection, coreNodeName));
 
       if (context != null) {
-        context.cancelElection();
+        context.close();
       }
     }
     CloudDescriptor cloudDescriptor = cd.getCloudDescriptor();
@@ -1697,63 +1825,83 @@ public class ZkController implements Closeable {
     }
   }
 
-  private void waitForCoreNodeName(CoreDescriptor descriptor) {
-    int retryCount = 320;
-    log.debug("look for our core node name");
-    while (retryCount-- > 0) {
-      final DocCollection docCollection = zkStateReader.getClusterState()
-          .getCollectionOrNull(descriptor.getCloudDescriptor().getCollectionName());
-      if (docCollection != null && docCollection.getSlicesMap() != null) {
-        final Map<String, Slice> slicesMap = docCollection.getSlicesMap();
+  private void waitForCoreNodeName(CoreDescriptor cd) {
+    if (log.isDebugEnabled()) log.debug("look for our core node name");
+
+    AtomicReference<String> errorMessage = new AtomicReference<>();
+    try {
+      zkStateReader.waitForState(cd.getCollectionName(), 120, TimeUnit.SECONDS, (n, c) -> { // TODO: drop timeout for tests
+        if (c == null)
+          return false;
+        final Map<String,Slice> slicesMap = c.getSlicesMap();
+        if (slicesMap == null) {
+          return false;
+        }
         for (Slice slice : slicesMap.values()) {
           for (Replica replica : slice.getReplicas()) {
-            // TODO: for really large clusters, we could 'index' on this
 
             String nodeName = replica.getStr(ZkStateReader.NODE_NAME_PROP);
             String core = replica.getStr(ZkStateReader.CORE_NAME_PROP);
 
             String msgNodeName = getNodeName();
-            String msgCore = descriptor.getName();
+            String msgCore = cd.getName();
 
             if (msgNodeName.equals(nodeName) && core.equals(msgCore)) {
-              descriptor.getCloudDescriptor()
-                  .setCoreNodeName(replica.getName());
-              getCoreContainer().getCoresLocator().persist(getCoreContainer(), descriptor);
-              return;
+              cd.getCloudDescriptor()
+                      .setCoreNodeName(replica.getName());
+              return true;
             }
           }
         }
-      }
-      try {
-        Thread.sleep(1000);
-      } catch (InterruptedException e) {
-        Thread.currentThread().interrupt();
-      }
+        return false;
+      });
+    } catch (TimeoutException | InterruptedException e) {
+      if (e instanceof InterruptedException) {
+        Thread.currentThread().interrupt();
+      }
+      throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, "Could not find coreNodeName for core: " + cd.getName());
     }
   }
 
   private void waitForShardId(CoreDescriptor cd) {
     if (log.isDebugEnabled()) {
-      log.debug("waiting to find shard id in clusterstate for {}", cd.getName());
+      log.debug("waitForShardId(CoreDescriptor cd={}) - start", cd);
     }
-    int retryCount = 320;
-    while (retryCount-- > 0) {
-      final String shardId = zkStateReader.getClusterState().getShardId(cd.getCollectionName(), getNodeName(), cd.getName());
-      if (shardId != null) {
-        cd.getCloudDescriptor().setShardId(shardId);
-        return;
-      }
+
+    AtomicReference<String> returnId = new AtomicReference<>();
+    try {
       try {
-        Thread.sleep(1000);
+        zkStateReader.waitForState(cd.getCollectionName(), 5, TimeUnit.SECONDS, (n, c) -> { // nocommit
+          if (c == null) return false;
+          String shardId = c.getShardId(cd.getCloudDescriptor().getCoreNodeName());
+          if (shardId != null) {
+            returnId.set(shardId);
+            return true;
+          }
+          return false;
+        });
       } catch (InterruptedException e) {
         Thread.currentThread().interrupt();
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Could not get shard id for core: " + cd.getName());
       }
+    } catch (TimeoutException e1) {
+      log.error("waitForShardId(CoreDescriptor=" + cd + ")", e1);
+
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Could not get shard id for core: " + cd.getName());
     }
 
-    throw new SolrException(ErrorCode.SERVER_ERROR,
-        "Could not get shard id for core: " + cd.getName());
-  }
+    final String shardId = returnId.get();
+    if (shardId != null) {
+      cd.getCloudDescriptor().setShardId(shardId);
+
+      if (log.isDebugEnabled()) {
+        log.debug("waitForShardId(CoreDescriptor) - end coreNodeName=" + cd.getCloudDescriptor().getCoreNodeName() + " shardId=" + shardId);
+      }
+      return;
+    }
 
+    throw new SolrException(ErrorCode.SERVER_ERROR, "Could not get shard id for core: " + cd.getName());
+  }
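
waitForShardId() shows the pattern this diff uses to carry a value out of a
waitForState predicate: lambdas can only write to effectively-final holders,
hence the AtomicReference. A sketch of just that mechanism (collection and
coreNodeName assumed in scope):

    AtomicReference<String> found = new AtomicReference<>();
    zkStateReader.waitForState(collection, 5, TimeUnit.SECONDS, (liveNodes, coll) -> {
      String id = (coll == null) ? null : coll.getShardId(coreNodeName);
      if (id == null) return false; // keep waiting
      found.set(id);                // smuggle the result out of the predicate
      return true;
    });
    String shardId = found.get();   // set whenever the wait returned normally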
 
   public String getCoreNodeName(CoreDescriptor descriptor) {
     String coreNodeName = descriptor.getCloudDescriptor().getCoreNodeName();
@@ -1766,7 +1914,7 @@ public class ZkController implements Closeable {
   }
 
   public void preRegister(CoreDescriptor cd, boolean publishState) {
-
+    log.info("PreRegister SolrCore, collection={}, shard={} coreNodeName={}", cd.getCloudDescriptor().getCollectionName(), cd.getCloudDescriptor().getShardId());
     String coreNodeName = getCoreNodeName(cd);
 
     // before becoming available, make sure we are not live and active
@@ -1780,9 +1928,9 @@ public class ZkController implements Closeable {
       if (cloudDesc.getCoreNodeName() == null) {
         cloudDesc.setCoreNodeName(coreNodeName);
       }
-
+      log.info("PreRegister found coreNodename of {}", coreNodeName);
       // publishState == false on startup
-      if (publishState || isPublishAsDownOnStartup(cloudDesc)) {
+      if (isPublishAsDownOnStartup(cloudDesc)) {
         publish(cd, Replica.State.DOWN, false, true);
       }
       String collectionName = cd.getCloudDescriptor().getCollectionName();
@@ -1835,9 +1983,6 @@ public class ZkController implements Closeable {
       CloudDescriptor cloudDesc = cd.getCloudDescriptor();
       String nodeName = cloudDesc.getCoreNodeName();
       if (nodeName == null) {
-        if (cc.repairCoreProperty(cd, CoreDescriptor.CORE_NODE_NAME) == false) {
-          throw new SolrException(ErrorCode.SERVER_ERROR, "No coreNodeName for " + cd);
-        }
         nodeName = cloudDesc.getCoreNodeName();
         // verify that the repair worked.
         if (nodeName == null) {
@@ -1853,7 +1998,7 @@ public class ZkController implements Closeable {
       AtomicReference<String> errorMessage = new AtomicReference<>();
       AtomicReference<DocCollection> collectionState = new AtomicReference<>();
       try {
-        zkStateReader.waitForState(cd.getCollectionName(), 10, TimeUnit.SECONDS, (c) -> {
+        zkStateReader.waitForState(cd.getCollectionName(), WAIT_FOR_STATE, TimeUnit.SECONDS, (c) -> {
           collectionState.set(c);
           if (c == null)
             return false;
@@ -1864,8 +2009,10 @@ public class ZkController implements Closeable {
           }
           Replica replica = slice.getReplica(coreNodeName);
           if (replica == null) {
+            StringBuilder sb = new StringBuilder();
+            slice.getReplicas().forEach(replica1 -> sb.append(replica1.getName()).append(' '));
             errorMessage.set("coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
-                ", ignore the exception if the replica was deleted");
+                ", ignore the exception if the replica was deleted. Found: " + sb.toString());
             return false;
           }
           return true;
@@ -1874,8 +2021,9 @@ public class ZkController implements Closeable {
         String error = errorMessage.get();
         if (error == null)
           error = "coreNodeName " + coreNodeName + " does not exist in shard " + cloudDesc.getShardId() +
-              ", ignore the exception if the replica was deleted";
-        throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
+              ", ignore the exception if the replica was deleted" ;
+
+        throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error + "\n" + getZkStateReader().getClusterState().getCollection(cd.getCollectionName()));
       }
     }
   }
@@ -1937,6 +2085,8 @@ public class ZkController implements Closeable {
         try (HttpSolrClient client = new Builder(leaderBaseUrl)
             .withConnectionTimeout(8000) // short timeouts, we may be in a storm and this is best effort and maybe we should be the leader now
             .withSocketTimeout(30000)
+            .withHttpClient(cc.getUpdateShardHandler().getDefaultHttpClient())
+            .markInternalRequest()
             .build()) {
           WaitForState prepCmd = new WaitForState();
           prepCmd.setCoreName(leaderCoreName);
@@ -2037,7 +2187,7 @@ public class ZkController implements Closeable {
   /**
    * If in SolrCloud mode, upload config sets for each SolrCore in solr.xml.
    */
-  public static void bootstrapConf(SolrZkClient zkClient, CoreContainer cc) throws IOException {
+  public static void bootstrapConf(SolrZkClient zkClient, CoreContainer cc) throws IOException, KeeperException {
 
     ZkConfigManager configManager = new ZkConfigManager(zkClient);
 
@@ -2130,10 +2280,6 @@ public class ZkController implements Closeable {
     return overseer;
   }
 
-  public LeaderElector getOverseerElector() {
-    return overseerElector;
-  }
-
   /**
    * Returns the nodeName that should be used based on the specified properties.
    *
@@ -2143,7 +2289,7 @@ public class ZkController implements Closeable {
    * @lucene.experimental
    * @see ZkStateReader#getBaseUrlForNodeName
    */
-  static String generateNodeName(final String hostName,
+  public static String generateNodeName(final String hostName,
                                  final String hostPort,
                                  final String hostContext) {
     try {
@@ -2192,8 +2338,8 @@ public class ZkController implements Closeable {
             }
           }
         } else { // We're in the right place, now attempt to rejoin
-          overseerElector.retryElection(new OverseerElectionContext(zkClient,
-              overseer, getNodeName()), joinAtHead);
+          overseerElector.retryElection(new OverseerElectionContext(getNodeName(), zkClient,
+              overseer), joinAtHead);
           return;
         }
       } else {
@@ -2222,18 +2368,18 @@ public class ZkController implements Closeable {
       ContextKey contextKey = new ContextKey(collectionName, coreNodeName);
 
       ElectionContext prevContext = electionContexts.get(contextKey);
-      if (prevContext != null) prevContext.cancelElection();
+      if (prevContext != null) prevContext.close();
 
       ZkNodeProps zkProps = new ZkNodeProps(BASE_URL_PROP, baseUrl, CORE_NAME_PROP, coreName, NODE_NAME_PROP, getNodeName(), CORE_NODE_NAME_PROP, coreNodeName);
 
-      LeaderElector elect = ((ShardLeaderElectionContextBase) prevContext).getLeaderElector();
+      LeaderElector elect = ((ShardLeaderElectionContext) prevContext).getLeaderElector();
       ShardLeaderElectionContext context = new ShardLeaderElectionContext(elect, shardId, collectionName,
           coreNodeName, zkProps, this, getCoreContainer());
 
       context.leaderSeqPath = context.electionPath + LeaderElector.ELECTION_NODE + "/" + electionNode;
       elect.setup(context);
-      electionContexts.put(contextKey, context);
-
+      prevContext = electionContexts.put(contextKey, context);
+      if (prevContext != null) prevContext.close();
       elect.retryElection(context, params.getBool(REJOIN_AT_HEAD_PROP, false));
     } catch (Exception e) {
       throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to rejoin election", e);
@@ -2285,10 +2431,9 @@ public class ZkController implements Closeable {
    */
   public void addOnReconnectListener(OnReconnect listener) {
     if (listener != null) {
-      synchronized (reconnectListeners) {
-        reconnectListeners.add(listener);
-        log.debug("Added new OnReconnect listener {}", listener);
-      }
+      reconnectListeners.add(listener);
+      log.debug("Added new OnReconnect listener {}", listener);
     }
   }
 
@@ -2312,10 +2457,7 @@ public class ZkController implements Closeable {
   }
 
   Set<OnReconnect> getCurrentOnReconnectListeners() {
-    HashSet<OnReconnect> clonedListeners;
-    synchronized (reconnectListeners) {
-      clonedListeners = (HashSet<OnReconnect>)reconnectListeners.clone();
-    }
+    Set<OnReconnect> clonedListeners = new HashSet<>(reconnectListeners);
     return clonedListeners;
   }
 
@@ -2515,21 +2657,19 @@ public class ZkController implements Closeable {
         log.debug("Watcher on {} is removed ", zkDir);
         return false;
       }
-      final Set<Runnable> listeners = confDirectoryListeners.get(zkDir);
-      if (listeners != null && !listeners.isEmpty()) {
-        final Set<Runnable> listenersCopy = new HashSet<>(listeners);
-        // run these in a separate thread because this can be long running
-        new Thread(() -> {
-          log.debug("Running listeners for {}", zkDir);
-          for (final Runnable listener : listenersCopy) {
-            try {
-              listener.run();
-            } catch (Exception e) {
-              log.warn("listener throws error", e);
-            }
-          }
-        }).start();
+    }
+    final Set<Runnable> listeners = confDirectoryListeners.get(zkDir);
+    if (listeners != null) {
 
+      // run these in a separate thread because this can be long running
+
+      try (ParWork worker = new ParWork(this, true)) {
+        worker.add("", () -> {
+          listeners.forEach((it) -> worker.collect(() -> {
+            it.run();
+            return it;
+          }));
+        });
       }
     }
     return true;
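
The conf-directory listeners are now fanned out through ParWork instead of a
hand-rolled Thread. As far as this diff shows, ParWork is a try-with-resources
scope whose queued work is executed in parallel and awaited when the scope
closes; a condensed sketch of the usage above (label string illustrative):

    try (ParWork worker = new ParWork(this)) {
      worker.add("confDirectoryListeners", () -> {
        listeners.forEach(l -> worker.collect(() -> { l.run(); return l; }));
      });
    } // close() runs and waits for the collected work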
@@ -2577,6 +2717,10 @@ public class ZkController implements Closeable {
     @Override
     // synchronized due to SOLR-11535
     public synchronized boolean onStateChanged(DocCollection collectionState) {
+      if (isClosed) { // don't accidentally delete cores on shutdown due to unreliable state
+        return true;
+      }
+
       if (getCoreContainer().getCoreDescriptor(coreName) == null) return true;
 
       boolean replicaRemoved = getReplicaOrNull(collectionState, shard, coreNodeName) == null;
@@ -2644,8 +2788,14 @@ public class ZkController implements Closeable {
    *
    * @param nodeName to operate on
    */
-  public void publishNodeAsDown(String nodeName) {
+  public void publishNodeAsDown(String nodeName) throws KeeperException {
     log.info("Publish node={} as DOWN", nodeName);
+
+    if (overseer == null) {
+      log.warn("Could not publish node as down, no overseer was started yet");
+      return;
+    }
+
     ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.DOWNNODE.toLower(),
         ZkStateReader.NODE_NAME_PROP, nodeName);
     try {
@@ -2655,8 +2805,6 @@ public class ZkController implements Closeable {
     } catch (InterruptedException e) {
       Thread.currentThread().interrupt();
       log.debug("Publish node as down was interrupted.");
-    } catch (KeeperException e) {
-      log.warn("Could not publish node as down: {}", e.getMessage());
     }
   }
 
@@ -2664,6 +2812,9 @@ public class ZkController implements Closeable {
    * Ensures that a searcher is registered for the given core and if not, waits until one is registered
    */
   private static void ensureRegisteredSearcher(SolrCore core) throws InterruptedException {
+    if (core.isClosed() || core.getCoreContainer().isShutDown()) {
+      return;
+    }
     if (!core.getSolrConfig().useColdSearcher) {
       RefCounted<SolrIndexSearcher> registeredSearcher = core.getRegisteredSearcher();
       if (registeredSearcher != null) {
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkDistributedQueue.java b/solr/core/src/java/org/apache/solr/cloud/ZkDistributedQueue.java
index 53d799b..b646a52 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkDistributedQueue.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkDistributedQueue.java
@@ -129,15 +129,15 @@ public class ZkDistributedQueue implements DistributedQueue {
   public ZkDistributedQueue(SolrZkClient zookeeper, String dir, Stats stats, int maxQueueSize, IsClosed higherLevelIsClosed) {
     this.dir = dir;
 
-    ZkCmdExecutor cmdExecutor = new ZkCmdExecutor(zookeeper.getZkClientTimeout(), higherLevelIsClosed);
-    try {
-      cmdExecutor.ensureExists(dir, zookeeper);
-    } catch (KeeperException e) {
-      throw new SolrException(ErrorCode.SERVER_ERROR, e);
-    } catch (InterruptedException e) {
-      Thread.currentThread().interrupt();
-      throw new SolrException(ErrorCode.SERVER_ERROR, e);
-    }
+//    ZkCmdExecutor cmdExecutor = new ZkCmdExecutor(zookeeper.getZkClientTimeout(), higherLevelIsClosed);
+//    try {
+//      cmdExecutor.ensureExists(dir, zookeeper);
+//    } catch (KeeperException e) {
+//      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+//    } catch (InterruptedException e) {
+//      Thread.currentThread().interrupt();
+//      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+//    }
 
     this.zookeeper = zookeeper;
     this.stats = stats;
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java b/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java
index bd446c4..be49409 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkShardTerms.java
@@ -111,6 +111,7 @@ public class ZkShardTerms implements AutoCloseable{
    * @param replicasNeedingRecovery set of replicas in which their terms should be lower than leader's term
    */
   public void ensureTermsIsHigher(String leader, Set<String> replicasNeedingRecovery) {
+    log.info("leader={} replicasNeedingRecvoery={}", leader, replicasNeedingRecovery);
     if (replicasNeedingRecovery.isEmpty()) return;
 
     ShardTerms newTerms;
@@ -304,6 +305,7 @@ public class ZkShardTerms implements AutoCloseable{
    * @throws KeeperException.NoNodeException correspond ZK term node is not created
    */
   private boolean saveTerms(ShardTerms newTerms) throws KeeperException.NoNodeException {
+    log.info("Save terms={}", newTerms);
     byte[] znodeData = Utils.toJSON(newTerms);
     try {
       Stat stat = zkClient.setData(znodePath, znodeData, newTerms.getVersion(), true);
@@ -316,6 +318,9 @@ public class ZkShardTerms implements AutoCloseable{
     } catch (KeeperException.NoNodeException e) {
       throw e;
     } catch (Exception e) {
+      if (e instanceof InterruptedException) {
+        Thread.currentThread().interrupt();
+      }
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error while saving shard term for collection: " + collection, e);
     }
     return false;
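
saveTerms() is an optimistic-concurrency write: setData() passes the expected
znode version, so a concurrent writer makes the call fail and the method
returns false, telling the caller to refresh its terms and retry. The core of
that idiom (expectedVersion illustrative):

    try {
      zkClient.setData(znodePath, znodeData, expectedVersion, true);
      return true;
    } catch (KeeperException.BadVersionException e) {
      return false; // lost the race: re-read the terms znode and try again
    }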
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java b/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java
index 4d9d910..a3dbc31 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkSolrResourceLoader.java
@@ -111,7 +111,7 @@ public class ZkSolrResourceLoader extends SolrResourceLoader {
 
     try {
       // delegate to the class loader (looking into $INSTANCE_DIR/lib jars)
-      is = classLoader.getResourceAsStream(resource.replace(File.separatorChar, '/'));
+      is = resourceClassLoader.getResourceAsStream(resource.replace(File.separatorChar, '/'));
     } catch (Exception e) {
       throw new IOException("Error opening " + resource, e);
     }
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/AddReplicaCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/AddReplicaCmd.java
index 02d9fd7..40d461f 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/AddReplicaCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/AddReplicaCmd.java
@@ -17,8 +17,6 @@
 
 package org.apache.solr.cloud.api.collections;
 
-
-import static org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.CREATE_NODE_SET;
 import static org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.SKIP_CREATE_REPLICA_IN_CLUSTER_STATE;
 import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
 import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
@@ -53,6 +51,7 @@ import org.apache.solr.client.solrj.cloud.autoscaling.PolicyHelper;
 import org.apache.solr.cloud.ActiveReplicaWatcher;
 import org.apache.solr.cloud.Overseer;
 import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.ShardRequestTracker;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrCloseableLatch;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterState;
@@ -60,6 +59,7 @@ import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.ReplicaPosition;
 import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.ZkCoreNodeProps;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CommonAdminParams;
@@ -125,7 +125,7 @@ public class AddReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
     final String asyncId = message.getStr(ASYNC);
 
     String node = message.getStr(CoreAdminParams.NODE);
-    String createNodeSetStr = message.getStr(CREATE_NODE_SET);
+    String createNodeSetStr = message.getStr(ZkStateReader.CREATE_NODE_SET);
 
     if (node != null && createNodeSetStr != null) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Both 'node' and 'createNodeSet' parameters cannot be specified together.");
@@ -179,7 +179,7 @@ public class AddReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
     Runnable runnable = () -> {
       shardRequestTracker.processResponses(results, shardHandler, true, "ADDREPLICA failed to create replica");
       for (CreateReplica replica : createReplicas) {
-        ocmh.waitForCoreNodeName(collectionName, replica.node, replica.coreName);
+        ocmh.waitForCoreNodeName(zkStateReader, collectionName, replica.node, replica.coreName);
       }
       if (onComplete != null) onComplete.run();
     };
@@ -202,7 +202,9 @@ public class AddReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
         runnable.run();
       }
     } else {
-      ocmh.tpe.submit(runnable);
+      try (ParWork worker = new ParWork(this)) {
+        worker.add("AddReplica", runnable);
+      }
     }
 
     return createReplicas.stream()
@@ -240,30 +242,30 @@ public class AddReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
     }
 
     ModifiableSolrParams params = new ModifiableSolrParams();
-
     ZkStateReader zkStateReader = ocmh.zkStateReader;
     if (!Overseer.isLegacy(zkStateReader)) {
+      ZkNodeProps props = new ZkNodeProps(
+              Overseer.QUEUE_OPERATION, ADDREPLICA.toLower(),
+              ZkStateReader.COLLECTION_PROP, collectionName,
+              ZkStateReader.SHARD_ID_PROP, createReplica.sliceName,
+              ZkStateReader.CORE_NAME_PROP, createReplica.coreName,
+              ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(),
+              ZkStateReader.BASE_URL_PROP, zkStateReader.getBaseUrlForNodeName(createReplica.node),
+              ZkStateReader.NODE_NAME_PROP, createReplica.node,
+              ZkStateReader.REPLICA_TYPE, createReplica.replicaType.name());
+      if (createReplica.coreNodeName != null) {
+        props = props.plus(ZkStateReader.CORE_NODE_NAME_PROP, createReplica.coreNodeName);
+      }
       if (!skipCreateReplicaInClusterState) {
-        ZkNodeProps props = new ZkNodeProps(
-            Overseer.QUEUE_OPERATION, ADDREPLICA.toLower(),
-            ZkStateReader.COLLECTION_PROP, collectionName,
-            ZkStateReader.SHARD_ID_PROP, createReplica.sliceName,
-            ZkStateReader.CORE_NAME_PROP, createReplica.coreName,
-            ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(),
-            ZkStateReader.BASE_URL_PROP, zkStateReader.getBaseUrlForNodeName(createReplica.node),
-            ZkStateReader.NODE_NAME_PROP, createReplica.node,
-            ZkStateReader.REPLICA_TYPE, createReplica.replicaType.name());
-        if (createReplica.coreNodeName != null) {
-          props = props.plus(ZkStateReader.CORE_NODE_NAME_PROP, createReplica.coreNodeName);
-        }
         try {
           ocmh.overseer.offerStateUpdate(Utils.toJSON(props));
         } catch (Exception e) {
           throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Exception updating Overseer state queue", e);
         }
       }
+      String coreUrl = ZkCoreNodeProps.getCoreUrl(props.getStr(ZkStateReader.BASE_URL_PROP), createReplica.coreName);
       params.set(CoreAdminParams.CORE_NODE_NAME,
-          ocmh.waitToSeeReplicasInState(collectionName, Collections.singletonList(createReplica.coreName)).get(createReplica.coreName).getName());
+          ocmh.waitToSeeReplicasInState(collectionName, Collections.singletonList(coreUrl), false).get(coreUrl).getName());
     }
 
     String configName = zkStateReader.readConfigName(collectionName);
@@ -359,10 +361,10 @@ public class AddReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
     int totalReplicas = numNrtReplicas + numPullReplicas + numTlogReplicas;
 
     String node = message.getStr(CoreAdminParams.NODE);
-    Object createNodeSetStr = message.get(OverseerCollectionMessageHandler.CREATE_NODE_SET);
+    Object createNodeSetStr = message.get(ZkStateReader.CREATE_NODE_SET);
     if (createNodeSetStr == null) {
       if (node != null) {
-        message.getProperties().put(OverseerCollectionMessageHandler.CREATE_NODE_SET, node);
+        message.getProperties().put(ZkStateReader.CREATE_NODE_SET, node);
         createNodeSetStr = node;
       }
     }
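
A note on the ParWork idiom adopted above: the try-with-resources block guarantees the submitted work is awaited before the command returns, unlike the old bare ocmh.tpe.submit(runnable). Below is a minimal, JDK-only sketch of that shape; ParWorkSketch is a hypothetical stand-in, and the real org.apache.solr.common.ParWork API and semantics may differ.

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.ExecutionException;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;

    final class ParWorkSketch implements AutoCloseable {
      private final ExecutorService exec = Executors.newCachedThreadPool();
      private final List<Future<?>> futures = new ArrayList<>();

      // Register a named unit of work; the label is purely diagnostic.
      void add(String label, Runnable work) {
        futures.add(exec.submit(work));
      }

      // close() blocks until every registered task has finished, so a
      // try-with-resources caller cannot leak in-flight work.
      @Override public void close() {
        try {
          for (Future<?> f : futures) f.get();
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        } catch (ExecutionException e) {
          throw new RuntimeException(e.getCause());
        } finally {
          exec.shutdown();
        }
      }
    }

    // Usage, mirroring the patch:
    // try (ParWorkSketch worker = new ParWorkSketch()) {
    //   worker.add("AddReplica", runnable);
    // } // returns only after the runnable completes
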
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/AliasCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/AliasCmd.java
index 611bd2d..2527c15 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/AliasCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/AliasCmd.java
@@ -19,12 +19,14 @@ package org.apache.solr.cloud.api.collections;
 
 import java.util.Map;
 
+import org.apache.solr.client.solrj.impl.BaseCloudSolrClient;
 import org.apache.solr.cloud.Overseer;
 import org.apache.solr.cloud.OverseerSolrResponse;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.CollectionProperties;
 import org.apache.solr.common.cloud.ZkNodeProps;
+import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CollectionParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
@@ -33,6 +35,9 @@ import org.apache.solr.request.LocalSolrQueryRequest;
 
 import static org.apache.solr.cloud.api.collections.RoutedAlias.CREATE_COLLECTION_PREFIX;
 import static org.apache.solr.cloud.api.collections.RoutedAlias.ROUTED_ALIAS_NAME_CORE_PROP;
+import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS;
+import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
+import static org.apache.solr.common.cloud.ZkStateReader.TLOG_REPLICAS;
 import static org.apache.solr.common.params.CollectionAdminParams.COLL_CONF;
 import static org.apache.solr.common.params.CommonParams.NAME;
 
@@ -77,11 +82,12 @@ abstract class AliasCmd implements OverseerCollectionMessageHandler.Cmd {
     createMsgMap.put(Overseer.QUEUE_OPERATION, "create");
 
     NamedList results = new NamedList();
+    ZkNodeProps zkProps = new ZkNodeProps(createMsgMap);
     try {
       // Since we are running in the Overseer here, send the message directly to the Overseer CreateCollectionCmd.
      // note: there doesn't seem to be any point in locking on the collection name, so we don't. We currently should
       //   already have a lock on the alias name which should be sufficient.
-      ocmh.commandMap.get(CollectionParams.CollectionAction.CREATE).call(clusterState, new ZkNodeProps(createMsgMap), results);
+      ocmh.commandMap.get(CollectionParams.CollectionAction.CREATE).call(clusterState, zkProps, results);
     } catch (SolrException e) {
       // The collection might already exist, and that's okay -- we can adopt it.
       if (!e.getMessage().contains("collection already exists")) {
@@ -89,10 +95,13 @@ abstract class AliasCmd implements OverseerCollectionMessageHandler.Cmd {
       }
     }
 
-    CollectionsHandler.waitForActiveCollection(createCollName, ocmh.overseer.getCoreContainer(),
-        new OverseerSolrResponse(results));
+
+    int numShards = BaseCloudSolrClient.getShardNames(zkProps).size();
+    CollectionsHandler.waitForActiveCollection(createCollName, ocmh.overseer.getCoreContainer(), numShards, numShards * BaseCloudSolrClient.getTotalReplicas(zkProps));
     CollectionProperties collectionProperties = new CollectionProperties(ocmh.zkStateReader.getZkClient());
     collectionProperties.setCollectionProperty(createCollName,ROUTED_ALIAS_NAME_CORE_PROP,aliasName);
+
+    // nocommit make efficient
     while (!ocmh.zkStateReader.getCollectionProperties(createCollName,1000).containsKey(ROUTED_ALIAS_NAME_CORE_PROP)) {
       Thread.sleep(50);
     }
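
The new waitForActiveCollection(...) call above waits on explicit counts rather than an OverseerSolrResponse. A hedged sketch of the arithmetic it relies on follows; the method and parameter names are illustrative, not Solr API.

    // Expected totals for an "active collection" wait, assuming every
    // shard must carry all requested replica types.
    final class CollectionMath {
      static int expectedActiveReplicas(int numShards, int nrtReplicas,
                                        int tlogReplicas, int pullReplicas) {
        int perShard = nrtReplicas + tlogReplicas + pullReplicas;
        return numShards * perShard; // analogous to numShards * getTotalReplicas(zkProps)
      }
    }
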
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
index cfc401d..923f594 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/Assign.java
@@ -62,7 +62,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import static org.apache.solr.client.solrj.cloud.autoscaling.Policy.POLICY;
-import static org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.CREATE_NODE_SET;
 import static org.apache.solr.common.cloud.DocCollection.SNITCH;
 import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
 
@@ -235,9 +234,9 @@ public class Assign {
 
   public static List<String> getLiveOrLiveAndCreateNodeSetList(final Set<String> liveNodes, final ZkNodeProps message, final Random random) {
     List<String> nodeList;
-    final String createNodeSetStr = message.getStr(CREATE_NODE_SET);
+    final String createNodeSetStr = message.getStr(ZkStateReader.CREATE_NODE_SET);
     final List<String> createNodeList = (createNodeSetStr == null) ? null :
-        StrUtils.splitSmart((OverseerCollectionMessageHandler.CREATE_NODE_SET_EMPTY.equals(createNodeSetStr) ?
+        StrUtils.splitSmart((ZkStateReader.CREATE_NODE_SET_EMPTY.equals(createNodeSetStr) ?
             "" : createNodeSetStr), ",", true);
 
     if (createNodeList != null) {
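
For readers following the CREATE_NODE_SET move above, here is a JDK-only sketch of the parsing rule it preserves. The literal "EMPTY" marker value is an assumption standing in for ZkStateReader.CREATE_NODE_SET_EMPTY, and the split is a rough substitute for StrUtils.splitSmart.

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    final class NodeSets {
      static List<String> parseCreateNodeSet(String createNodeSetStr) {
        if (createNodeSetStr == null) {
          return null; // null means "no restriction": caller falls back to live nodes
        }
        if ("EMPTY".equals(createNodeSetStr)) {
          return Collections.emptyList(); // explicitly create with no cores
        }
        return Arrays.asList(createNodeSetStr.trim().split("\\s*,\\s*"));
      }
    }
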
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
index 4f00253..7261bb5 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java
@@ -21,6 +21,8 @@ package org.apache.solr.cloud.api.collections;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -29,6 +31,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Properties;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicReference;
@@ -40,19 +43,23 @@ import org.apache.solr.client.solrj.cloud.autoscaling.BadVersionException;
 import org.apache.solr.client.solrj.cloud.autoscaling.NotEmptyException;
 import org.apache.solr.client.solrj.cloud.autoscaling.PolicyHelper;
 import org.apache.solr.client.solrj.cloud.autoscaling.VersionedData;
+import org.apache.solr.client.solrj.impl.BaseCloudSolrClient;
 import org.apache.solr.cloud.Overseer;
 import org.apache.solr.cloud.ZkController;
 import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.ShardRequestTracker;
 import org.apache.solr.cloud.overseer.ClusterStateMutator;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.cloud.Aliases;
 import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.CollectionStatePredicate;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.DocRouter;
 import org.apache.solr.common.cloud.ImplicitDocRouter;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.ReplicaPosition;
+import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkConfigManager;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
@@ -65,6 +72,7 @@ import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.common.util.TimeSource;
 import org.apache.solr.common.util.Utils;
+import org.apache.solr.core.CoreContainer;
 import org.apache.solr.handler.admin.ConfigSetsHandlerApi;
 import org.apache.solr.handler.component.ShardHandler;
 import org.apache.solr.handler.component.ShardRequest;
@@ -95,11 +103,15 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
   private final OverseerCollectionMessageHandler ocmh;
   private final TimeSource timeSource;
   private final DistribStateManager stateManager;
+  private final ZkStateReader zkStateReader;
+  private final SolrCloudManager cloudManager;
 
-  public CreateCollectionCmd(OverseerCollectionMessageHandler ocmh) {
+  public CreateCollectionCmd(OverseerCollectionMessageHandler ocmh, CoreContainer cc, SolrCloudManager cloudManager, ZkStateReader zkStateReader) {
     this.ocmh = ocmh;
     this.stateManager = ocmh.cloudManager.getDistribStateManager();
     this.timeSource = ocmh.cloudManager.getTimeSource();
+    this.zkStateReader = zkStateReader;
+    this.cloudManager = cloudManager;
   }
 
   @Override
@@ -113,9 +125,9 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
     final boolean waitForFinalState = message.getBool(WAIT_FOR_FINAL_STATE, false);
     final String alias = message.getStr(ALIAS, collectionName);
     log.info("Create collection {}", collectionName);
-    if (clusterState.hasCollection(collectionName)) {
-      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "collection already exists: " + collectionName);
-    }
+//    if (clusterState.hasCollection(collectionName)) {
+//      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "collection already exists: " + collectionName);
+//    }
     if (aliases.hasAlias(collectionName)) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "collection alias already exists: " + collectionName);
     }
@@ -136,6 +148,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
     }
 
     String configName = getConfigName(collectionName, message);
+    log.info("configName={} colleciton={}", configName, collectionName);
     if (configName == null) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "No config set found to associate with the collection.");
     }
@@ -145,17 +158,24 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
     String router = message.getStr("router.name", DocRouter.DEFAULT_NAME);
 
     // fail fast if parameters are wrong or incomplete
-    List<String> shardNames = populateShardNames(message, router);
+    List<String> shardNames = BaseCloudSolrClient.populateShardNames(message, router);
     checkReplicaTypes(message);
 
+  // nocommit
+    for (String shardName : shardNames) {
+      System.out.println("make shard:" + shardName);
+      stateManager.makePath(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collectionName + "/" + shardName, null, CreateMode.PERSISTENT, false);
+      stateManager.makePath(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collectionName + "/leader_elect/" + shardName + "/election", null, CreateMode.PERSISTENT, false);
+    }
+
     AtomicReference<PolicyHelper.SessionWrapper> sessionWrapper = new AtomicReference<>();
 
     try {
 
       final String async = message.getStr(ASYNC);
 
-      ZkStateReader zkStateReader = ocmh.zkStateReader;
       boolean isLegacyCloud = Overseer.isLegacy(zkStateReader);
+      System.out.println("is legacycloud= " + isLegacyCloud);
 
       OverseerCollectionMessageHandler.createConfNode(stateManager, configName, collectionName, isLegacyCloud);
 
@@ -167,29 +187,43 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
           collectionParams.put(propName.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), (String) entry.getValue());
         }
       }
-
-      createCollectionZkNode(stateManager, collectionName, collectionParams);
-      
+      createCollectionZkNode(stateManager, collectionName, collectionParams, configName);
       ocmh.overseer.offerStateUpdate(Utils.toJSON(message));
 
+
+      // nocommit
       // wait for a while until we see the collection
-      TimeOut waitUntil = new TimeOut(30, TimeUnit.SECONDS, timeSource);
-      boolean created = false;
-      while (! waitUntil.hasTimedOut()) {
-        waitUntil.sleep(100);
-        created = ocmh.cloudManager.getClusterStateProvider().getClusterState().hasCollection(collectionName);
-        if(created) break;
-      }
-      if (!created) {
-        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Could not fully create collection: " + collectionName);
-      }
+
+      ocmh.zkStateReader.waitForState(collectionName, 10, TimeUnit.SECONDS, (n, c) -> c != null);
+
 
       // refresh cluster state
       clusterState = ocmh.cloudManager.getClusterStateProvider().getClusterState();
+      //zkStateReader.waitForState(collectionName,  15, TimeUnit.SECONDS, (l,c) -> c != null);
 
       List<ReplicaPosition> replicaPositions = null;
+//      try {
+//        replicaPositions = buildReplicaPositions(ocmh.cloudManager, clusterState,
+//                clusterState.getCollection(collectionName), message, shardNames, sessionWrapper);
+//      } catch (Exception e) {
+//        ParWork.propegateInterrupt(e);
+//        SolrException exp = new SolrException(ErrorCode.SERVER_ERROR, "call(ClusterState=" + clusterState + ", ZkNodeProps=" + message + ", NamedList=" + results + ")", e);
+//        try {
+//          ZkNodeProps deleteMessage = new ZkNodeProps("name", collectionName);
+//          new DeleteCollectionCmd(ocmh).call(clusterState, deleteMessage, results);
+//          // unwrap the exception
+//        } catch (Exception e1) {
+//          ParWork.propegateInterrupt(e1);
+//          exp.addSuppressed(e1);
+//        }
+//        throw exp;
+//      }
+
+      DocCollection docCollection = buildDocCollection(message, false);
+     // DocCollection docCollection = clusterState.getCollection(collectionName);
       try {
-        replicaPositions = buildReplicaPositions(ocmh.cloudManager, clusterState, clusterState.getCollection(collectionName), message, shardNames, sessionWrapper);
+        replicaPositions = buildReplicaPositions(cloudManager, clusterState,
+                docCollection, message, shardNames, sessionWrapper);
       } catch (Assign.AssignmentException e) {
         ZkNodeProps deleteMessage = new ZkNodeProps("name", collectionName);
         new DeleteCollectionCmd(ocmh).call(clusterState, deleteMessage, results);
@@ -228,28 +262,37 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
           }
         }
 
-        String coreName = Assign.buildSolrCoreName(ocmh.cloudManager.getDistribStateManager(),
-            ocmh.cloudManager.getClusterStateProvider().getClusterState().getCollection(collectionName),
-            replicaPosition.shard, replicaPosition.type, true);
-        if (log.isDebugEnabled()) {
-          log.debug(formatString("Creating core {0} as part of shard {1} of collection {2} on {3}"
-              , coreName, replicaPosition.shard, collectionName, nodeName));
-        }
+        String coreName = Assign.buildSolrCoreName(cloudManager.getDistribStateManager(),
+                docCollection,
+                replicaPosition.shard, replicaPosition.type, true);
+        log.info(formatString("Creating core {0} as part of shard {1} of collection {2} on {3}"
+                , coreName, replicaPosition.shard, collectionName, nodeName));
+
 
         String baseUrl = zkStateReader.getBaseUrlForNodeName(nodeName);
         //in the new mode, create the replica in clusterstate prior to creating the core.
         // Otherwise the core creation fails
+
+        log.info("Base url for replica={}", baseUrl);
+
         if (!isLegacyCloud) {
-          ZkNodeProps props = new ZkNodeProps(
-              Overseer.QUEUE_OPERATION, ADDREPLICA.toString(),
-              ZkStateReader.COLLECTION_PROP, collectionName,
-              ZkStateReader.SHARD_ID_PROP, replicaPosition.shard,
-              ZkStateReader.CORE_NAME_PROP, coreName,
-              ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(),
-              ZkStateReader.BASE_URL_PROP, baseUrl,
-              ZkStateReader.NODE_NAME_PROP, nodeName,
-              ZkStateReader.REPLICA_TYPE, replicaPosition.type.name(),
-              CommonAdminParams.WAIT_FOR_FINAL_STATE, Boolean.toString(waitForFinalState));
+
+          ZkNodeProps props = new ZkNodeProps();
+          props.getProperties().putAll(message.getProperties());
+          ZkNodeProps addReplicaProps = new ZkNodeProps(
+                  Overseer.QUEUE_OPERATION, ADDREPLICA.toString(),
+                  ZkStateReader.COLLECTION_PROP, collectionName,
+                  ZkStateReader.SHARD_ID_PROP, replicaPosition.shard,
+                  ZkStateReader.CORE_NAME_PROP, coreName,
+                  ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(),
+                  ZkStateReader.BASE_URL_PROP, baseUrl,
+                  ZkStateReader.NODE_NAME_PROP, nodeName,
+                  ZkStateReader.REPLICA_TYPE, replicaPosition.type.name(),
+                  ZkStateReader.NUM_SHARDS_PROP, message.getStr(ZkStateReader.NUM_SHARDS_PROP),
+                      "shards", message.getStr("shards"),
+                  CommonAdminParams.WAIT_FOR_FINAL_STATE, Boolean.toString(waitForFinalState));
+          props.getProperties().putAll(addReplicaProps.getProperties());
+          log.info("Sending state update to populate clusterstate with new replica {}", props);
           ocmh.overseer.offerStateUpdate(Utils.toJSON(props));
         }
 
@@ -281,6 +324,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
         sreq.params = params;
 
         if (isLegacyCloud) {
+          log.info("Submit request to shard for legacyCloud for replica={}", baseUrl);
           shardHandler.submit(sreq, sreq.shards[0], sreq.params);
         } else {
           coresToCreate.put(coreName, sreq);
@@ -289,11 +333,29 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
 
       if(!isLegacyCloud) {
         // wait for all replica entries to be created
-        Map<String, Replica> replicas = ocmh.waitToSeeReplicasInState(collectionName, coresToCreate.keySet());
-        for (Map.Entry<String, ShardRequest> e : coresToCreate.entrySet()) {
-          ShardRequest sreq = e.getValue();
-          sreq.params.set(CoreAdminParams.CORE_NODE_NAME, replicas.get(e.getKey()).getName());
-          shardHandler.submit(sreq, sreq.shards[0], sreq.params);
+        Map<String,Replica> replicas = new HashMap<>();
+        zkStateReader.waitForState(collectionName, 10, TimeUnit.SECONDS, expectedReplicas(coresToCreate.size(), replicas)); // nocommit - timeout - keep this below containing timeouts - need central timeout stuff
+       // nocommit, what if replicas come back wrong?
+        if (replicas.size() > 0) {
+          for (Map.Entry<String, ShardRequest> e : coresToCreate.entrySet()) {
+            ShardRequest sreq = e.getValue();
+            for (Replica rep : replicas.values()) {
+              if (rep.getCoreName().equals(sreq.params.get(CoreAdminParams.NAME)) && rep.getBaseUrl().equals(sreq.shards[0])) {
+                sreq.params.set(CoreAdminParams.CORE_NODE_NAME, rep.getName());
+                break;
+              }
+            }
+//            Replica replica = replicas.get(e.getKey());
+//
+//            if (replica != null) {
+//              String coreNodeName = replica.getName();
+//              sreq.params.set(CoreAdminParams.CORE_NODE_NAME, coreNodeName);
+//              log.info("Set the {} for replica {} to {}", CoreAdminParams.CORE_NODE_NAME, replica, coreNodeName);
+//            }
+
+            log.info("Submit request to shard for for replica={}", sreq.actualShards != null ? Arrays.asList(sreq.actualShards) : "null");
+            shardHandler.submit(sreq, sreq.shards[0], sreq.params);
+          }
         }
       }
 
@@ -306,7 +368,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
         // element, which may be interpreted by the user as a positive ack
         ocmh.cleanupCollection(collectionName, new NamedList<Object>());
         log.info("Cleaned up artifacts for failed create collection for [{}]", collectionName);
-        throw new SolrException(ErrorCode.BAD_REQUEST, "Underlying core creation failed while creating collection: " + collectionName);
+        throw new SolrException(ErrorCode.BAD_REQUEST, "Underlying core creation failed while creating collection: " + collectionName + "\n" + results);
       } else {
         log.debug("Finished create command on all shards for collection: {}", collectionName);
 
@@ -318,6 +380,10 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
               + " is enabled by default, which is NOT RECOMMENDED for production use. To turn it off:"
               + " curl http://{host:port}/solr/" + collectionName + "/config -d '{\"set-user-property\": {\"update.autoCreateFields\":\"false\"}}'");
         }
+        if (async != null) {
+          zkStateReader.waitForState(collectionName, 30, TimeUnit.SECONDS, BaseCloudSolrClient.expectedShardsAndActiveReplicas(shardNames.size(), replicaPositions.size()));
+        }
+
       }
 
       // modify the `withCollection` and store this new collection's name with it
@@ -328,7 +394,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
             CollectionAdminParams.COLOCATED_WITH, collectionName);
         ocmh.overseer.offerStateUpdate(Utils.toJSON(props));
         try {
-          zkStateReader.waitForState(withCollection, 5, TimeUnit.SECONDS, (collectionState) -> collectionName.equals(collectionState.getStr(COLOCATED_WITH)));
+          zkStateReader.waitForState(withCollection, 30, TimeUnit.SECONDS, (collectionState) -> collectionName.equals(collectionState.getStr(COLOCATED_WITH)));
         } catch (TimeoutException e) {
           log.warn("Timed out waiting to see the {} property set on collection: {}", COLOCATED_WITH, withCollection);
           // maybe the overseer queue is backed up, we don't want to fail the create request
@@ -341,6 +407,9 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
         ocmh.zkStateReader.aliasesManager.applyModificationAndExportToZk(a -> a.cloneWithCollectionAlias(alias, collectionName));
       }
 
+    } catch (InterruptedException ex) {
+      ParWork.propegateInterrupt(ex);
+      throw ex;
     } catch (SolrException ex) {
       throw ex;
     } catch (Exception ex) {
@@ -355,6 +424,10 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
                                                             ZkNodeProps message,
                                                             List<String> shardNames,
                                                             AtomicReference<PolicyHelper.SessionWrapper> sessionWrapper) throws IOException, InterruptedException, Assign.AssignmentException {
+  //  if (log.isDebugEnabled()) {
+      log.info("buildReplicaPositions(SolrCloudManager cloudManager={}, ClusterState clusterState={}, DocCollection docCollection={}, ZkNodeProps message={}, List<String> shardNames={}, AtomicReference<PolicyHelper.SessionWrapper> sessionWrapper={}) - start", cloudManager, clusterState, docCollection, message, shardNames, sessionWrapper);
+   // }
+
     final String collectionName = message.getStr(NAME);
     // look at the replication factor and see if it matches reality
     // if it does not, find best nodes to create more cores
@@ -373,49 +446,56 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
     List<ReplicaPosition> replicaPositions;
     List<String> nodeList = Assign.getLiveOrLiveAndCreateNodeSetList(clusterState.getLiveNodes(), message, OverseerCollectionMessageHandler.RANDOM);
     if (nodeList.isEmpty()) {
-      log.warn("It is unusual to create a collection ({}) without cores.", collectionName);
+      log.warn("It is unusual to create a collection ("+collectionName+") without cores. liveNodes={} message={}", clusterState.getLiveNodes(), message);
 
       replicaPositions = new ArrayList<>();
     } else {
       int totalNumReplicas = numNrtReplicas + numTlogReplicas + numPullReplicas;
       if (totalNumReplicas > nodeList.size()) {
-        log.warn("Specified number of replicas of {} on collection {} is higher than the number of Solr instances currently live or live and part of your {}({}). {}"
-            , totalNumReplicas
-            , collectionName
-            , OverseerCollectionMessageHandler.CREATE_NODE_SET
-            , nodeList.size()
-            , "It's unusual to run two replica of the same slice on the same Solr-instance.");
+        log.warn("Specified number of replicas of "
+                + totalNumReplicas
+                + " on collection "
+                + collectionName
+                + " is higher than the number of Solr instances currently live or live and part of your " + ZkStateReader.CREATE_NODE_SET + "("
+                + nodeList.size()
+                + "). It's unusual to run two replica of the same slice on the same Solr-instance.");
       }
 
       int maxShardsAllowedToCreate = maxShardsPerNode == Integer.MAX_VALUE ?
-          Integer.MAX_VALUE :
-          maxShardsPerNode * nodeList.size();
+              Integer.MAX_VALUE :
+              maxShardsPerNode * nodeList.size();
       int requestedShardsToCreate = numSlices * totalNumReplicas;
       if (maxShardsAllowedToCreate < requestedShardsToCreate) {
-        throw new Assign.AssignmentException("Cannot create collection " + collectionName + ". Value of "
-            + MAX_SHARDS_PER_NODE + " is " + maxShardsPerNode
-            + ", and the number of nodes currently live or live and part of your "+OverseerCollectionMessageHandler.CREATE_NODE_SET+" is " + nodeList.size()
-            + ". This allows a maximum of " + maxShardsAllowedToCreate
-            + " to be created. Value of " + OverseerCollectionMessageHandler.NUM_SLICES + " is " + numSlices
-            + ", value of " + NRT_REPLICAS + " is " + numNrtReplicas
-            + ", value of " + TLOG_REPLICAS + " is " + numTlogReplicas
-            + " and value of " + PULL_REPLICAS + " is " + numPullReplicas
-            + ". This requires " + requestedShardsToCreate
-            + " shards to be created (higher than the allowed number)");
+        String msg = "Cannot create collection " + collectionName + ". Value of "
+                + MAX_SHARDS_PER_NODE + " is " + maxShardsPerNode
+                + ", and the number of nodes currently live or live and part of your "+ZkStateReader.CREATE_NODE_SET+" is " + nodeList.size()
+                + ". This allows a maximum of " + maxShardsAllowedToCreate
+                + " to be created. Value of " + ZkStateReader.NUM_SHARDS_PROP + " is " + numSlices
+                + ", value of " + NRT_REPLICAS + " is " + numNrtReplicas
+                + ", value of " + TLOG_REPLICAS + " is " + numTlogReplicas
+                + " and value of " + PULL_REPLICAS + " is " + numPullReplicas
+                + ". This requires " + requestedShardsToCreate
+                + " shards to be created (higher than the allowed number)";
+
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, msg);
       }
       Assign.AssignRequest assignRequest = new Assign.AssignRequestBuilder()
-          .forCollection(collectionName)
-          .forShard(shardNames)
-          .assignNrtReplicas(numNrtReplicas)
-          .assignTlogReplicas(numTlogReplicas)
-          .assignPullReplicas(numPullReplicas)
-          .onNodes(nodeList)
-          .build();
+              .forCollection(collectionName)
+              .forShard(shardNames)
+              .assignNrtReplicas(numNrtReplicas)
+              .assignTlogReplicas(numTlogReplicas)
+              .assignPullReplicas(numPullReplicas)
+              .onNodes(nodeList)
+              .build();
       Assign.AssignStrategyFactory assignStrategyFactory = new Assign.AssignStrategyFactory(cloudManager);
       Assign.AssignStrategy assignStrategy = assignStrategyFactory.create(clusterState, docCollection);
       replicaPositions = assignStrategy.assign(cloudManager, assignRequest);
       sessionWrapper.set(PolicyHelper.getLastSessionWrapper(true));
     }
+
+    if (log.isDebugEnabled()) {
+      log.debug("buildReplicaPositions(SolrCloudManager, ClusterState, DocCollection, ZkNodeProps, List<String>, AtomicReference<PolicyHelper.SessionWrapper>) - end");
+    }
     return replicaPositions;
   }
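
A worked sketch of the maxShardsPerNode capacity check in the hunk above, with illustrative names and JDK types only: N eligible nodes allow at most maxShardsPerNode * N cores, while the request needs numSlices * (nrt + tlog + pull) of them.

    final class CapacityCheck {
      static void checkCapacity(int maxShardsPerNode, int numNodes,
                                int numSlices, int nrt, int tlog, int pull) {
        long allowed = maxShardsPerNode == Integer.MAX_VALUE
            ? Long.MAX_VALUE
            : (long) maxShardsPerNode * numNodes;
        long requested = (long) numSlices * (nrt + tlog + pull);
        if (requested > allowed) {
          throw new IllegalArgumentException("request needs " + requested
              + " cores but the cluster allows at most " + allowed);
        }
      }
    }
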
 
@@ -428,22 +508,93 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
     }
   }
 
-  public static List<String> populateShardNames(ZkNodeProps message, String router) {
-    List<String> shardNames = new ArrayList<>();
-    Integer numSlices = message.getInt(OverseerCollectionMessageHandler.NUM_SLICES, null);
-    if (ImplicitDocRouter.NAME.equals(router)) {
-      ClusterStateMutator.getShardNames(shardNames, message.getStr("shards", null));
-      numSlices = shardNames.size();
+  public static DocCollection buildDocCollection(ZkNodeProps message, boolean withDocRouter) {
+    log.info("buildDocCollection {}", message);
+    withDocRouter = true;
+    String cName = message.getStr(NAME);
+    DocRouter router = null;
+    Map<String,Object> routerSpec = null;
+    if (withDocRouter) {
+      routerSpec = DocRouter.getRouterSpec(message);
+      String routerName = routerSpec.get(NAME) == null ? DocRouter.DEFAULT_NAME : (String) routerSpec.get(NAME);
+      router = DocRouter.getDocRouter(routerName);
+    }
+    Object messageShardsObj = message.get("shards");
+
+    Map<String,Slice> slices;
+    if (messageShardsObj instanceof Map) { // we are being explicitly told the slice data (e.g. coll restore)
+      slices = Slice.loadAllFromMap(message.getStr(ZkStateReader.COLLECTION_PROP), (Map<String,Object>) messageShardsObj);
     } else {
-      if (numSlices == null) {
-        throw new SolrException(ErrorCode.BAD_REQUEST, OverseerCollectionMessageHandler.NUM_SLICES + " is a required param (when using CompositeId router).");
+      List<String> shardNames = new ArrayList<>();
+      if (withDocRouter) {
+        if (router instanceof ImplicitDocRouter) {
+          getShardNames(shardNames, message.getStr("shards", DocRouter.DEFAULT_NAME));
+        } else {
+          int numShards = message.getInt(ZkStateReader.NUM_SHARDS_PROP, -1);
+          if (numShards < 1)
+            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+                    "numShards is a required parameter for the 'compositeId' router: " + message);
+          getShardNames(numShards, shardNames);
+        }
+      }
+
+      List<DocRouter.Range> ranges = null;
+      if (withDocRouter) {
+        ranges = router.partitionRange(shardNames.size(), router.fullRange());// maybe null
+      }
+      slices = new LinkedHashMap<>();
+      for (int i = 0; i < shardNames.size(); i++) {
+        String sliceName = shardNames.get(i);
+
+        Map<String,Object> sliceProps = new LinkedHashMap<>(1);
+
+        if (withDocRouter) {
+          sliceProps.put(Slice.RANGE, ranges == null ? null : ranges.get(i));
+        }
+
+        slices.put(sliceName, new Slice(sliceName, null, sliceProps, message.getStr(ZkStateReader.COLLECTION_PROP)));
+
       }
-      if (numSlices <= 0) {
-        throw new SolrException(ErrorCode.BAD_REQUEST, OverseerCollectionMessageHandler.NUM_SLICES + " must be > 0");
+    }
+
+    Map<String,Object> collectionProps = new HashMap<>();
+
+    for (Map.Entry<String,Object> e : OverseerCollectionMessageHandler.COLLECTION_PROPS_AND_DEFAULTS.entrySet()) {
+      Object val = message.get(e.getKey());
+      if (val == null) {
+        val = OverseerCollectionMessageHandler.COLLECTION_PROPS_AND_DEFAULTS.get(e.getKey());
+      }
+      if (val != null) collectionProps.put(e.getKey(), val);
+    }
+    if (withDocRouter) {
+      collectionProps.put(DocCollection.DOC_ROUTER, routerSpec);
+    }
+    if (withDocRouter) {
+
+      if (message.getStr("fromApi") == null) {
+        collectionProps.put("autoCreated", "true");
       }
-      ClusterStateMutator.getShardNames(numSlices, shardNames);
     }
-    return shardNames;
+
+    // TODO default to 2; but need to debug why BasicDistributedZk2Test fails early on
+    String znode = message.getInt(DocCollection.STATE_FORMAT, 1) == 1 ? ZkStateReader.CLUSTER_STATE
+            : ZkStateReader.getCollectionPath(cName);
+
+    DocCollection newCollection = new DocCollection(cName,
+            slices, collectionProps, router, -1, znode);
+
+    return newCollection;
+  }
+
+  public static void getShardNames(List<String> shardNames, String shards) {
+    if (shards == null)
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "shards" + " is a required param");
+    for (String s : shards.split(",")) {
+      if (s == null || s.trim().isEmpty()) continue;
+      shardNames.add(s.trim());
+    }
+    if (shardNames.isEmpty())
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "shards" + " is a required param");
   }
 
   String getConfigName(String coll, ZkNodeProps message) throws KeeperException, InterruptedException {
@@ -493,90 +644,98 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
     }
   }
 
-  public static void createCollectionZkNode(DistribStateManager stateManager, String collection, Map<String,String> params) {
-    log.debug("Check for collection zkNode: {}", collection);
+  public static void createCollectionZkNode(DistribStateManager stateManager, String collection, Map<String,String> params, String configName) {
+    if (log.isDebugEnabled()) {
+      log.debug("createCollectionZkNode(DistribStateManager stateManager={}, String collection={}, Map<String,String> params={}) - start", stateManager, collection, params);
+    }
+
     String collectionPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection;
     // clean up old terms node
     String termsPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms";
     try {
       stateManager.removeRecursively(termsPath, true, true);
-    } catch (InterruptedException e) {
-      Thread.interrupted();
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Error deleting old term nodes for collection from Zookeeper", e);
-    } catch (KeeperException | IOException | NotEmptyException | BadVersionException e) {
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Error deleting old term nodes for collection from Zookeeper", e);
+    } catch (Exception e) {
+      log.error("", e);
+      ParWork.propegateInterrupt(e);
+      throw new SolrException(ErrorCode.SERVER_ERROR, "createCollectionZkNode(DistribStateManager=" + stateManager + ", String=" + collection + ", Map<String,String>=" + params + ")", e);
     }
     try {
-      if (!stateManager.hasData(collectionPath)) {
-        log.debug("Creating collection in ZooKeeper: {}", collection);
-
-        try {
-          Map<String,Object> collectionProps = new HashMap<>();
-
-          if (params.size() > 0) {
-            collectionProps.putAll(params);
-            // if the config name wasn't passed in, use the default
-            if (!collectionProps.containsKey(ZkController.CONFIGNAME_PROP)) {
-              // users can create the collection node and conf link ahead of time, or this may return another option
-              getConfName(stateManager, collection, collectionPath, collectionProps);
-            }
+      log.info("Creating collection in ZooKeeper:" + collection);
 
-          } else if (System.getProperty("bootstrap_confdir") != null) {
-            String defaultConfigName = System.getProperty(ZkController.COLLECTION_PARAM_PREFIX + ZkController.CONFIGNAME_PROP, collection);
+      Map<String,Object> collectionProps = new HashMap<>();
 
-            // if we are bootstrapping a collection, default the config for
-            // a new collection to the collection we are bootstrapping
-            log.info("Setting config for collection: {} to {}", collection, defaultConfigName);
+      if (params.size() > 0) {
+        collectionProps.putAll(params);
+        // if the config name wasn't passed in, use the default
+        if (!collectionProps.containsKey(ZkController.CONFIGNAME_PROP)) {
+          // users can create the collection node and conf link ahead of time, or this may return another option
+          getConfName(stateManager, collection, collectionPath, collectionProps);
+        }
 
-            Properties sysProps = System.getProperties();
-            for (String sprop : System.getProperties().stringPropertyNames()) {
-              if (sprop.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) {
-                collectionProps.put(sprop.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), sysProps.getProperty(sprop));
-              }
-            }
+      } else if (System.getProperty("bootstrap_confdir") != null) {
+        String defaultConfigName = System
+                .getProperty(ZkController.COLLECTION_PARAM_PREFIX + ZkController.CONFIGNAME_PROP, collection);
 
-            // if the config name wasn't passed in, use the default
-            if (!collectionProps.containsKey(ZkController.CONFIGNAME_PROP))
-              collectionProps.put(ZkController.CONFIGNAME_PROP, defaultConfigName);
+        // if we are bootstrapping a collection, default the config for
+        // a new collection to the collection we are bootstrapping
+        log.info("Setting config for collection:" + collection + " to " + defaultConfigName);
 
-          } else if (Boolean.getBoolean("bootstrap_conf")) {
-            // the conf name should should be the collection name of this core
-            collectionProps.put(ZkController.CONFIGNAME_PROP, collection);
-          } else {
-            getConfName(stateManager, collection, collectionPath, collectionProps);
+        Properties sysProps = System.getProperties();
+        for (String sprop : System.getProperties().stringPropertyNames()) {
+          if (sprop.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) {
+            collectionProps.put(sprop.substring(ZkController.COLLECTION_PARAM_PREFIX.length()),
+                    sysProps.getProperty(sprop));
           }
+        }
 
-          collectionProps.remove(ZkStateReader.NUM_SHARDS_PROP);  // we don't put numShards in the collections properties
-
-          ZkNodeProps zkProps = new ZkNodeProps(collectionProps);
-          stateManager.makePath(collectionPath, Utils.toJSON(zkProps), CreateMode.PERSISTENT, false);
+        // if the config name wasn't passed in, use the default
+        if (!collectionProps.containsKey(ZkController.CONFIGNAME_PROP))
+          collectionProps.put(ZkController.CONFIGNAME_PROP, defaultConfigName);
 
-        } catch (KeeperException e) {
-          //TODO shouldn't the stateManager ensure this does not happen; should throw AlreadyExistsException
-          // it's okay if the node already exists
-          if (e.code() != KeeperException.Code.NODEEXISTS) {
-            throw e;
-          }
-        } catch (AlreadyExistsException e) {
-          // it's okay if the node already exists
-        }
+      } else if (Boolean.getBoolean("bootstrap_conf")) {
+        // the conf name should should be the collection name of this core
+        collectionProps.put(ZkController.CONFIGNAME_PROP, collection);
       } else {
-        log.debug("Collection zkNode exists");
+        getConfName(stateManager, collection, collectionPath, collectionProps);
       }
 
-    } catch (KeeperException e) {
-      // it's okay if another beats us creating the node
-      if (e.code() == KeeperException.Code.NODEEXISTS) {
-        return;
-      }
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e);
-    } catch (IOException e) {
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e);
-    } catch (InterruptedException e) {
-      Thread.interrupted();
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e);
+      collectionProps.remove(ZkStateReader.NUM_SHARDS_PROP); // we don't put numShards in the collections properties
+
+      // nocommit make efficient
+      collectionProps.put(ZkController.CONFIGNAME_PROP, configName);
+      ZkNodeProps zkProps = new ZkNodeProps(collectionProps);
+      stateManager.makePath(collectionPath, Utils.toJSON(zkProps), CreateMode.PERSISTENT, false);
+      stateManager.makePath(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection
+              + "/leader_elect/", null, CreateMode.PERSISTENT, false);
+      stateManager.makePath(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/"
+              + ZkStateReader.SHARD_LEADERS_ZKNODE, null, CreateMode.PERSISTENT, false);
+
+      System.out.println("make state.json path:" + ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + ZkStateReader.STATE_JSON);
+      stateManager.makePath(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + ZkStateReader.STATE_JSON,
+              ZkStateReader.emptyJson, CreateMode.PERSISTENT, false);
+
+      stateManager.makePath(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms", null, CreateMode.PERSISTENT,
+              false);
+
+    } catch (Exception e) {
+      log.error("", e);
+      ParWork.propegateInterrupt(e);
+      throw new SolrException(ErrorCode.SERVER_ERROR, "createCollectionZkNode(DistribStateManager=" + stateManager + ", String=" + collection + ", Map<String,String>=" + params + ")", e);
     }
 
+
+    if (log.isDebugEnabled()) {
+      log.debug("createCollectionZkNode(DistribStateManager, String, Map<String,String>) - end");
+    }
+  }
+
+  public static void getShardNames(Integer numShards, List<String> shardNames) {
+    if (numShards == null)
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "numShards" + " is a required param");
+    for (int i = 0; i < numShards; i++) {
+      final String sliceName = "shard" + (i + 1);
+      shardNames.add(sliceName);
+    }
   }
 
   private static void getConfName(DistribStateManager stateManager, String collection, String collectionPath, Map<String,Object> collectionProps) throws IOException,
@@ -640,4 +799,30 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
           "Could not find configName for collection " + collection + " found:" + configNames);
     }
   }
+
+  public static CollectionStatePredicate expectedReplicas(int expectedReplicas, Map<String,Replica> replicaMap) {
+    log.info("Wait for expectedReplicas={}", expectedReplicas);
+
+    return (liveNodes, collectionState) -> {
+      if (collectionState == null)
+        return false;
+      if (collectionState.getSlices() == null) {
+        return false;
+      }
+
+      int replicas = 0;
+      for (Slice slice : collectionState) {
+        for (Replica replica : slice) {
+            replicaMap.put(replica.getCoreName(), replica);
+            replicas++;
+        }
+      }
+      if (replicas == expectedReplicas) {
+        return true;
+      }
+
+      return false;
+    };
+  }
+
 }
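
The expectedReplicas(...) predicate added above both tests the collection state and records what it saw. A compact sketch of that dual-purpose shape follows, with simplified stand-in types; the real CollectionStatePredicate, DocCollection, Slice and Replica classes differ.

    import java.util.List;
    import java.util.Map;
    import java.util.function.BiPredicate;

    record Rep(String coreName) {}
    record Shard(List<Rep> reps) {}
    record Coll(List<Shard> shards) {}

    final class WaitPredicates {
      static BiPredicate<List<String>, Coll> expectedReplicas(int expected,
                                                              Map<String, Rep> out) {
        return (liveNodes, coll) -> {
          if (coll == null) return false;
          int seen = 0;
          for (Shard s : coll.shards()) {
            for (Rep r : s.reps()) {
              out.put(r.coreName(), r); // side channel: caller reuses what was seen
              seen++;
            }
          }
          return seen == expected;
        };
      }
    }
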
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateShardCmd.java
index 989003a..91b1692 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/CreateShardCmd.java
@@ -83,7 +83,10 @@ public class CreateShardCmd implements OverseerCollectionMessageHandler.Cmd {
     // wait for a while until we see the shard
     //ocmh.waitForNewShard(collectionName, sliceName);
     // wait for a while until we see the shard and update the local view of the cluster state
-    clusterState = ocmh.waitForNewShard(collectionName, sliceName);
+    ocmh.waitForNewShard(collectionName, sliceName);
+
+    // refresh clusterstate
+    clusterState = ocmh.zkStateReader.getClusterState();
 
     String async = message.getStr(ASYNC);
     ZkNodeProps addReplicasProps = new ZkNodeProps(
@@ -92,7 +95,7 @@ public class CreateShardCmd implements OverseerCollectionMessageHandler.Cmd {
         ZkStateReader.NRT_REPLICAS, String.valueOf(numNrtReplicas),
         ZkStateReader.TLOG_REPLICAS, String.valueOf(numTlogReplicas),
         ZkStateReader.PULL_REPLICAS, String.valueOf(numPullReplicas),
-        OverseerCollectionMessageHandler.CREATE_NODE_SET, message.getStr(OverseerCollectionMessageHandler.CREATE_NODE_SET),
+        ZkStateReader.CREATE_NODE_SET, message.getStr(ZkStateReader.CREATE_NODE_SET),
         CommonAdminParams.WAIT_FOR_FINAL_STATE, Boolean.toString(waitForFinalState));
 
     Map<String, Object> propertyParams = new HashMap<>();
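
The change above splits "wait for the shard" from "re-read the cluster state" instead of having the wait hand back a possibly stale snapshot. A generic, JDK-only sketch of such a poll-until-deadline helper (names illustrative):

    import java.util.concurrent.TimeUnit;
    import java.util.function.BooleanSupplier;

    final class Waits {
      static void waitFor(BooleanSupplier condition, long timeout, TimeUnit unit)
          throws InterruptedException {
        long deadline = System.nanoTime() + unit.toNanos(timeout);
        while (!condition.getAsBoolean()) {
          if (System.nanoTime() > deadline) {
            throw new IllegalStateException("timed out waiting for condition");
          }
          Thread.sleep(50); // coarse poll; the real code uses ZK watches where it can
        }
      }
    }
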
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java
index 70d8d2b..6c81a0b 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteCollectionCmd.java
@@ -70,6 +70,7 @@ public class DeleteCollectionCmd implements OverseerCollectionMessageHandler.Cmd
 
   @Override
   public void call(ClusterState state, ZkNodeProps message, @SuppressWarnings({"rawtypes"})NamedList results) throws Exception {
+    log.info("delete collection called");
     Object o = message.get(MaintainRoutedAliasCmd.INVOKED_BY_ROUTED_ALIAS);
     if (o != null) {
       ((Runnable)o).run(); // this will ensure the collection is removed from the alias before it disappears.
@@ -133,7 +134,7 @@ public class DeleteCollectionCmd implements OverseerCollectionMessageHandler.Cmd
       ZkNodeProps internalMsg = message.plus(NAME, collection);
 
       @SuppressWarnings({"unchecked"})
-      List<Replica> failedReplicas = ocmh.collectionCmd(internalMsg, params, results, null, asyncId, okayExceptions);
+      List<Replica> failedReplicas = ocmh.collectionCmd(internalMsg, params, results, null, null, okayExceptions);
       for (Replica failedReplica : failedReplicas) {
         boolean isSharedFS = failedReplica.getBool(ZkStateReader.SHARED_STORAGE_PROP, false) && failedReplica.get("dataDir") != null;
         if (isSharedFS) {
@@ -148,7 +149,7 @@ public class DeleteCollectionCmd implements OverseerCollectionMessageHandler.Cmd
       ocmh.overseer.offerStateUpdate(Utils.toJSON(m));
 
       // wait for a while until we don't see the collection
-      zkStateReader.waitForState(collection, 60, TimeUnit.SECONDS, (collectionState) -> collectionState == null);
+      zkStateReader.waitForState(collection, 10, TimeUnit.SECONDS, (collectionState) -> collectionState == null);
 
       // we can delete any remaining unique aliases
       if (!aliasReferences.isEmpty()) {
@@ -176,17 +177,18 @@ public class DeleteCollectionCmd implements OverseerCollectionMessageHandler.Cmd
 //            "Could not fully remove collection: " + collection);
 //      }
     } finally {
-
+      // HUH? This is delete collection, taking out /collections/name
+      // How can you leave /collections/name/counter?
       try {
         String collectionPath =  ZkStateReader.getCollectionPathRoot(collection);
-        if (zkStateReader.getZkClient().exists(collectionPath, true)) {
-          if (removeCounterNode) {
-            zkStateReader.getZkClient().clean(collectionPath);
-          } else {
+
+//          if (removeCounterNode) {
+//            zkStateReader.getZkClient().clean(collectionPath);
+//          } else {
             final String counterNodePath = Assign.getCounterNodePath(collection);
             zkStateReader.getZkClient().clean(collectionPath, s -> !s.equals(counterNodePath));
-          }
-        }
+     //     }
+
       } catch (InterruptedException e) {
         SolrException.log(log, "Cleaning up collection in zk was interrupted:"
             + collection, e);
@@ -194,6 +196,9 @@ public class DeleteCollectionCmd implements OverseerCollectionMessageHandler.Cmd
       } catch (KeeperException e) {
         SolrException.log(log, "Problem cleaning up collection in zk:"
             + collection, e);
+        if (e instanceof KeeperException.SessionExpiredException) {
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+        }
       }
     }
   }
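
On the filtered cleanup above (remove everything under the collection path except the counter node), here is a hypothetical sketch. ZkLike is a stand-in interface, not Solr's SolrZkClient, and a real implementation must tolerate parents that stay non-empty because a filtered child was kept.

    import java.util.List;
    import java.util.function.Predicate;

    interface ZkLike {
      List<String> children(String path) throws Exception;
      void delete(String path) throws Exception;
    }

    final class TreeClean {
      static void clean(ZkLike zk, String path, Predicate<String> shouldDelete)
          throws Exception {
        for (String child : zk.children(path)) {
          clean(zk, path + "/" + child, shouldDelete);
        }
        if (shouldDelete.test(path)) {
          zk.delete(path); // may fail or no-op if a filtered child survived below
        }
      }
    }

    // Usage analogous to the patch:
    // clean(zk, collectionPath, s -> !s.equals(counterNodePath));
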
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteReplicaCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteReplicaCmd.java
index c263203..ec0d649 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteReplicaCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteReplicaCmd.java
@@ -34,11 +34,13 @@ import java.util.concurrent.Callable;
 
 import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.Cmd;
 import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler.ShardRequestTracker;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CoreAdminParams;
@@ -148,17 +150,22 @@ public class DeleteReplicaCmd implements Cmd {
       }
     }
 
-    for (Map.Entry<Slice, Set<String>> entry : shardToReplicasMapping.entrySet()) {
-      Slice shardSlice = entry.getKey();
-      String shardId = shardSlice.getName();
-      Set<String> replicas = entry.getValue();
-      //callDeleteReplica on all replicas
-      for (String replica: replicas) {
-        log.debug("Deleting replica {}  for shard {} based on count {}", replica, shardId, count);
-        deleteCore(shardSlice, collectionName, replica, message, shard, results, onComplete, parallel);
+    try (ParWork worker = new ParWork(this)) {
+
+      for (Map.Entry<Slice,Set<String>> entry : shardToReplicasMapping.entrySet()) {
+        Slice shardSlice = entry.getKey();
+        String shardId = shardSlice.getName();
+        Set<String> replicas = entry.getValue();
+        // callDeleteReplica on all replicas
+        for (String replica : replicas) {
+          if (log.isDebugEnabled()) log.debug("Deleting replica {}  for shard {} based on count {}", replica, shardId, count);
+          worker.collect(() -> { deleteCore(shardSlice, collectionName, replica, message, shard, results, onComplete, parallel); return replica; });
+        }
+        results.add("shard_id", shardId);
+        results.add("replicas_deleted", replicas);
       }
-      results.add("shard_id", shardId);
-      results.add("replicas_deleted", replicas);
+
+      worker.addCollect("DeleteReplicas");
     }
 
   }
@@ -255,16 +262,14 @@ public class DeleteReplicaCmd implements Cmd {
       try {
         if (isLive) {
           shardRequestTracker.processResponses(results, shardHandler, false, null);
-
-          //check if the core unload removed the corenode zk entry
-          if (ocmh.waitForCoreNodeGone(collectionName, shard, replicaName, 30000)) return Boolean.TRUE;
         }
 
         // try and ensure core info is removed from cluster state
         ocmh.deleteCoreNode(collectionName, replicaName, replica, core);
-        if (ocmh.waitForCoreNodeGone(collectionName, shard, replicaName, 30000)) return Boolean.TRUE;
+        if (ocmh.waitForCoreNodeGone(collectionName, shard, replicaName, 15000)) return Boolean.TRUE;
         return Boolean.FALSE;
       } catch (Exception e) {
+        SolrZkClient.checkInterrupted(e);
         results.add("failure", "Could not complete delete " + e.getMessage());
         throw e;
       } finally {
@@ -272,20 +277,23 @@ public class DeleteReplicaCmd implements Cmd {
       }
     };
 
-    if (!parallel) {
-      try {
-        if (!callable.call())
-          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
-                  "Could not remove replica : " + collectionName + "/" + shard + "/" + replicaName);
-      } catch (InterruptedException | KeeperException e) {
-        throw e;
-      } catch (Exception ex) {
-        throw new SolrException(SolrException.ErrorCode.UNKNOWN, "Error waiting for corenode gone", ex);
+//    if (!parallel) {
+//      try {
+//        if (!callable.call())
+//          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+//                  "Could not remove replica : " + collectionName + "/" + shard + "/" + replicaName);
+//      } catch (InterruptedException | KeeperException e) {
+//        throw e;
+//      } catch (Exception ex) {
+//        throw new SolrException(SolrException.ErrorCode.UNKNOWN, "Error waiting for corenode gone", ex);
+//      }
+//
+//    } else {
+      try (ParWork worker = new ParWork(this)) {
+        worker.add("AddReplica", callable);
       }
 
-    } else {
-      ocmh.tpe.submit(callable);
-    }
+ //   }
 
   }
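
On the collect(...)/addCollect(...) idiom above: tasks are gathered while iterating the shard-to-replicas mapping and then executed as one labeled batch, keeping the results bookkeeping on the calling thread. A sequential, JDK-only sketch follows; the real ParWork presumably runs the batch in parallel.

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.Callable;

    final class CollectingWorkSketch {
      private final List<Callable<?>> collected = new ArrayList<>();

      void collect(Callable<?> task) {
        collected.add(task);
      }

      // Runs everything gathered so far under one diagnostic label.
      void addCollect(String label) throws Exception {
        for (Callable<?> task : collected) {
          task.call(); // sketch only: serial here, parallel in the real thing
        }
        collected.clear();
      }
    }
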
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
index ff7edfa..2e22084 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/DeleteShardCmd.java
@@ -40,6 +40,7 @@ import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CoreAdminParams;
@@ -132,6 +133,7 @@ public class DeleteShardCmd implements OverseerCollectionMessageHandler.Cmd {
         } catch (KeeperException e) {
           log.warn("Error deleting replica: {}", r, e);
           cleanupLatch.countDown();
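+          // release the latch before rethrowing so waiters are not left blocked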
+          throw e;
         } catch (Exception e) {
           log.warn("Error deleting replica: {}", r, e);
           cleanupLatch.countDown();
@@ -152,6 +154,7 @@ public class DeleteShardCmd implements OverseerCollectionMessageHandler.Cmd {
     } catch (SolrException e) {
       throw e;
     } catch (Exception e) {
+      SolrZkClient.checkInterrupted(e);
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
           "Error executing delete operation for collection: " + collectionName + " shard: " + sliceId, e);
     }
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/MaintainRoutedAliasCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/MaintainRoutedAliasCmd.java
index 396b45b..88045d6 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/MaintainRoutedAliasCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/MaintainRoutedAliasCmd.java
@@ -25,6 +25,7 @@ import java.util.Map;
 
 import org.apache.solr.client.solrj.SolrResponse;
 import org.apache.solr.cloud.Overseer;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.Aliases;
 import org.apache.solr.common.cloud.ClusterState;
@@ -124,16 +125,18 @@ public class MaintainRoutedAliasCmd extends AliasCmd {
       switch (action.actionType) {
         case ENSURE_REMOVED:
           if (exists) {
-            ocmh.tpe.submit(() -> {
-              try {
-                deleteTargetCollection(clusterState, results, aliasName, aliasesManager, action);
-              } catch (Exception e) {
-                log.warn("Deletion of {} by {} {} failed (this might be ok if two clients were"
-                    , action.targetCollection, ra.getAliasName()
-                    , " writing to a routed alias at the same time and both caused a deletion)");
-                log.debug("Exception for last message:", e);
-              }
-            });
+            try (ParWork worker = new ParWork(this)) {
+              worker.add("AddReplica", () -> {
+                try {
+                  deleteTargetCollection(clusterState, results, aliasName, aliasesManager, action);
+                } catch (Exception e) {
+                  log.warn("Deletion of {} by {} {} failed (this might be ok if two clients were"
+                          , action.targetCollection, ra.getAliasName()
+                          , " writing to a routed alias at the same time and both caused a deletion)");
+                  log.debug("Exception for last message:", e);
+                }
+              });
+            }
           }
           break;
         case ENSURE_EXISTS:
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/MigrateCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/MigrateCmd.java
index c41cb7f..462228a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/MigrateCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/MigrateCmd.java
@@ -236,9 +236,9 @@ public class MigrateCmd implements OverseerCollectionMessageHandler.Cmd {
         Overseer.QUEUE_OPERATION, CREATE.toLower(),
         NAME, tempSourceCollectionName,
         NRT_REPLICAS, 1,
-        OverseerCollectionMessageHandler.NUM_SLICES, 1,
+        ZkStateReader.NUM_SHARDS_PROP, 1,
         CollectionAdminParams.COLL_CONF, configName,
-        OverseerCollectionMessageHandler.CREATE_NODE_SET, sourceLeader.getNodeName());
+        ZkStateReader.CREATE_NODE_SET, sourceLeader.getNodeName());
     if (asyncId != null) {
       String internalAsyncId = asyncId + Math.abs(System.nanoTime());
       props.put(ASYNC, internalAsyncId);
@@ -252,7 +252,7 @@ public class MigrateCmd implements OverseerCollectionMessageHandler.Cmd {
     Replica tempSourceLeader = zkStateReader.getLeaderRetry(tempSourceCollectionName, tempSourceSlice.getName(), 120000);
 
     String tempCollectionReplica1 = tempSourceLeader.getCoreName();
-    String coreNodeName = ocmh.waitForCoreNodeName(tempSourceCollectionName,
+    String coreNodeName = ocmh.waitForCoreNodeName(zkStateReader, tempSourceCollectionName,
         sourceLeader.getNodeName(), tempCollectionReplica1);
     // wait for the replicas to be seen as active on temp source leader
     if (log.isInfoEnabled()) {
@@ -320,7 +320,7 @@ public class MigrateCmd implements OverseerCollectionMessageHandler.Cmd {
       syncRequestTracker.processResponses(results, shardHandler, true, "MIGRATE failed to create replica of " +
         "temporary collection in target leader node.");
     }
-    coreNodeName = ocmh.waitForCoreNodeName(tempSourceCollectionName,
+    coreNodeName = ocmh.waitForCoreNodeName(zkStateReader, tempSourceCollectionName,
         targetLeader.getNodeName(), tempCollectionReplica2);
     // wait for the replicas to be seen as active on temp source leader
     if (log.isInfoEnabled()) {
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
index 007fbec..ea0f9da 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/OverseerCollectionMessageHandler.java
@@ -31,9 +31,11 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicReference;
 
 import com.google.common.collect.ImmutableMap;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.http.client.HttpClient;
 import org.apache.solr.client.solrj.SolrResponse;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.cloud.DistribStateManager;
@@ -54,6 +56,7 @@ import org.apache.solr.cloud.OverseerTaskProcessor;
 import org.apache.solr.cloud.Stats;
 import org.apache.solr.cloud.ZkController;
 import org.apache.solr.cloud.overseer.OverseerAction;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrCloseable;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
@@ -118,12 +121,8 @@ import static org.apache.solr.common.util.Utils.makeMap;
  */
 public class OverseerCollectionMessageHandler implements OverseerMessageHandler, SolrCloseable {
 
-  public static final String NUM_SLICES = "numShards";
-
   public static final boolean CREATE_NODE_SET_SHUFFLE_DEFAULT = true;
   public static final String CREATE_NODE_SET_SHUFFLE = CollectionAdminParams.CREATE_NODE_SET_SHUFFLE_PARAM;
-  public static final String CREATE_NODE_SET_EMPTY = "EMPTY";
-  public static final String CREATE_NODE_SET = CollectionAdminParams.CREATE_NODE_SET_PARAM;
 
   public static final String ROUTER = "router";
 
@@ -172,11 +171,8 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
   // This is used for handling mutual exclusion of the tasks.
 
   final private LockTree lockTree = new LockTree();
-  ExecutorService tpe = new ExecutorUtil.MDCAwareThreadPoolExecutor(5, 10, 0L, TimeUnit.MILLISECONDS,
-      new SynchronousQueue<>(),
-      new SolrNamedThreadFactory("OverseerCollectionMessageHandlerThreadFactory"));
 
-  protected static final Random RANDOM;
+  public static final Random RANDOM;
   static {
     // We try to make things reproducible in the context of our tests by initializing the random instance
     // based on the current seed
@@ -223,7 +219,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
         .put(MIGRATESTATEFORMAT, this::migrateStateFormat)
         .put(CREATESHARD, new CreateShardCmd(this))
         .put(MIGRATE, new MigrateCmd(this))
-        .put(CREATE, new CreateCollectionCmd(this))
+        .put(CREATE, new CreateCollectionCmd(this, overseer.getCoreContainer(), cloudManager, zkStateReader))
         .put(MODIFYCOLLECTION, this::modifyCollection)
         .put(ADDREPLICAPROP, this::processReplicaAddPropertyCommand)
         .put(DELETEREPLICAPROP, this::processReplicaDeletePropertyCommand)
@@ -249,7 +245,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
 
   @Override
   @SuppressWarnings("unchecked")
-  public OverseerSolrResponse processMessage(ZkNodeProps message, String operation) {
+  public OverseerSolrResponse processMessage(ZkNodeProps message, String operation) throws InterruptedException {
     MDCLoggingContext.setCollection(message.getStr(COLLECTION));
     MDCLoggingContext.setShard(message.getStr(SHARD_ID_PROP));
     MDCLoggingContext.setReplica(message.getStr(REPLICA_PROP));
@@ -266,6 +262,9 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
         throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown operation:"
             + operation);
       }
+    } catch (InterruptedException e) {
+      ParWork.propegateInterrupt(e);
+      throw e;
     } catch (Exception e) {
       String collName = message.getStr("collection");
       if (collName == null) collName = message.getStr(NAME);
@@ -346,7 +345,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
   @SuppressWarnings("unchecked")
   private void processReplicaAddPropertyCommand(ClusterState clusterState, ZkNodeProps message, @SuppressWarnings({"rawtypes"})NamedList results)
       throws Exception {
-    checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP, PROPERTY_PROP, PROPERTY_VALUE_PROP);
+    checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, ZkStateReader.NUM_SHARDS_PROP, "shards", REPLICA_PROP, PROPERTY_PROP, PROPERTY_VALUE_PROP);
     SolrZkClient zkClient = zkStateReader.getZkClient();
     Map<String, Object> propMap = new HashMap<>();
     propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICAPROP.toLower());
@@ -510,7 +509,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
     // and we force open a searcher so that we have documents to show upon switching states
     UpdateResponse updateResponse = null;
     try {
-      updateResponse = softCommit(coreUrl);
+      updateResponse = softCommit(coreUrl, overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
       processResponse(results, null, coreUrl, updateResponse, slice, Collections.emptySet());
     } catch (Exception e) {
       processResponse(results, e, coreUrl, updateResponse, slice, Collections.emptySet());
@@ -519,11 +518,13 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
   }
 
 
-  static UpdateResponse softCommit(String url) throws SolrServerException, IOException {
+  static UpdateResponse softCommit(String url, HttpClient httpClient) throws SolrServerException, IOException {
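+  // softCommit now reuses the shared HttpClient; timeouts come from system properties with shorter defaults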
 
     try (HttpSolrClient client = new HttpSolrClient.Builder(url)
-        .withConnectionTimeout(30000)
-        .withSocketTimeout(120000)
+        .withConnectionTimeout(Integer.getInteger("solr.connect_timeout.default", 15000))
+        .withSocketTimeout(Integer.getInteger("solr.so_commit_timeout.default", 30000))
+        .withHttpClient(httpClient)
+        .markInternalRequest()
         .build()) {
       UpdateRequest ureq = new UpdateRequest();
       ureq.setParams(new ModifiableSolrParams());
@@ -532,60 +533,60 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
     }
   }
 
-  String waitForCoreNodeName(String collectionName, String msgNodeName, String msgCore) {
-    int retryCount = 320;
-    while (retryCount-- > 0) {
-      final DocCollection docCollection = zkStateReader.getClusterState().getCollectionOrNull(collectionName);
-      if (docCollection != null && docCollection.getSlicesMap() != null) {
-        Map<String,Slice> slicesMap = docCollection.getSlicesMap();
+  static String waitForCoreNodeName(ZkStateReader zkStateReader, String collectionName, String msgNodeName, String msgCore) {
+    AtomicReference<String> errorMessage = new AtomicReference<>();
+    AtomicReference<String> coreNodeName = new AtomicReference<>();
+    try {
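+      // watcher-based wait replaces the old 320-iteration sleep/poll loop; same overall time budget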
+      zkStateReader.waitForState(collectionName, 320, TimeUnit.SECONDS, (n, c) -> {
+        if (c == null)
+          return false;
+        final Map<String,Slice> slicesMap = c.getSlicesMap();
         for (Slice slice : slicesMap.values()) {
           for (Replica replica : slice.getReplicas()) {
-            // TODO: for really large clusters, we could 'index' on this
 
             String nodeName = replica.getStr(ZkStateReader.NODE_NAME_PROP);
             String core = replica.getStr(ZkStateReader.CORE_NAME_PROP);
 
-            if (nodeName.equals(msgNodeName) && core.equals(msgCore)) {
-              return replica.getName();
+            if (msgNodeName.equals(nodeName) && msgCore.equals(core)) {
+              coreNodeName.set(replica.getName());
+              return true;
             }
           }
         }
-      }
-      try {
-        Thread.sleep(1000);
-      } catch (InterruptedException e) {
-        Thread.currentThread().interrupt();
-      }
+        return false;
+      });
+    } catch (TimeoutException e) {
+      String error = errorMessage.get();
+      if (error == null)
+        error = "Timeout waiting for collection state.";
+      throw new ZkController.NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Interrupted");
     }
-    throw new SolrException(ErrorCode.SERVER_ERROR, "Could not find coreNodeName");
+
+    return coreNodeName.get();
   }
 
-  ClusterState waitForNewShard(String collectionName, String sliceName) throws KeeperException, InterruptedException {
+  void waitForNewShard(String collectionName, String sliceName) {
     log.debug("Waiting for slice {} of collection {} to be available", sliceName, collectionName);
-    RTimer timer = new RTimer();
-    int retryCount = 320;
-    while (retryCount-- > 0) {
-      ClusterState clusterState = zkStateReader.getClusterState();
-      DocCollection collection = clusterState.getCollection(collectionName);
-
-      if (collection == null) {
-        throw new SolrException(ErrorCode.SERVER_ERROR,
-            "Unable to find collection: " + collectionName + " in clusterstate");
-      }
-      Slice slice = collection.getSlice(sliceName);
-      if (slice != null) {
-        if (log.isDebugEnabled()) {
-          log.debug("Waited for {}ms for slice {} of collection {} to be available",
-              timer.getTime(), sliceName, collectionName);
+    try {
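+      // wait up to 15s for the new slice to show up in the collection state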
+      zkStateReader.waitForState(collectionName, 15, TimeUnit.SECONDS, (n, c) -> {
+        if (c == null)
+          return false;
+        Slice slice = c.getSlice(sliceName);
+        if (slice != null) {
+          return true;
         }
-        return clusterState;
-      }
-      Thread.sleep(1000);
+        return false;
+      });
+    } catch (TimeoutException e) {
+      String error = "Timeout waiting for new shard.";
+      throw new ZkController.NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Interrupted");
     }
-    throw new SolrException(ErrorCode.SERVER_ERROR,
-        "Could not find new slice " + sliceName + " in collection " + collectionName
-            + " even after waiting for " + timer.getTime() + "ms"
-    );
   }
 
   DocRouter.Range intersect(DocRouter.Range a, DocRouter.Range b) {
@@ -639,34 +640,31 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
 
     overseer.offerStateUpdate(Utils.toJSON(message));
 
-    TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, timeSource);
-    boolean areChangesVisible = true;
-    while (!timeout.hasTimedOut()) {
-      DocCollection collection = cloudManager.getClusterStateProvider().getClusterState().getCollection(collectionName);
-      areChangesVisible = true;
-      for (Map.Entry<String,Object> updateEntry : message.getProperties().entrySet()) {
-        String updateKey = updateEntry.getKey();
-
-        if (!updateKey.equals(ZkStateReader.COLLECTION_PROP)
-            && !updateKey.equals(Overseer.QUEUE_OPERATION)
-            && updateEntry.getValue() != null // handled below in a separate conditional
-            && !updateEntry.getValue().equals(collection.get(updateKey))) {
-          areChangesVisible = false;
-          break;
-        }
+    try {
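+      // block until every property in the message is reflected in the collection state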
+      zkStateReader.waitForState(collectionName, 30, TimeUnit.SECONDS, (n, c) -> {
+        if (c == null) return false;
+
+        for (Map.Entry<String,Object> updateEntry : message.getProperties().entrySet()) {
+          String updateKey = updateEntry.getKey();
+
+          if (!updateKey.equals(ZkStateReader.COLLECTION_PROP)
+                  && !updateKey.equals(Overseer.QUEUE_OPERATION)
+                  && updateEntry.getValue() != null // handled below in a separate conditional
+                  && !updateEntry.getValue().equals(c.get(updateKey))) {
+            return false;
+          }
 
-        if (updateEntry.getValue() == null && collection.containsKey(updateKey)) {
-          areChangesVisible = false;
-          break;
+          if (updateEntry.getValue() == null && c.containsKey(updateKey)) {
+            return false;
+          }
         }
-      }
-      if (areChangesVisible) break;
-      timeout.sleep(100);
+        return true;
+      });
+    } catch (TimeoutException | InterruptedException e) {
+      log.error("modifyCollection(ClusterState=" + clusterState + ", ZkNodeProps=" + message + ", NamedList=" + results + ")", e);
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Could not modify collection " + message, e);
     }
 
-    if (!areChangesVisible)
-      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Could not modify collection " + message);
-
     // if switching to/from read-only mode reload the collection
     if (message.keySet().contains(ZkStateReader.READ_ONLY)) {
       reloadCollection(null, new ZkNodeProps(NAME, collectionName), results);
@@ -681,35 +679,52 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
     commandMap.get(DELETE).call(zkStateReader.getClusterState(), new ZkNodeProps(props), results);
   }
 
-  Map<String, Replica> waitToSeeReplicasInState(String collectionName, Collection<String> coreNames) throws InterruptedException {
-    assert coreNames.size() > 0;
-    Map<String, Replica> result = new HashMap<>();
-    TimeOut timeout = new TimeOut(Integer.getInteger("solr.waitToSeeReplicasInStateTimeoutSeconds", 120), TimeUnit.SECONDS, timeSource); // could be a big cluster
-    while (true) {
-      DocCollection coll = zkStateReader.getClusterState().getCollection(collectionName);
-      for (String coreName : coreNames) {
-        if (result.containsKey(coreName)) continue;
-        for (Slice slice : coll.getSlices()) {
-          for (Replica replica : slice.getReplicas()) {
-            if (coreName.equals(replica.getStr(ZkStateReader.CORE_NAME_PROP))) {
-              result.put(coreName, replica);
-              break;
+  Map<String, Replica> waitToSeeReplicasInState(String collectionName, Collection<String> coreUrls, boolean requireActive) {
+    log.info("wait to see {} in clusterstate {}", coreUrls, zkStateReader.getClusterState().getCollection(collectionName));
+    assert coreUrls.size() > 0;
+
+    AtomicReference<Map<String, Replica>> result = new AtomicReference<>();
+    AtomicReference<String> errorMessage = new AtomicReference<>();
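+    // the predicate below fills 'result'; 'errorMessage' keeps the last mismatch for timeout reporting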
+    try {
+      zkStateReader.waitForState(collectionName, 10, TimeUnit.SECONDS, (n, c) -> { // TODO config timeout up for prod, down for non nightly tests
+        if (c == null)
+          return false;
+        Map<String, Replica> r = new HashMap<>();
+        for (String coreUrl : coreUrls) {
+          if (r.containsKey(coreUrl)) continue;
+          Collection<Slice> slices = c.getSlices();
+          if (slices != null) {
+            for (Slice slice : slices) {
+              for (Replica replica : slice.getReplicas()) {
+                if (coreUrl.equals(replica.getCoreUrl()) && (!requireActive || replica.getState().equals(Replica.State.ACTIVE))
+                        && zkStateReader.getClusterState().liveNodesContain(replica.getNodeName())) {
+                  r.put(coreUrl, replica);
+                  break;
+                }
+              }
             }
           }
         }
-      }
 
-      if (result.size() == coreNames.size()) {
-        return result;
-      } else {
-        log.debug("Expecting {} cores but found {}", coreNames, result);
-      }
-      if (timeout.hasTimedOut()) {
-        throw new SolrException(ErrorCode.SERVER_ERROR, "Timed out waiting to see all replicas: " + coreNames + " in cluster state. Last state: " + coll);
-      }
+        if (r.size() == coreUrls.size()) {
+          result.set(r);
+          return true;
+        } else {
+          errorMessage.set("Timed out waiting to see all replicas: " + coreUrls + " in cluster state. Last state: " + c);
+          return false;
+        }
 
-      Thread.sleep(100);
+      });
+    } catch (TimeoutException e) {
+      String error = errorMessage.get();
+      if (error == null)
+        error = "Timeout waiting for collection state.";
+      throw new SolrException(ErrorCode.SERVER_ERROR, error);
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Interrupted");
     }
+    return result.get();
   }
 
   List<ZkNodeProps> addReplica(ClusterState clusterState, ZkNodeProps message, @SuppressWarnings({"rawtypes"})NamedList results, Runnable onComplete)
@@ -931,11 +946,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
   @Override
   public void close() throws IOException {
     this.isClosed = true;
-    if (tpe != null) {
-      if (!tpe.isShutdown()) {
-        ExecutorUtil.shutdownAndAwaitTermination(tpe);
-      }
-    }
+    cloudManager.close();
   }
 
   @Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/ReindexCollectionCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/ReindexCollectionCmd.java
index c0fc491..2f57381 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/ReindexCollectionCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/ReindexCollectionCmd.java
@@ -634,7 +634,7 @@ public class ReindexCollectionCmd implements OverseerCollectionMessageHandler.Cm
     HttpClient client = ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient();
     try (HttpSolrClient solrClient = new HttpSolrClient.Builder()
         .withHttpClient(client)
-        .withBaseSolrUrl(daemonUrl).build()) {
+        .withBaseSolrUrl(daemonUrl).markInternalRequest().build()) {
       ModifiableSolrParams q = new ModifiableSolrParams();
       q.set(CommonParams.QT, "/stream");
       q.set("action", "list");
@@ -687,6 +687,7 @@ public class ReindexCollectionCmd implements OverseerCollectionMessageHandler.Cm
     HttpClient client = ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient();
     try (HttpSolrClient solrClient = new HttpSolrClient.Builder()
         .withHttpClient(client)
+        .markInternalRequest()
         .withBaseSolrUrl(daemonUrl).build()) {
       ModifiableSolrParams q = new ModifiableSolrParams();
       q.set(CommonParams.QT, "/stream");
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java
index f314ebb..aa4562a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java
@@ -179,7 +179,7 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
       }
 
       propMap.put(NAME, restoreCollectionName);
-      propMap.put(OverseerCollectionMessageHandler.CREATE_NODE_SET, OverseerCollectionMessageHandler.CREATE_NODE_SET_EMPTY); //no cores
+      propMap.put(ZkStateReader.CREATE_NODE_SET, ZkStateReader.CREATE_NODE_SET_EMPTY); //no cores
       propMap.put(CollectionAdminParams.COLL_CONF, restoreConfigName);
 
       // router.*
@@ -192,7 +192,7 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
       if (backupCollectionState.getRouter() instanceof ImplicitDocRouter) {
         propMap.put(OverseerCollectionMessageHandler.SHARDS_PROP, StrUtils.join(sliceNames, ','));
       } else {
-        propMap.put(OverseerCollectionMessageHandler.NUM_SLICES, sliceNames.size());
+        propMap.put(ZkStateReader.NUM_SHARDS_PROP, sliceNames.size());
         // ClusterStateMutator.createCollection detects that "slices" is in fact a slice structure instead of a
         //   list of names, and if so uses this instead of building it.  We clear the replica list.
         Collection<Slice> backupSlices = backupCollectionState.getActiveSlices();
diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
index 2d04947..8276bab 100644
--- a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
+++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
@@ -311,7 +311,10 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
         ocmh.overseer.offerStateUpdate(Utils.toJSON(new ZkNodeProps(propMap)));
 
         // wait until we are able to see the new shard in cluster state and refresh the local view of the cluster state
-        clusterState = ocmh.waitForNewShard(collectionName, subSlice);
+        ocmh.waitForNewShard(collectionName, subSlice);
+
+        // refresh cluster state
+        clusterState = zkStateReader.getClusterState();
 
         log.debug("Adding first replica {} as part of slice {} of collection {} on {}"
             , subShardName, subSlice, collectionName, nodeName);
@@ -350,7 +353,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
         for (String subShardName : subShardNames) {
           // wait for parent leader to acknowledge the sub-shard core
           log.debug("Asking parent leader to wait for: {} to be alive on: {}", subShardName, nodeName);
-          String coreNodeName = ocmh.waitForCoreNodeName(collectionName, nodeName, subShardName);
+          String coreNodeName = OverseerCollectionMessageHandler.waitForCoreNodeName(zkStateReader, collectionName, nodeName, subShardName);
           CoreAdminRequest.WaitForState cmd = new CoreAdminRequest.WaitForState();
           cmd.setCoreName(subShardName);
           cmd.setNodeName(nodeName);
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/AutoScaling.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/AutoScaling.java
index 1a191ee..65db8c6 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/AutoScaling.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/AutoScaling.java
@@ -154,9 +154,6 @@ public class AutoScaling {
 
     @Override
     public synchronized Trigger create(TriggerEventType type, String name, Map<String, Object> props) throws TriggerValidationException {
-      if (isClosed) {
-        throw new AlreadyClosedException("TriggerFactory has already been closed, cannot create new triggers");
-      }
       if (type == null) {
         throw new IllegalArgumentException("Trigger type must not be null");
       }
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/AutoScalingHandler.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/AutoScalingHandler.java
index 23ec075..48cfb6d 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/AutoScalingHandler.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/AutoScalingHandler.java
@@ -590,6 +590,7 @@ public class AutoScalingHandler extends RequestHandlerBase implements Permission
     try {
       t = triggerFactory.create(trigger.event, trigger.name, trigger.properties);
     } catch (Exception e) {
+      log.error("", e);
       op.addError("Error validating trigger config " + trigger.name + ": " + e.toString());
       return currentConfig;
     } finally {
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
index 33bf6b0..e81172d 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
@@ -40,6 +40,7 @@ import java.util.function.Predicate;
 import java.util.stream.Collectors;
 
 import static org.apache.solr.cloud.autoscaling.TriggerEvent.NODE_NAMES;
+import static org.apache.solr.common.params.AutoScalingParams.PREFERRED_OP;
 
 /**
  * This class is responsible for using the configured policy and preferences
@@ -56,7 +57,10 @@ public class ComputePlanAction extends TriggerActionBase {
 
   public ComputePlanAction() {
     super();
-    TriggerUtils.validProperties(validProperties, "collections");
+
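+    // copy the inherited set before adding to it so the shared default is never mutated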
+    Set<String> vProperties = new HashSet<>(validProperties);
+    TriggerUtils.validProperties(vProperties, "collections");
+    this.validProperties = vProperties;
   }
 
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ExecutePlanAction.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ExecutePlanAction.java
index 1dfc3b1..be9b176 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ExecutePlanAction.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ExecutePlanAction.java
@@ -21,9 +21,11 @@ import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
@@ -63,7 +65,9 @@ public class ExecutePlanAction extends TriggerActionBase {
   boolean taskTimeoutFail;
 
   public ExecutePlanAction() {
-    TriggerUtils.validProperties(validProperties, TASK_TIMEOUT_SECONDS, TASK_TIMEOUT_FAIL);
+    Set<String> vProperties = new HashSet<>(validProperties);
+    TriggerUtils.validProperties(vProperties, TASK_TIMEOUT_SECONDS, TASK_TIMEOUT_FAIL);
+    this.validProperties = vProperties;
   }
 
   @Override
@@ -224,7 +228,7 @@ public class ExecutePlanAction extends TriggerActionBase {
       if (i > 0 && i % 5 == 0) {
         log.trace("Task with requestId={} still not complete after {}s. Last state={}", requestId, i * 5, state);
       }
-      cloudManager.getTimeSource().sleep(5000);
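+      // poll every 250ms instead of 5s to cut task-status latency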
+      cloudManager.getTimeSource().sleep(250);
     }
     log.debug("Task with requestId={} did not complete within {} seconds. Last state={}", timeoutSeconds, requestId, state);
     return statusResponse;
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/HttpTriggerListener.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/HttpTriggerListener.java
index 139efe0..9947017 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/HttpTriggerListener.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/HttpTriggerListener.java
@@ -18,6 +18,7 @@ package org.apache.solr.cloud.autoscaling;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Properties;
@@ -62,7 +63,7 @@ public class HttpTriggerListener extends TriggerListenerBase {
   private String urlTemplate;
   private String payloadTemplate;
   private String contentType;
-  private Map<String, String> headerTemplates = new HashMap<>();
+  private volatile Map<String, String> headerTemplates = Collections.unmodifiableMap(new HashMap<>());
   private int timeout = HttpClientUtil.DEFAULT_CONNECT_TIMEOUT;
   private boolean followRedirects;
 
@@ -79,11 +80,13 @@ public class HttpTriggerListener extends TriggerListenerBase {
     urlTemplate = (String)config.properties.get("url");
     payloadTemplate = (String)config.properties.get("payload");
     contentType = (String)config.properties.get("contentType");
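+    // build the header map locally, then publish it through the volatile field in one assignment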
+    Map<String, String> hTemplates = new HashMap<>();
     config.properties.forEach((k, v) -> {
       if (k.startsWith("header.")) {
-        headerTemplates.put(k.substring(7), String.valueOf(v));
+        hTemplates.put(k.substring(7), String.valueOf(v));
       }
     });
+    headerTemplates = hTemplates;
     timeout = PropertiesUtil.toInteger(String.valueOf(config.properties.get("timeout")), HttpClientUtil.DEFAULT_CONNECT_TIMEOUT);
     followRedirects = PropertiesUtil.toBoolean(String.valueOf(config.properties.get("followRedirects")));
   }
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/InactiveMarkersPlanAction.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/InactiveMarkersPlanAction.java
index c863703..73e8b90 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/InactiveMarkersPlanAction.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/InactiveMarkersPlanAction.java
@@ -18,6 +18,7 @@ package org.apache.solr.cloud.autoscaling;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -31,6 +32,7 @@ import org.apache.solr.client.solrj.cloud.SolrCloudManager;
 import org.apache.solr.client.solrj.cloud.autoscaling.BadVersionException;
 import org.apache.solr.client.solrj.cloud.autoscaling.NotEmptyException;
 import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.params.AutoScalingParams;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.core.SolrResourceLoader;
 import org.apache.zookeeper.KeeperException;
@@ -56,7 +58,9 @@ public class InactiveMarkersPlanAction extends TriggerActionBase {
 
   public InactiveMarkersPlanAction() {
     super();
-    TriggerUtils.validProperties(validProperties, TTL_PROP);
+    Set<String> vProperties = new HashSet<>(validProperties);
+    TriggerUtils.validProperties(vProperties, TTL_PROP);
+    this.validProperties = vProperties;
   }
 
   @Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/InactiveShardPlanAction.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/InactiveShardPlanAction.java
index d3de649..3289074 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/InactiveShardPlanAction.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/InactiveShardPlanAction.java
@@ -18,10 +18,12 @@ package org.apache.solr.cloud.autoscaling;
 
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.NoSuchElementException;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 
@@ -31,6 +33,7 @@ import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.params.AutoScalingParams;
 import org.apache.solr.common.util.Utils;
 import org.apache.solr.core.SolrResourceLoader;
 import org.slf4j.Logger;
@@ -50,11 +53,13 @@ public class InactiveShardPlanAction extends TriggerActionBase {
 
   public static final int DEFAULT_TTL_SECONDS = 3600 * 24 * 2;
 
-  private int cleanupTTL;
+  private volatile int cleanupTTL;
 
   public InactiveShardPlanAction() {
     super();
-    TriggerUtils.validProperties(validProperties, TTL_PROP);
+    Set<String> vProperties = new HashSet<>(validProperties);
+    TriggerUtils.validProperties(vProperties, TTL_PROP);
+    this.validProperties = vProperties;
   }
 
   @Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
index da40366..1143b33 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/IndexSizeTrigger.java
@@ -99,11 +99,14 @@ public class IndexSizeTrigger extends TriggerBase {
 
   public IndexSizeTrigger(String name) {
     super(TriggerEventType.INDEXSIZE, name);
-    TriggerUtils.validProperties(validProperties,
-        ABOVE_BYTES_PROP, ABOVE_DOCS_PROP, ABOVE_OP_PROP,
-        BELOW_BYTES_PROP, BELOW_DOCS_PROP, BELOW_OP_PROP,
-        COLLECTIONS_PROP, MAX_OPS_PROP,
-        SPLIT_METHOD_PROP, SPLIT_FUZZ_PROP, SPLIT_BY_PREFIX);
+    Set<String> vProperties = new HashSet<>(validProperties);
+    TriggerUtils.validProperties(vProperties,
+            ABOVE_BYTES_PROP, ABOVE_DOCS_PROP, ABOVE_OP_PROP,
+            BELOW_BYTES_PROP, BELOW_DOCS_PROP, BELOW_OP_PROP,
+            COLLECTIONS_PROP, MAX_OPS_PROP,
+            SPLIT_METHOD_PROP, SPLIT_FUZZ_PROP, SPLIT_BY_PREFIX);
+    this.validProperties = vProperties;
+
   }
 
   @Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/MetricTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/MetricTrigger.java
index 573ac77..1a361bb 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/MetricTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/MetricTrigger.java
@@ -61,11 +61,17 @@ public class MetricTrigger extends TriggerBase {
 
   public MetricTrigger(String name) {
     super(TriggerEventType.METRIC, name);
-    TriggerUtils.requiredProperties(requiredProperties, validProperties, METRIC);
-    TriggerUtils.validProperties(validProperties, ABOVE, BELOW, PREFERRED_OP,
-        AutoScalingParams.COLLECTION,
-        AutoScalingParams.SHARD,
-        AutoScalingParams.NODE);
+
+    Set<String> vProperties = new HashSet<>(validProperties);
+    TriggerUtils.validProperties(vProperties, ABOVE, BELOW, PREFERRED_OP,
+            AutoScalingParams.COLLECTION,
+            AutoScalingParams.SHARD,
+            AutoScalingParams.NODE);
+    this.validProperties = vProperties;
+
+    Set<String> rProperties = new HashSet<>(requiredProperties);
+    TriggerUtils.requiredProperties(rProperties, validProperties, METRIC);
+    this.requiredProperties = rProperties;
   }
 
   @Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/NodeAddedTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/NodeAddedTrigger.java
index 42188e4..ec550e3 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/NodeAddedTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/NodeAddedTrigger.java
@@ -29,6 +29,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
@@ -55,16 +56,18 @@ import static org.apache.solr.common.params.AutoScalingParams.REPLICA_TYPE;
 public class NodeAddedTrigger extends TriggerBase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
-  private Set<String> lastLiveNodes = new HashSet<>();
+  private Set<String> lastLiveNodes = ConcurrentHashMap.newKeySet();
 
-  private Map<String, Long> nodeNameVsTimeAdded = new HashMap<>();
+  private Map<String, Long> nodeNameVsTimeAdded = new ConcurrentHashMap<>();
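+  // concurrent collections, since trigger state may be read outside the trigger thread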
 
   private String preferredOp;
   private Replica.Type replicaType = Replica.Type.NRT;
 
   public NodeAddedTrigger(String name) {
     super(TriggerEventType.NODEADDED, name);
-    TriggerUtils.validProperties(validProperties, PREFERRED_OP, REPLICA_TYPE);
+    Set<String> vProperties = new HashSet<>(validProperties);
+    TriggerUtils.validProperties(vProperties, PREFERRED_OP, REPLICA_TYPE);
+    this.validProperties = vProperties;
   }
 
   @Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/NodeLostTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/NodeLostTrigger.java
index b1c5818..6a53317 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/NodeLostTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/NodeLostTrigger.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -29,6 +30,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
@@ -47,6 +49,7 @@ import static org.apache.solr.cloud.autoscaling.OverseerTriggerThread.MARKER_ACT
 import static org.apache.solr.cloud.autoscaling.OverseerTriggerThread.MARKER_INACTIVE;
 import static org.apache.solr.cloud.autoscaling.OverseerTriggerThread.MARKER_STATE;
 import static org.apache.solr.common.params.AutoScalingParams.PREFERRED_OP;
+import static org.apache.solr.common.params.AutoScalingParams.REPLICA_TYPE;
 
 /**
  * Trigger for the {@link TriggerEventType#NODELOST} event
@@ -54,15 +57,17 @@ import static org.apache.solr.common.params.AutoScalingParams.PREFERRED_OP;
 public class NodeLostTrigger extends TriggerBase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
-  private Set<String> lastLiveNodes = new HashSet<>();
+  private Set<String> lastLiveNodes = ConcurrentHashMap.newKeySet();
 
-  private Map<String, Long> nodeNameVsTimeRemoved = new HashMap<>();
+  private Map<String, Long> nodeNameVsTimeRemoved = new ConcurrentHashMap<>();
 
   private String preferredOp;
 
   public NodeLostTrigger(String name) {
     super(TriggerEventType.NODELOST, name);
-    TriggerUtils.validProperties(validProperties, PREFERRED_OP);
+    Set<String> vProperties = new HashSet<>(validProperties);
+    TriggerUtils.validProperties(vProperties, PREFERRED_OP);
+    this.validProperties = vProperties;
   }
 
   @Override
@@ -232,6 +237,7 @@ public class NodeLostTrigger extends TriggerBase {
     public NodeLostEvent(TriggerEventType eventType, String source, List<Long> times, List<String> nodeNames, String preferredOp) {
       // use the oldest time as the time of the event
       super(eventType, source, times.get(0), null);
+
       properties.put(NODE_NAMES, nodeNames);
       properties.put(EVENT_TIMES, times);
       properties.put(PREFERRED_OP, preferredOp);
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/OverseerTriggerThread.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/OverseerTriggerThread.java
index 00dc3c9..6a301c6 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/OverseerTriggerThread.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/OverseerTriggerThread.java
@@ -26,6 +26,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.Condition;
 import java.util.concurrent.locks.ReentrantLock;
 
@@ -36,6 +38,7 @@ import org.apache.solr.client.solrj.cloud.SolrCloudManager;
 import org.apache.solr.client.solrj.cloud.autoscaling.Policy;
 import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
 import org.apache.solr.common.AlreadyClosedException;
+import org.apache.solr.common.ParWork;
 import org.apache.solr.common.SolrCloseable;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.IOUtils;
@@ -76,9 +79,9 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
   /*
   Following variables are only accessed or modified when updateLock is held
    */
-  private int znodeVersion = 0;
+  private volatile int znodeVersion = 0;
 
-  private Map<String, AutoScaling.Trigger> activeTriggers = new HashMap<>();
+  private Map<String, AutoScaling.Trigger> activeTriggers = new ConcurrentHashMap<>();
 
   private volatile int processedZnodeVersion = -1;
 
@@ -94,16 +97,22 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
 
   @Override
   public void close() throws IOException {
-    updateLock.lock();
+    isClosed = true;
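+    // mark closed and release resources before taking the lock, so close() cannot block behind run()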
+    IOUtils.closeQuietly(triggerFactory);
+    IOUtils.closeQuietly(scheduledTriggers);
+
+    activeTriggers.clear();
+
+    try {
+      updateLock.lockInterruptibly();
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+    }
     try {
-      isClosed = true;
-      activeTriggers.clear();
       updated.signalAll();
     } finally {
       updateLock.unlock();
     }
-    IOUtils.closeQuietly(triggerFactory);
-    IOUtils.closeQuietly(scheduledTriggers);
     log.debug("OverseerTriggerThread has been closed explicitly");
   }
 
@@ -139,10 +148,6 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
     // we also automatically add a scheduled maintenance trigger
     while (!isClosed)  {
       try {
-        if (Thread.currentThread().isInterrupted()) {
-          log.warn("Interrupted");
-          break;
-        }
         AutoScalingConfig autoScalingConfig = cloudManager.getDistribStateManager().getAutoScalingConfig();
         AutoScalingConfig updatedConfig = withDefaultPolicy(autoScalingConfig);
         updatedConfig = withAutoAddReplicasTrigger(updatedConfig);
@@ -152,7 +157,8 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
         cloudManager.getDistribStateManager().setData(SOLR_AUTOSCALING_CONF_PATH, Utils.toJSON(updatedConfig), updatedConfig.getZkVersion());
         break;
       } catch (AlreadyClosedException e) {
-        break;
+        log.info("Already closed");
+        return;
       } catch (BadVersionException bve) {
         // somebody else has changed the configuration so we must retry
       } catch (InterruptedException e) {
@@ -177,12 +183,13 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
 
     try {
       refreshAutoScalingConf(new AutoScalingWatcher());
-    } catch (ConnectException e) {
-      log.warn("ZooKeeper watch triggered for autoscaling conf, but Solr cannot talk to ZK: [{}]", e.getMessage());
-    } catch (InterruptedException e) {
+    } catch (IOException e) {
+      log.error("IO error: [{}]", e);
+    } catch (InterruptedException | AlreadyClosedException e) {
       // Restore the interrupted status
       Thread.currentThread().interrupt();
-      log.warn("Interrupted", e);
+      log.info("Interrupted", e);
+      return;
     } catch (Exception e)  {
       log.error("Unexpected exception", e);
     }
@@ -203,7 +210,7 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
           log.debug("Current znodeVersion {}, lastZnodeVersion {}", znodeVersion, lastZnodeVersion);
           
           if (znodeVersion == lastZnodeVersion) {
-            updated.await();
+            updated.await(10, TimeUnit.SECONDS);
             
             // are we closed?
             if (isClosed) {
@@ -220,11 +227,11 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
         } finally {
           updateLock.unlock();
         }
-      } catch (InterruptedException e) {
+      } catch (InterruptedException | AlreadyClosedException e) {
         // Restore the interrupted status
         Thread.currentThread().interrupt();
-        log.warn("Interrupted", e);
-        break;
+        log.info("Interrupted", e);
+        return;
       }
      
       // update the current config
@@ -245,27 +252,40 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
           try {
             scheduledTriggers.add(entry.getValue());
           } catch (AlreadyClosedException e) {
-
+            log.info("already closed");
+            return;
           } catch (Exception e) {
-            log.warn("Exception initializing trigger {}, configuration ignored", entry.getKey(), e);
+            ParWork.propegateInterrupt(e);
+            if (e instanceof KeeperException.SessionExpiredException || e instanceof InterruptedException) {
+              log.error("", e);
+              return;
+            }
+            log.error("Exception initializing trigger {}, configuration ignored", entry.getKey(), e);
           }
         }
       } catch (AlreadyClosedException e) {
-        // this _should_ mean that we're closing, complain loudly if that's not the case
-        if (isClosed) {
-          return;
-        } else {
-          throw new IllegalStateException("Caught AlreadyClosedException from ScheduledTriggers, but we're not closed yet!", e);
-        }
+        log.info("already closed");
+        return;
       }
       log.debug("-- deactivating old nodeLost / nodeAdded markers");
-      deactivateMarkers(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH);
-      deactivateMarkers(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH);
+      try {
+        deactivateMarkers(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH);
+        deactivateMarkers(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH);
+      } catch (InterruptedException | AlreadyClosedException e) {
+        ParWork.propegateInterrupt(e);
+        return;
+      } catch (KeeperException e) {
+        log.error("", e);
+        return;
+      } catch (Exception e) {
+        log.error("Exception deactivating markers", e);
+      }
+
       processedZnodeVersion = znodeVersion;
     }
   }
 
-  private void deactivateMarkers(String path) {
+  private void deactivateMarkers(String path) throws InterruptedException, IOException, KeeperException, BadVersionException {
     DistribStateManager stateManager = cloudManager.getDistribStateManager();
     try {
       List<String> markers = stateManager.listData(path);
@@ -281,8 +301,6 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
       }
     } catch (NoSuchElementException e) {
       // ignore
-    } catch (Exception e) {
-      log.warn("Error deactivating old markers", e);
     }
   }
 
@@ -296,9 +314,9 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
 
       try {
         refreshAutoScalingConf(this);
-      } catch (ConnectException e) {
-        log.warn("ZooKeeper watch triggered for autoscaling conf, but we cannot talk to ZK: [{}]", e.getMessage());
-      } catch (InterruptedException e) {
+      } catch (IOException e) {
+        log.warn("IO Error: [{}]", e);
+      } catch (InterruptedException | AlreadyClosedException e) {
         // Restore the interrupted status
         Thread.currentThread().interrupt();
         log.warn("Interrupted", e);
@@ -310,7 +328,7 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
   }
 
   private void refreshAutoScalingConf(Watcher watcher) throws InterruptedException, IOException {
-    updateLock.lock();
+    updateLock.lockInterruptibly();
     try {
       if (isClosed) {
         return;
@@ -390,7 +408,7 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
       return Collections.emptyMap();
     }
 
-    Map<String, AutoScaling.Trigger> triggerMap = new HashMap<>(triggers.size());
+    Map<String, AutoScaling.Trigger> triggerMap = new ConcurrentHashMap<>(triggers.size());
 
     for (Map.Entry<String, AutoScalingConfig.TriggerConfig> entry : triggers.entrySet()) {
       AutoScalingConfig.TriggerConfig cfg = entry.getValue();
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTrigger.java
index 98a367c..63498d0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTrigger.java
@@ -25,8 +25,10 @@ import java.time.format.DateTimeFormatterBuilder;
 import java.time.temporal.ChronoField;
 import java.util.Collections;
 import java.util.Date;
+import java.util.HashSet;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 import java.util.TimeZone;
 import java.util.concurrent.TimeUnit;
 
@@ -66,8 +68,14 @@ public class ScheduledTrigger extends TriggerBase {
 
   public ScheduledTrigger(String name) {
     super(TriggerEventType.SCHEDULED, name);
-    TriggerUtils.requiredProperties(requiredProperties, validProperties, "startTime", "every");
-    TriggerUtils.validProperties(validProperties, "timeZone", "graceDuration", AutoScalingParams.PREFERRED_OP);
+    Set<String> vProperties = new HashSet<>(validProperties);
+
+    Set<String> rProperties = new HashSet<>(requiredProperties);
+    TriggerUtils.requiredProperties(rProperties, vProperties, "startTime", "every");
+    this.requiredProperties = rProperties;
+
+    TriggerUtils.validProperties(vProperties, "timeZone", "graceDuration", AutoScalingParams.PREFERRED_OP);
+    this.validProperties = vProperties;
   }
 
   @Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
index e080eec..df71fa3 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ScheduledTriggers.java
@@ -75,9 +75,9 @@ import static org.apache.solr.common.util.ExecutorUtil.awaitTermination;
 public class ScheduledTriggers implements Closeable {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   public static final int DEFAULT_SCHEDULED_TRIGGER_DELAY_SECONDS = 1;
-  public static final int DEFAULT_ACTION_THROTTLE_PERIOD_SECONDS = 5;
-  public static final int DEFAULT_COOLDOWN_PERIOD_SECONDS = 5;
-  public static final int DEFAULT_TRIGGER_CORE_POOL_SIZE = 4;
+  public static int DEFAULT_ACTION_THROTTLE_PERIOD_SECONDS = 55;
+  public static int DEFAULT_COOLDOWN_PERIOD_SECONDS = 5;
+  public static int DEFAULT_TRIGGER_CORE_POOL_SIZE = 4;
 
   static final Map<String, Object> DEFAULT_PROPERTIES = new HashMap<>();
 
@@ -134,7 +134,7 @@ public class ScheduledTriggers implements Closeable {
 
   private final TriggerListeners listeners;
 
-  private final List<TriggerListener> additionalListeners = new ArrayList<>();
+  private final List<TriggerListener> additionalListeners = Collections.synchronizedList(new ArrayList<>());
 
   private AutoScalingConfig autoScalingConfig;
 
@@ -214,16 +214,10 @@ public class ScheduledTriggers implements Closeable {
    * @throws AlreadyClosedException if this class has already been closed
    */
   public synchronized void add(AutoScaling.Trigger newTrigger) throws Exception {
-    if (isClosed) {
-      throw new AlreadyClosedException("ScheduledTriggers has been closed and cannot be used anymore");
-    }
     TriggerWrapper st;
     try {
       st = new TriggerWrapper(newTrigger, cloudManager, queueStats);
     } catch (Exception e) {
-      if (isClosed || e instanceof AlreadyClosedException) {
-        throw new AlreadyClosedException("ScheduledTriggers has been closed and cannot be used anymore");
-      }
       if (cloudManager.isClosed()) {
         log.error("Failed to add trigger {} - closing or disconnected from data provider", newTrigger.getName(), e);
       } else {
@@ -465,9 +459,6 @@ public class ScheduledTriggers implements Closeable {
       Thread.currentThread().interrupt();
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Thread interrupted", e);
     } catch (Exception e) {
-      if (cloudManager.isClosed())  {
-        throw new AlreadyClosedException("The Solr instance has been shutdown");
-      }
       // we catch but don't rethrow because a failure to wait for pending tasks
       // should not keep the actions from executing
       log.error("Unexpected exception while waiting for pending tasks to finish", e);
@@ -596,25 +587,16 @@ public class ScheduledTriggers implements Closeable {
     }
 
     public boolean enqueue(TriggerEvent event) {
-      if (isClosed) {
-        throw new AlreadyClosedException("ScheduledTrigger " + trigger.getName() + " has been closed.");
-      }
       return queue.offerEvent(event);
     }
 
     public TriggerEvent dequeue() {
-      if (isClosed) {
-        throw new AlreadyClosedException("ScheduledTrigger " + trigger.getName() + " has been closed.");
-      }
       TriggerEvent event = queue.pollEvent();
       return event;
     }
 
     @Override
     public void run() {
-      if (isClosed) {
-        throw new AlreadyClosedException("ScheduledTrigger " + trigger.getName() + " has been closed.");
-      }
       // fire a trigger only if an action is not pending
       // note this is not fool proof e.g. it does not prevent an action being executed while a trigger
       // is still executing. There is additional protection against that scenario in the event listener.
@@ -680,9 +662,9 @@ public class ScheduledTriggers implements Closeable {
   }
 
   private class TriggerListeners {
-    Map<String, Map<TriggerEventProcessorStage, List<TriggerListener>>> listenersPerStage = new HashMap<>();
-    Map<String, TriggerListener> listenersPerName = new HashMap<>();
-    List<TriggerListener> additionalListeners = new ArrayList<>();
+    final Map<String, Map<TriggerEventProcessorStage, List<TriggerListener>>> listenersPerStage = new ConcurrentHashMap<>();
+    final Map<String, TriggerListener> listenersPerName = new ConcurrentHashMap<>();
+    final Set<TriggerListener> additionalListeners = ConcurrentHashMap.newKeySet();
     ReentrantLock updateLock = new ReentrantLock();
 
     public TriggerListeners() {
@@ -691,7 +673,6 @@ public class ScheduledTriggers implements Closeable {
 
     private TriggerListeners(Map<String, Map<TriggerEventProcessorStage, List<TriggerListener>>> listenersPerStage,
                              Map<String, TriggerListener> listenersPerName) {
-      this.listenersPerStage = new HashMap<>();
       listenersPerStage.forEach((n, listeners) -> {
         Map<TriggerEventProcessorStage, List<TriggerListener>> perStage = this.listenersPerStage.computeIfAbsent(n, name -> new HashMap<>());
         listeners.forEach((s, lst) -> {
@@ -699,7 +680,7 @@ public class ScheduledTriggers implements Closeable {
           newLst.addAll(lst);
         });
       });
-      this.listenersPerName = new HashMap<>(listenersPerName);
+      this.listenersPerName.putAll(listenersPerName);
     }
 
     public TriggerListeners copy() {
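
The listener bookkeeping above moves from plain HashMap/ArrayList to
concurrent collections. One practical difference worth noting: a
Collections.synchronizedList still requires the caller to hold the list's
lock while iterating, whereas a ConcurrentHashMap-backed set can be
iterated without external locking (with weakly consistent semantics). A
minimal sketch, assuming a hypothetical Listener type:

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import java.util.Set;
    import java.util.concurrent.ConcurrentHashMap;

    public class ListenerRegistry {
      // Hypothetical listener type, used only for illustration.
      interface Listener { void onEvent(String event); }

      private final List<Listener> syncList =
          Collections.synchronizedList(new ArrayList<>());
      private final Set<Listener> concurrentSet = ConcurrentHashMap.newKeySet();

      void fireAll(String event) {
        // synchronizedList: iteration must be guarded by the list's monitor,
        // or a concurrent add() can trigger ConcurrentModificationException.
        synchronized (syncList) {
          for (Listener l : syncList) l.onEvent(event);
        }
        // newKeySet(): iteration is weakly consistent and never throws
        // ConcurrentModificationException, so no external lock is needed.
        for (Listener l : concurrentSet) l.onEvent(event);
      }
    }
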
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
index efd5b24..505e33b 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/SearchRateTrigger.java
@@ -111,22 +111,24 @@ public class SearchRateTrigger extends TriggerBase {
     this.state.put("lastNodeEvent", lastNodeEvent);
     this.state.put("lastShardEvent", lastShardEvent);
     this.state.put("lastReplicaEvent", lastReplicaEvent);
-    TriggerUtils.validProperties(validProperties,
-        COLLECTIONS_PROP, AutoScalingParams.SHARD, AutoScalingParams.NODE,
-        METRIC_PROP,
-        MAX_OPS_PROP,
-        MIN_REPLICAS_PROP,
-        ABOVE_OP_PROP,
-        BELOW_OP_PROP,
-        ABOVE_NODE_OP_PROP,
-        BELOW_NODE_OP_PROP,
-        ABOVE_RATE_PROP,
-        BELOW_RATE_PROP,
-        ABOVE_NODE_RATE_PROP,
-        BELOW_NODE_RATE_PROP,
-        // back-compat props
-        BC_COLLECTION_PROP,
-        BC_RATE_PROP);
+    Set<String> vProperties = new HashSet<>(validProperties);
+    TriggerUtils.validProperties(vProperties,
+            COLLECTIONS_PROP, AutoScalingParams.SHARD, AutoScalingParams.NODE,
+            METRIC_PROP,
+            MAX_OPS_PROP,
+            MIN_REPLICAS_PROP,
+            ABOVE_OP_PROP,
+            BELOW_OP_PROP,
+            ABOVE_NODE_OP_PROP,
+            BELOW_NODE_OP_PROP,
+            ABOVE_RATE_PROP,
+            BELOW_RATE_PROP,
+            ABOVE_NODE_RATE_PROP,
+            BELOW_NODE_RATE_PROP,
+            // back-compat props
+            BC_COLLECTION_PROP,
+            BC_RATE_PROP);
+    this.validProperties = vProperties;
   }
 
   @Override
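
SearchRateTrigger applies the same copy-then-publish idiom as
ScheduledTrigger: copy the inherited set, extend only the copy, then assign
it back in a single write. The general pattern, sketched with hypothetical
names, is:

    import java.util.Collections;
    import java.util.HashSet;
    import java.util.Set;

    public class CopyThenPublish {
      // volatile guarantees readers observe either the old set or the fully
      // built new one, never a partially populated set.
      private volatile Set<String> validProperties = Collections.emptySet();

      void addValidProperties(String... names) {
        Set<String> copy = new HashSet<>(validProperties);  // snapshot current contents
        Collections.addAll(copy, names);                    // mutate only the copy
        validProperties = Collections.unmodifiableSet(copy); // publish atomically
      }
    }
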
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionBase.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionBase.java
index 7a9f34b..aacedc8 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionBase.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerActionBase.java
@@ -17,10 +17,12 @@
 package org.apache.solr.cloud.autoscaling;
 
 import java.io.IOException;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
 import org.apache.solr.core.SolrResourceLoader;
@@ -30,14 +32,14 @@ import org.apache.solr.core.SolrResourceLoader;
  */
 public abstract class TriggerActionBase implements TriggerAction {
 
-  protected Map<String, Object> properties = new HashMap<>();
+  protected volatile Map<String, Object> properties = new HashMap<>();
   protected SolrResourceLoader loader;
   protected SolrCloudManager cloudManager;
   /**
    * Set of valid property names. Subclasses may add to this set
    * using {@link TriggerUtils#validProperties(Set, String...)}
    */
-  protected final Set<String> validProperties = new HashSet<>();
+  protected volatile Set<String> validProperties = Collections.emptySet();
   /**
    * Set of required property names. Subclasses may add to this set
    * using {@link TriggerUtils#requiredProperties(Set, Set, String...)}
@@ -47,7 +49,12 @@ public abstract class TriggerActionBase implements TriggerAction {
 
   protected TriggerActionBase() {
     // not strictly needed here because they are already checked during instantiation
-    TriggerUtils.validProperties(validProperties, "name", "class");
+    Set<String> vProperties = new HashSet<>();
+    // subclasses may further modify this set to include other supported properties
+    TriggerUtils.validProperties(vProperties, "name", "class");
+
+    this.validProperties = Collections.unmodifiableSet(vProperties);
+
   }
 
   @Override
@@ -70,7 +77,8 @@ public abstract class TriggerActionBase implements TriggerAction {
     this.loader = loader;
     this.cloudManager = cloudManager;
     if (properties != null) {
-      this.properties.putAll(properties);
+      Map<String, Object> props = new HashMap<>(properties);
+      this.properties = props;
     }
     // validate the config
     Map<String, String> results = new HashMap<>();
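
With the volatile properties field, configure() swaps in a freshly built
map instead of calling putAll on a shared one, so concurrent readers never
observe a half-populated configuration. A minimal sketch of both sides of
that handoff, under the same assumptions:

    import java.util.HashMap;
    import java.util.Map;

    public class ConfigHolder {
      private volatile Map<String, Object> properties = new HashMap<>();

      // Writer: build the replacement completely, then publish it with one
      // volatile write.
      void configure(Map<String, Object> incoming) {
        Map<String, Object> props = new HashMap<>(incoming);
        properties = props;
      }

      // Reader: capture the reference once so every lookup hits the same
      // snapshot, even if configure() runs concurrently.
      Object get(String key, Object defaultValue) {
        Map<String, Object> snapshot = properties;
        return snapshot.getOrDefault(key, defaultValue);
      }
    }
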
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerBase.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerBase.java
index d045f6a..a0ed4c4 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerBase.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/TriggerBase.java
@@ -25,6 +25,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.Properties;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicReference;
 
@@ -56,23 +57,23 @@ public abstract class TriggerBase implements AutoScaling.Trigger {
   protected SolrCloudManager cloudManager;
   protected SolrResourceLoader loader;
   protected DistribStateManager stateManager;
-  protected final Map<String, Object> properties = new HashMap<>();
+  protected volatile Map<String, Object> properties = Collections.emptyMap();
   /**
    * Set of valid property names. Subclasses may add to this set
    * using {@link TriggerUtils#validProperties(Set, String...)}
    */
-  protected final Set<String> validProperties = new HashSet<>();
+  protected volatile Set<String> validProperties = Collections.emptySet();
   /**
    * Set of required property names. Subclasses may add to this set
    * using {@link TriggerUtils#requiredProperties(Set, Set, String...)}
    * (required properties are also valid properties).
    */
-  protected final Set<String> requiredProperties = new HashSet<>();
+  protected volatile Set<String> requiredProperties = Collections.emptySet();
   protected final TriggerEventType eventType;
   protected int waitForSecond;
   protected Map<String,Object> lastState;
   protected final AtomicReference<AutoScaling.TriggerEventProcessor> processorRef = new AtomicReference<>();
-  protected List<TriggerAction> actions;
+  protected volatile List<TriggerAction> actions;
   protected boolean enabled;
   protected boolean isClosed;
 
@@ -80,23 +81,25 @@ public abstract class TriggerBase implements AutoScaling.Trigger {
   protected TriggerBase(TriggerEventType eventType, String name) {
     this.eventType = eventType;
     this.name = name;
-
+    Set<String> vProperties = new HashSet<>();
     // subclasses may further modify this set to include other supported properties
-    TriggerUtils.validProperties(validProperties, "name", "class", "event", "enabled", "waitFor", "actions");
+    TriggerUtils.validProperties(vProperties, "name", "class", "event", "enabled", "waitFor", "actions");
+
+    this.validProperties = Collections.unmodifiableSet(vProperties);
   }
 
   /**
    * Return a set of valid property names supported by this trigger.
    */
   public final Set<String> getValidProperties() {
-    return Collections.unmodifiableSet(this.validProperties);
+    return this.validProperties;
   }
 
   /**
    * Return a set of required property names supported by this trigger.
    */
   public final Set<String> getRequiredProperties() {
-    return Collections.unmodifiableSet(this.requiredProperties);
+    return this.requiredProperties;
   }
 
   @Override
@@ -104,13 +107,14 @@ public abstract class TriggerBase implements AutoScaling.Trigger {
     this.cloudManager = cloudManager;
     this.loader = loader;
     this.stateManager = cloudManager.getDistribStateManager();
+    Map<String, Object> props = new HashMap<>(this.properties);
     if (properties != null) {
-      this.properties.putAll(properties);
+      props.putAll(properties);
     }
-    this.enabled = Boolean.parseBoolean(String.valueOf(this.properties.getOrDefault("enabled", "true")));
-    this.waitForSecond = ((Number) this.properties.getOrDefault("waitFor", -1L)).intValue();
+    this.enabled = Boolean.parseBoolean(String.valueOf(props.getOrDefault("enabled", "true")));
+    this.waitForSecond = ((Number) props.getOrDefault("waitFor", -1L)).intValue();
     @SuppressWarnings({"unchecked"})
-    List<Map<String, Object>> o = (List<Map<String, Object>>) properties.get("actions");
+    List<Map<String, Object>> o = (List<Map<String, Object>>) props.get("actions");
     if (o != null && !o.isEmpty()) {
       actions = new ArrayList<>(3);
... 37891 lines suppressed ...
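
One caveat in the configure() logic above: "waitFor" is cast directly to
Number, which assumes upstream validation has already coerced any string
input. If that assumption ever weakens, a defensive variant (a hypothetical
helper, not part of this patch) could look like:

    // Hypothetical defensive parse of the "waitFor" property; the patched
    // code assumes validation has already converted the value to a Number.
    static int parseWaitFor(Object value, int defaultSeconds) {
      if (value == null) return defaultSeconds;
      if (value instanceof Number) return ((Number) value).intValue();
      try {
        return Integer.parseInt(value.toString().trim());
      } catch (NumberFormatException e) {
        return defaultSeconds;
      }
    }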