You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by gi...@apache.org on 2017/06/02 15:00:13 UTC
[12/32] hbase-site git commit: Published site at
82d554e3783372cc6b05489452c815b57c06f6cd.
http://git-wip-us.apache.org/repos/asf/hbase-site/blob/7e7efd3e/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html
----------------------------------------------------------------------
diff --git a/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html b/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html
index 8ffd272..58b1653 100644
--- a/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html
+++ b/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html
@@ -233,220 +233,235 @@
<span class="sourceLineNo">225</span> return entryLists;<a name="line.225"></a>
<span class="sourceLineNo">226</span> }<a name="line.226"></a>
<span class="sourceLineNo">227</span><a name="line.227"></a>
-<span class="sourceLineNo">228</span> /**<a name="line.228"></a>
-<span class="sourceLineNo">229</span> * Do the shipping logic<a name="line.229"></a>
-<span class="sourceLineNo">230</span> */<a name="line.230"></a>
-<span class="sourceLineNo">231</span> @Override<a name="line.231"></a>
-<span class="sourceLineNo">232</span> public boolean replicate(ReplicateContext replicateContext) {<a name="line.232"></a>
-<span class="sourceLineNo">233</span> CompletionService<Integer> pool = new ExecutorCompletionService<>(this.exec);<a name="line.233"></a>
-<span class="sourceLineNo">234</span> List<List<Entry>> batches;<a name="line.234"></a>
-<span class="sourceLineNo">235</span> String walGroupId = replicateContext.getWalGroupId();<a name="line.235"></a>
-<span class="sourceLineNo">236</span> int sleepMultiplier = 1;<a name="line.236"></a>
-<span class="sourceLineNo">237</span><a name="line.237"></a>
-<span class="sourceLineNo">238</span> if (!peersSelected && this.isRunning()) {<a name="line.238"></a>
-<span class="sourceLineNo">239</span> connectToPeers();<a name="line.239"></a>
-<span class="sourceLineNo">240</span> peersSelected = true;<a name="line.240"></a>
-<span class="sourceLineNo">241</span> }<a name="line.241"></a>
-<span class="sourceLineNo">242</span><a name="line.242"></a>
-<span class="sourceLineNo">243</span> int numSinks = replicationSinkMgr.getNumSinks();<a name="line.243"></a>
-<span class="sourceLineNo">244</span> if (numSinks == 0) {<a name="line.244"></a>
-<span class="sourceLineNo">245</span> LOG.warn("No replication sinks found, returning without replicating. The source should retry"<a name="line.245"></a>
-<span class="sourceLineNo">246</span> + " with the same set of edits.");<a name="line.246"></a>
-<span class="sourceLineNo">247</span> return false;<a name="line.247"></a>
-<span class="sourceLineNo">248</span> }<a name="line.248"></a>
+<span class="sourceLineNo">228</span> private void reconnectToPeerCluster() {<a name="line.228"></a>
+<span class="sourceLineNo">229</span> ClusterConnection connection = null;<a name="line.229"></a>
+<span class="sourceLineNo">230</span> try {<a name="line.230"></a>
+<span class="sourceLineNo">231</span> connection = (ClusterConnection) ConnectionFactory.createConnection(this.conf);<a name="line.231"></a>
+<span class="sourceLineNo">232</span> } catch (IOException ioe) {<a name="line.232"></a>
+<span class="sourceLineNo">233</span> LOG.warn("Failed to create connection for peer cluster", ioe);<a name="line.233"></a>
+<span class="sourceLineNo">234</span> }<a name="line.234"></a>
+<span class="sourceLineNo">235</span> if (connection != null) {<a name="line.235"></a>
+<span class="sourceLineNo">236</span> this.conn = connection;<a name="line.236"></a>
+<span class="sourceLineNo">237</span> }<a name="line.237"></a>
+<span class="sourceLineNo">238</span> }<a name="line.238"></a>
+<span class="sourceLineNo">239</span><a name="line.239"></a>
+<span class="sourceLineNo">240</span> /**<a name="line.240"></a>
+<span class="sourceLineNo">241</span> * Do the shipping logic<a name="line.241"></a>
+<span class="sourceLineNo">242</span> */<a name="line.242"></a>
+<span class="sourceLineNo">243</span> @Override<a name="line.243"></a>
+<span class="sourceLineNo">244</span> public boolean replicate(ReplicateContext replicateContext) {<a name="line.244"></a>
+<span class="sourceLineNo">245</span> CompletionService<Integer> pool = new ExecutorCompletionService<>(this.exec);<a name="line.245"></a>
+<span class="sourceLineNo">246</span> List<List<Entry>> batches;<a name="line.246"></a>
+<span class="sourceLineNo">247</span> String walGroupId = replicateContext.getWalGroupId();<a name="line.247"></a>
+<span class="sourceLineNo">248</span> int sleepMultiplier = 1;<a name="line.248"></a>
<span class="sourceLineNo">249</span><a name="line.249"></a>
-<span class="sourceLineNo">250</span> batches = createBatches(replicateContext.getEntries());<a name="line.250"></a>
-<span class="sourceLineNo">251</span><a name="line.251"></a>
-<span class="sourceLineNo">252</span> while (this.isRunning() && !exec.isShutdown()) {<a name="line.252"></a>
-<span class="sourceLineNo">253</span> if (!isPeerEnabled()) {<a name="line.253"></a>
-<span class="sourceLineNo">254</span> if (sleepForRetries("Replication is disabled", sleepMultiplier)) {<a name="line.254"></a>
-<span class="sourceLineNo">255</span> sleepMultiplier++;<a name="line.255"></a>
-<span class="sourceLineNo">256</span> }<a name="line.256"></a>
-<span class="sourceLineNo">257</span> continue;<a name="line.257"></a>
-<span class="sourceLineNo">258</span> }<a name="line.258"></a>
-<span class="sourceLineNo">259</span> try {<a name="line.259"></a>
-<span class="sourceLineNo">260</span> int futures = 0;<a name="line.260"></a>
-<span class="sourceLineNo">261</span> for (int i=0; i<batches.size(); i++) {<a name="line.261"></a>
-<span class="sourceLineNo">262</span> List<Entry> entries = batches.get(i);<a name="line.262"></a>
-<span class="sourceLineNo">263</span> if (!entries.isEmpty()) {<a name="line.263"></a>
-<span class="sourceLineNo">264</span> if (LOG.isTraceEnabled()) {<a name="line.264"></a>
-<span class="sourceLineNo">265</span> LOG.trace("Submitting " + entries.size() +<a name="line.265"></a>
-<span class="sourceLineNo">266</span> " entries of total size " + replicateContext.getSize());<a name="line.266"></a>
-<span class="sourceLineNo">267</span> }<a name="line.267"></a>
-<span class="sourceLineNo">268</span> // RuntimeExceptions encountered here bubble up and are handled in ReplicationSource<a name="line.268"></a>
-<span class="sourceLineNo">269</span> pool.submit(createReplicator(entries, i));<a name="line.269"></a>
-<span class="sourceLineNo">270</span> futures++;<a name="line.270"></a>
-<span class="sourceLineNo">271</span> }<a name="line.271"></a>
-<span class="sourceLineNo">272</span> }<a name="line.272"></a>
-<span class="sourceLineNo">273</span> IOException iox = null;<a name="line.273"></a>
-<span class="sourceLineNo">274</span><a name="line.274"></a>
-<span class="sourceLineNo">275</span> long lastWriteTime = 0;<a name="line.275"></a>
-<span class="sourceLineNo">276</span> for (int i=0; i<futures; i++) {<a name="line.276"></a>
-<span class="sourceLineNo">277</span> try {<a name="line.277"></a>
-<span class="sourceLineNo">278</span> // wait for all futures, remove successful parts<a name="line.278"></a>
-<span class="sourceLineNo">279</span> // (only the remaining parts will be retried)<a name="line.279"></a>
-<span class="sourceLineNo">280</span> Future<Integer> f = pool.take();<a name="line.280"></a>
-<span class="sourceLineNo">281</span> int index = f.get().intValue();<a name="line.281"></a>
-<span class="sourceLineNo">282</span> List<Entry> batch = batches.get(index);<a name="line.282"></a>
-<span class="sourceLineNo">283</span> batches.set(index, Collections.<Entry>emptyList()); // remove successful batch<a name="line.283"></a>
-<span class="sourceLineNo">284</span> // Find the most recent write time in the batch<a name="line.284"></a>
-<span class="sourceLineNo">285</span> long writeTime = batch.get(batch.size() - 1).getKey().getWriteTime();<a name="line.285"></a>
-<span class="sourceLineNo">286</span> if (writeTime > lastWriteTime) {<a name="line.286"></a>
-<span class="sourceLineNo">287</span> lastWriteTime = writeTime;<a name="line.287"></a>
-<span class="sourceLineNo">288</span> }<a name="line.288"></a>
-<span class="sourceLineNo">289</span> } catch (InterruptedException ie) {<a name="line.289"></a>
-<span class="sourceLineNo">290</span> iox = new IOException(ie);<a name="line.290"></a>
-<span class="sourceLineNo">291</span> } catch (ExecutionException ee) {<a name="line.291"></a>
-<span class="sourceLineNo">292</span> // cause must be an IOException<a name="line.292"></a>
-<span class="sourceLineNo">293</span> iox = (IOException)ee.getCause();<a name="line.293"></a>
-<span class="sourceLineNo">294</span> }<a name="line.294"></a>
-<span class="sourceLineNo">295</span> }<a name="line.295"></a>
-<span class="sourceLineNo">296</span> if (iox != null) {<a name="line.296"></a>
-<span class="sourceLineNo">297</span> // if we had any exceptions, try again<a name="line.297"></a>
-<span class="sourceLineNo">298</span> throw iox;<a name="line.298"></a>
-<span class="sourceLineNo">299</span> }<a name="line.299"></a>
-<span class="sourceLineNo">300</span> // update metrics<a name="line.300"></a>
-<span class="sourceLineNo">301</span> if (lastWriteTime > 0) {<a name="line.301"></a>
-<span class="sourceLineNo">302</span> this.metrics.setAgeOfLastShippedOp(lastWriteTime, walGroupId);<a name="line.302"></a>
-<span class="sourceLineNo">303</span> }<a name="line.303"></a>
-<span class="sourceLineNo">304</span> return true;<a name="line.304"></a>
-<span class="sourceLineNo">305</span><a name="line.305"></a>
-<span class="sourceLineNo">306</span> } catch (IOException ioe) {<a name="line.306"></a>
-<span class="sourceLineNo">307</span> // Didn't ship anything, but must still age the last time we did<a name="line.307"></a>
-<span class="sourceLineNo">308</span> this.metrics.refreshAgeOfLastShippedOp(walGroupId);<a name="line.308"></a>
-<span class="sourceLineNo">309</span> if (ioe instanceof RemoteException) {<a name="line.309"></a>
-<span class="sourceLineNo">310</span> ioe = ((RemoteException) ioe).unwrapRemoteException();<a name="line.310"></a>
-<span class="sourceLineNo">311</span> LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);<a name="line.311"></a>
-<span class="sourceLineNo">312</span> if (ioe instanceof TableNotFoundException) {<a name="line.312"></a>
-<span class="sourceLineNo">313</span> if (sleepForRetries("A table is missing in the peer cluster. "<a name="line.313"></a>
-<span class="sourceLineNo">314</span> + "Replication cannot proceed without losing data.", sleepMultiplier)) {<a name="line.314"></a>
-<span class="sourceLineNo">315</span> sleepMultiplier++;<a name="line.315"></a>
-<span class="sourceLineNo">316</span> }<a name="line.316"></a>
-<span class="sourceLineNo">317</span> } else {<a name="line.317"></a>
-<span class="sourceLineNo">318</span> LOG.warn("Peer encountered RemoteException, rechecking all sinks: ", ioe);<a name="line.318"></a>
-<span class="sourceLineNo">319</span> replicationSinkMgr.chooseSinks();<a name="line.319"></a>
-<span class="sourceLineNo">320</span> }<a name="line.320"></a>
-<span class="sourceLineNo">321</span> } else {<a name="line.321"></a>
-<span class="sourceLineNo">322</span> if (ioe instanceof SocketTimeoutException) {<a name="line.322"></a>
-<span class="sourceLineNo">323</span> // This exception means we waited for more than 60s and nothing<a name="line.323"></a>
-<span class="sourceLineNo">324</span> // happened, the cluster is alive and calling it right away<a name="line.324"></a>
-<span class="sourceLineNo">325</span> // even for a test just makes things worse.<a name="line.325"></a>
-<span class="sourceLineNo">326</span> sleepForRetries("Encountered a SocketTimeoutException. Since the " +<a name="line.326"></a>
-<span class="sourceLineNo">327</span> "call to the remote cluster timed out, which is usually " +<a name="line.327"></a>
-<span class="sourceLineNo">328</span> "caused by a machine failure or a massive slowdown",<a name="line.328"></a>
-<span class="sourceLineNo">329</span> this.socketTimeoutMultiplier);<a name="line.329"></a>
-<span class="sourceLineNo">330</span> } else if (ioe instanceof ConnectException) {<a name="line.330"></a>
-<span class="sourceLineNo">331</span> LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);<a name="line.331"></a>
-<span class="sourceLineNo">332</span> replicationSinkMgr.chooseSinks();<a name="line.332"></a>
-<span class="sourceLineNo">333</span> } else {<a name="line.333"></a>
-<span class="sourceLineNo">334</span> LOG.warn("Can't replicate because of a local or network error: ", ioe);<a name="line.334"></a>
+<span class="sourceLineNo">250</span> if (!peersSelected && this.isRunning()) {<a name="line.250"></a>
+<span class="sourceLineNo">251</span> connectToPeers();<a name="line.251"></a>
+<span class="sourceLineNo">252</span> peersSelected = true;<a name="line.252"></a>
+<span class="sourceLineNo">253</span> }<a name="line.253"></a>
+<span class="sourceLineNo">254</span><a name="line.254"></a>
+<span class="sourceLineNo">255</span> int numSinks = replicationSinkMgr.getNumSinks();<a name="line.255"></a>
+<span class="sourceLineNo">256</span> if (numSinks == 0) {<a name="line.256"></a>
+<span class="sourceLineNo">257</span> LOG.warn("No replication sinks found, returning without replicating. The source should retry"<a name="line.257"></a>
+<span class="sourceLineNo">258</span> + " with the same set of edits.");<a name="line.258"></a>
+<span class="sourceLineNo">259</span> return false;<a name="line.259"></a>
+<span class="sourceLineNo">260</span> }<a name="line.260"></a>
+<span class="sourceLineNo">261</span><a name="line.261"></a>
+<span class="sourceLineNo">262</span> batches = createBatches(replicateContext.getEntries());<a name="line.262"></a>
+<span class="sourceLineNo">263</span><a name="line.263"></a>
+<span class="sourceLineNo">264</span> while (this.isRunning() && !exec.isShutdown()) {<a name="line.264"></a>
+<span class="sourceLineNo">265</span> if (!isPeerEnabled()) {<a name="line.265"></a>
+<span class="sourceLineNo">266</span> if (sleepForRetries("Replication is disabled", sleepMultiplier)) {<a name="line.266"></a>
+<span class="sourceLineNo">267</span> sleepMultiplier++;<a name="line.267"></a>
+<span class="sourceLineNo">268</span> }<a name="line.268"></a>
+<span class="sourceLineNo">269</span> continue;<a name="line.269"></a>
+<span class="sourceLineNo">270</span> }<a name="line.270"></a>
+<span class="sourceLineNo">271</span> if (this.conn == null || this.conn.isClosed()) {<a name="line.271"></a>
+<span class="sourceLineNo">272</span> reconnectToPeerCluster();<a name="line.272"></a>
+<span class="sourceLineNo">273</span> }<a name="line.273"></a>
+<span class="sourceLineNo">274</span> try {<a name="line.274"></a>
+<span class="sourceLineNo">275</span> int futures = 0;<a name="line.275"></a>
+<span class="sourceLineNo">276</span> for (int i=0; i<batches.size(); i++) {<a name="line.276"></a>
+<span class="sourceLineNo">277</span> List<Entry> entries = batches.get(i);<a name="line.277"></a>
+<span class="sourceLineNo">278</span> if (!entries.isEmpty()) {<a name="line.278"></a>
+<span class="sourceLineNo">279</span> if (LOG.isTraceEnabled()) {<a name="line.279"></a>
+<span class="sourceLineNo">280</span> LOG.trace("Submitting " + entries.size() +<a name="line.280"></a>
+<span class="sourceLineNo">281</span> " entries of total size " + replicateContext.getSize());<a name="line.281"></a>
+<span class="sourceLineNo">282</span> }<a name="line.282"></a>
+<span class="sourceLineNo">283</span> // RuntimeExceptions encountered here bubble up and are handled in ReplicationSource<a name="line.283"></a>
+<span class="sourceLineNo">284</span> pool.submit(createReplicator(entries, i));<a name="line.284"></a>
+<span class="sourceLineNo">285</span> futures++;<a name="line.285"></a>
+<span class="sourceLineNo">286</span> }<a name="line.286"></a>
+<span class="sourceLineNo">287</span> }<a name="line.287"></a>
+<span class="sourceLineNo">288</span> IOException iox = null;<a name="line.288"></a>
+<span class="sourceLineNo">289</span><a name="line.289"></a>
+<span class="sourceLineNo">290</span> long lastWriteTime = 0;<a name="line.290"></a>
+<span class="sourceLineNo">291</span> for (int i=0; i<futures; i++) {<a name="line.291"></a>
+<span class="sourceLineNo">292</span> try {<a name="line.292"></a>
+<span class="sourceLineNo">293</span> // wait for all futures, remove successful parts<a name="line.293"></a>
+<span class="sourceLineNo">294</span> // (only the remaining parts will be retried)<a name="line.294"></a>
+<span class="sourceLineNo">295</span> Future<Integer> f = pool.take();<a name="line.295"></a>
+<span class="sourceLineNo">296</span> int index = f.get().intValue();<a name="line.296"></a>
+<span class="sourceLineNo">297</span> List<Entry> batch = batches.get(index);<a name="line.297"></a>
+<span class="sourceLineNo">298</span> batches.set(index, Collections.<Entry>emptyList()); // remove successful batch<a name="line.298"></a>
+<span class="sourceLineNo">299</span> // Find the most recent write time in the batch<a name="line.299"></a>
+<span class="sourceLineNo">300</span> long writeTime = batch.get(batch.size() - 1).getKey().getWriteTime();<a name="line.300"></a>
+<span class="sourceLineNo">301</span> if (writeTime > lastWriteTime) {<a name="line.301"></a>
+<span class="sourceLineNo">302</span> lastWriteTime = writeTime;<a name="line.302"></a>
+<span class="sourceLineNo">303</span> }<a name="line.303"></a>
+<span class="sourceLineNo">304</span> } catch (InterruptedException ie) {<a name="line.304"></a>
+<span class="sourceLineNo">305</span> iox = new IOException(ie);<a name="line.305"></a>
+<span class="sourceLineNo">306</span> } catch (ExecutionException ee) {<a name="line.306"></a>
+<span class="sourceLineNo">307</span> // cause must be an IOException<a name="line.307"></a>
+<span class="sourceLineNo">308</span> iox = (IOException)ee.getCause();<a name="line.308"></a>
+<span class="sourceLineNo">309</span> }<a name="line.309"></a>
+<span class="sourceLineNo">310</span> }<a name="line.310"></a>
+<span class="sourceLineNo">311</span> if (iox != null) {<a name="line.311"></a>
+<span class="sourceLineNo">312</span> // if we had any exceptions, try again<a name="line.312"></a>
+<span class="sourceLineNo">313</span> throw iox;<a name="line.313"></a>
+<span class="sourceLineNo">314</span> }<a name="line.314"></a>
+<span class="sourceLineNo">315</span> // update metrics<a name="line.315"></a>
+<span class="sourceLineNo">316</span> if (lastWriteTime > 0) {<a name="line.316"></a>
+<span class="sourceLineNo">317</span> this.metrics.setAgeOfLastShippedOp(lastWriteTime, walGroupId);<a name="line.317"></a>
+<span class="sourceLineNo">318</span> }<a name="line.318"></a>
+<span class="sourceLineNo">319</span> return true;<a name="line.319"></a>
+<span class="sourceLineNo">320</span><a name="line.320"></a>
+<span class="sourceLineNo">321</span> } catch (IOException ioe) {<a name="line.321"></a>
+<span class="sourceLineNo">322</span> // Didn't ship anything, but must still age the last time we did<a name="line.322"></a>
+<span class="sourceLineNo">323</span> this.metrics.refreshAgeOfLastShippedOp(walGroupId);<a name="line.323"></a>
+<span class="sourceLineNo">324</span> if (ioe instanceof RemoteException) {<a name="line.324"></a>
+<span class="sourceLineNo">325</span> ioe = ((RemoteException) ioe).unwrapRemoteException();<a name="line.325"></a>
+<span class="sourceLineNo">326</span> LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);<a name="line.326"></a>
+<span class="sourceLineNo">327</span> if (ioe instanceof TableNotFoundException) {<a name="line.327"></a>
+<span class="sourceLineNo">328</span> if (sleepForRetries("A table is missing in the peer cluster. "<a name="line.328"></a>
+<span class="sourceLineNo">329</span> + "Replication cannot proceed without losing data.", sleepMultiplier)) {<a name="line.329"></a>
+<span class="sourceLineNo">330</span> sleepMultiplier++;<a name="line.330"></a>
+<span class="sourceLineNo">331</span> }<a name="line.331"></a>
+<span class="sourceLineNo">332</span> } else {<a name="line.332"></a>
+<span class="sourceLineNo">333</span> LOG.warn("Peer encountered RemoteException, rechecking all sinks: ", ioe);<a name="line.333"></a>
+<span class="sourceLineNo">334</span> replicationSinkMgr.chooseSinks();<a name="line.334"></a>
<span class="sourceLineNo">335</span> }<a name="line.335"></a>
-<span class="sourceLineNo">336</span> }<a name="line.336"></a>
-<span class="sourceLineNo">337</span> if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {<a name="line.337"></a>
-<span class="sourceLineNo">338</span> sleepMultiplier++;<a name="line.338"></a>
-<span class="sourceLineNo">339</span> }<a name="line.339"></a>
-<span class="sourceLineNo">340</span> }<a name="line.340"></a>
-<span class="sourceLineNo">341</span> }<a name="line.341"></a>
-<span class="sourceLineNo">342</span> return false; // in case we exited before replicating<a name="line.342"></a>
-<span class="sourceLineNo">343</span> }<a name="line.343"></a>
-<span class="sourceLineNo">344</span><a name="line.344"></a>
-<span class="sourceLineNo">345</span> protected boolean isPeerEnabled() {<a name="line.345"></a>
-<span class="sourceLineNo">346</span> return ctx.getReplicationPeer().getPeerState() == PeerState.ENABLED;<a name="line.346"></a>
-<span class="sourceLineNo">347</span> }<a name="line.347"></a>
-<span class="sourceLineNo">348</span><a name="line.348"></a>
-<span class="sourceLineNo">349</span> @Override<a name="line.349"></a>
-<span class="sourceLineNo">350</span> protected void doStop() {<a name="line.350"></a>
-<span class="sourceLineNo">351</span> disconnect(); //don't call super.doStop()<a name="line.351"></a>
-<span class="sourceLineNo">352</span> if (this.conn != null) {<a name="line.352"></a>
-<span class="sourceLineNo">353</span> try {<a name="line.353"></a>
-<span class="sourceLineNo">354</span> this.conn.close();<a name="line.354"></a>
-<span class="sourceLineNo">355</span> this.conn = null;<a name="line.355"></a>
-<span class="sourceLineNo">356</span> } catch (IOException e) {<a name="line.356"></a>
-<span class="sourceLineNo">357</span> LOG.warn("Failed to close the connection");<a name="line.357"></a>
-<span class="sourceLineNo">358</span> }<a name="line.358"></a>
-<span class="sourceLineNo">359</span> }<a name="line.359"></a>
-<span class="sourceLineNo">360</span> // Allow currently running replication tasks to finish<a name="line.360"></a>
-<span class="sourceLineNo">361</span> exec.shutdown();<a name="line.361"></a>
-<span class="sourceLineNo">362</span> try {<a name="line.362"></a>
-<span class="sourceLineNo">363</span> exec.awaitTermination(maxTerminationWait, TimeUnit.MILLISECONDS);<a name="line.363"></a>
-<span class="sourceLineNo">364</span> } catch (InterruptedException e) {<a name="line.364"></a>
-<span class="sourceLineNo">365</span> }<a name="line.365"></a>
-<span class="sourceLineNo">366</span> // Abort if the tasks did not terminate in time<a name="line.366"></a>
-<span class="sourceLineNo">367</span> if (!exec.isTerminated()) {<a name="line.367"></a>
-<span class="sourceLineNo">368</span> String errMsg = "HBaseInterClusterReplicationEndpoint termination failed. The " +<a name="line.368"></a>
-<span class="sourceLineNo">369</span> "ThreadPoolExecutor failed to finish all tasks within " + maxTerminationWait + "ms. " +<a name="line.369"></a>
-<span class="sourceLineNo">370</span> "Aborting to prevent Replication from deadlocking. See HBASE-16081.";<a name="line.370"></a>
-<span class="sourceLineNo">371</span> abortable.abort(errMsg, new IOException(errMsg));<a name="line.371"></a>
-<span class="sourceLineNo">372</span> }<a name="line.372"></a>
-<span class="sourceLineNo">373</span> notifyStopped();<a name="line.373"></a>
-<span class="sourceLineNo">374</span> }<a name="line.374"></a>
-<span class="sourceLineNo">375</span><a name="line.375"></a>
-<span class="sourceLineNo">376</span> // is this needed? Nobody else will call doStop() otherwise<a name="line.376"></a>
-<span class="sourceLineNo">377</span> @Override<a name="line.377"></a>
-<span class="sourceLineNo">378</span> public State stopAndWait() {<a name="line.378"></a>
-<span class="sourceLineNo">379</span> doStop();<a name="line.379"></a>
-<span class="sourceLineNo">380</span> return super.stopAndWait();<a name="line.380"></a>
-<span class="sourceLineNo">381</span> }<a name="line.381"></a>
-<span class="sourceLineNo">382</span><a name="line.382"></a>
-<span class="sourceLineNo">383</span> @VisibleForTesting<a name="line.383"></a>
-<span class="sourceLineNo">384</span> protected Replicator createReplicator(List<Entry> entries, int ordinal) {<a name="line.384"></a>
-<span class="sourceLineNo">385</span> return new Replicator(entries, ordinal);<a name="line.385"></a>
-<span class="sourceLineNo">386</span> }<a name="line.386"></a>
-<span class="sourceLineNo">387</span><a name="line.387"></a>
-<span class="sourceLineNo">388</span> @VisibleForTesting<a name="line.388"></a>
-<span class="sourceLineNo">389</span> protected class Replicator implements Callable<Integer> {<a name="line.389"></a>
-<span class="sourceLineNo">390</span> private List<Entry> entries;<a name="line.390"></a>
-<span class="sourceLineNo">391</span> private int ordinal;<a name="line.391"></a>
-<span class="sourceLineNo">392</span> public Replicator(List<Entry> entries, int ordinal) {<a name="line.392"></a>
-<span class="sourceLineNo">393</span> this.entries = entries;<a name="line.393"></a>
-<span class="sourceLineNo">394</span> this.ordinal = ordinal;<a name="line.394"></a>
-<span class="sourceLineNo">395</span> }<a name="line.395"></a>
-<span class="sourceLineNo">396</span><a name="line.396"></a>
-<span class="sourceLineNo">397</span> protected void replicateEntries(BlockingInterface rrs, final List<Entry> batch,<a name="line.397"></a>
-<span class="sourceLineNo">398</span> String replicationClusterId, Path baseNamespaceDir, Path hfileArchiveDir)<a name="line.398"></a>
-<span class="sourceLineNo">399</span> throws IOException {<a name="line.399"></a>
-<span class="sourceLineNo">400</span> if (LOG.isTraceEnabled()) {<a name="line.400"></a>
-<span class="sourceLineNo">401</span> long size = 0;<a name="line.401"></a>
-<span class="sourceLineNo">402</span> for (Entry e: entries) {<a name="line.402"></a>
-<span class="sourceLineNo">403</span> size += e.getKey().estimatedSerializedSizeOf();<a name="line.403"></a>
-<span class="sourceLineNo">404</span> size += e.getEdit().estimatedSerializedSizeOf();<a name="line.404"></a>
-<span class="sourceLineNo">405</span> }<a name="line.405"></a>
-<span class="sourceLineNo">406</span> LOG.trace("Replicating batch " + System.identityHashCode(entries) + " of " +<a name="line.406"></a>
-<span class="sourceLineNo">407</span> entries.size() + " entries with total size " + size + " bytes to " +<a name="line.407"></a>
-<span class="sourceLineNo">408</span> replicationClusterId);<a name="line.408"></a>
-<span class="sourceLineNo">409</span> }<a name="line.409"></a>
-<span class="sourceLineNo">410</span> try {<a name="line.410"></a>
-<span class="sourceLineNo">411</span> ReplicationProtbufUtil.replicateWALEntry(rrs, batch.toArray(new Entry[batch.size()]),<a name="line.411"></a>
-<span class="sourceLineNo">412</span> replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.412"></a>
-<span class="sourceLineNo">413</span> if (LOG.isTraceEnabled()) {<a name="line.413"></a>
-<span class="sourceLineNo">414</span> LOG.trace("Completed replicating batch " + System.identityHashCode(entries));<a name="line.414"></a>
-<span class="sourceLineNo">415</span> }<a name="line.415"></a>
-<span class="sourceLineNo">416</span> } catch (IOException e) {<a name="line.416"></a>
-<span class="sourceLineNo">417</span> if (LOG.isTraceEnabled()) {<a name="line.417"></a>
-<span class="sourceLineNo">418</span> LOG.trace("Failed replicating batch " + System.identityHashCode(entries), e);<a name="line.418"></a>
-<span class="sourceLineNo">419</span> }<a name="line.419"></a>
-<span class="sourceLineNo">420</span> throw e;<a name="line.420"></a>
-<span class="sourceLineNo">421</span> }<a name="line.421"></a>
-<span class="sourceLineNo">422</span> }<a name="line.422"></a>
-<span class="sourceLineNo">423</span><a name="line.423"></a>
-<span class="sourceLineNo">424</span> @Override<a name="line.424"></a>
-<span class="sourceLineNo">425</span> public Integer call() throws IOException {<a name="line.425"></a>
-<span class="sourceLineNo">426</span> SinkPeer sinkPeer = null;<a name="line.426"></a>
-<span class="sourceLineNo">427</span> try {<a name="line.427"></a>
-<span class="sourceLineNo">428</span> sinkPeer = replicationSinkMgr.getReplicationSink();<a name="line.428"></a>
-<span class="sourceLineNo">429</span> BlockingInterface rrs = sinkPeer.getRegionServer();<a name="line.429"></a>
-<span class="sourceLineNo">430</span> replicateEntries(rrs, entries, replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.430"></a>
-<span class="sourceLineNo">431</span> replicationSinkMgr.reportSinkSuccess(sinkPeer);<a name="line.431"></a>
-<span class="sourceLineNo">432</span> return ordinal;<a name="line.432"></a>
-<span class="sourceLineNo">433</span> } catch (IOException ioe) {<a name="line.433"></a>
-<span class="sourceLineNo">434</span> if (sinkPeer != null) {<a name="line.434"></a>
-<span class="sourceLineNo">435</span> replicationSinkMgr.reportBadSink(sinkPeer);<a name="line.435"></a>
-<span class="sourceLineNo">436</span> }<a name="line.436"></a>
-<span class="sourceLineNo">437</span> throw ioe;<a name="line.437"></a>
-<span class="sourceLineNo">438</span> }<a name="line.438"></a>
-<span class="sourceLineNo">439</span> }<a name="line.439"></a>
-<span class="sourceLineNo">440</span> }<a name="line.440"></a>
-<span class="sourceLineNo">441</span>}<a name="line.441"></a>
+<span class="sourceLineNo">336</span> } else {<a name="line.336"></a>
+<span class="sourceLineNo">337</span> if (ioe instanceof SocketTimeoutException) {<a name="line.337"></a>
+<span class="sourceLineNo">338</span> // This exception means we waited for more than 60s and nothing<a name="line.338"></a>
+<span class="sourceLineNo">339</span> // happened, the cluster is alive and calling it right away<a name="line.339"></a>
+<span class="sourceLineNo">340</span> // even for a test just makes things worse.<a name="line.340"></a>
+<span class="sourceLineNo">341</span> sleepForRetries("Encountered a SocketTimeoutException. Since the " +<a name="line.341"></a>
+<span class="sourceLineNo">342</span> "call to the remote cluster timed out, which is usually " +<a name="line.342"></a>
+<span class="sourceLineNo">343</span> "caused by a machine failure or a massive slowdown",<a name="line.343"></a>
+<span class="sourceLineNo">344</span> this.socketTimeoutMultiplier);<a name="line.344"></a>
+<span class="sourceLineNo">345</span> } else if (ioe instanceof ConnectException) {<a name="line.345"></a>
+<span class="sourceLineNo">346</span> LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);<a name="line.346"></a>
+<span class="sourceLineNo">347</span> replicationSinkMgr.chooseSinks();<a name="line.347"></a>
+<span class="sourceLineNo">348</span> } else {<a name="line.348"></a>
+<span class="sourceLineNo">349</span> LOG.warn("Can't replicate because of a local or network error: ", ioe);<a name="line.349"></a>
+<span class="sourceLineNo">350</span> }<a name="line.350"></a>
+<span class="sourceLineNo">351</span> }<a name="line.351"></a>
+<span class="sourceLineNo">352</span> if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {<a name="line.352"></a>
+<span class="sourceLineNo">353</span> sleepMultiplier++;<a name="line.353"></a>
+<span class="sourceLineNo">354</span> }<a name="line.354"></a>
+<span class="sourceLineNo">355</span> }<a name="line.355"></a>
+<span class="sourceLineNo">356</span> }<a name="line.356"></a>
+<span class="sourceLineNo">357</span> return false; // in case we exited before replicating<a name="line.357"></a>
+<span class="sourceLineNo">358</span> }<a name="line.358"></a>
+<span class="sourceLineNo">359</span><a name="line.359"></a>
+<span class="sourceLineNo">360</span> protected boolean isPeerEnabled() {<a name="line.360"></a>
+<span class="sourceLineNo">361</span> return ctx.getReplicationPeer().getPeerState() == PeerState.ENABLED;<a name="line.361"></a>
+<span class="sourceLineNo">362</span> }<a name="line.362"></a>
+<span class="sourceLineNo">363</span><a name="line.363"></a>
+<span class="sourceLineNo">364</span> @Override<a name="line.364"></a>
+<span class="sourceLineNo">365</span> protected void doStop() {<a name="line.365"></a>
+<span class="sourceLineNo">366</span> disconnect(); //don't call super.doStop()<a name="line.366"></a>
+<span class="sourceLineNo">367</span> if (this.conn != null) {<a name="line.367"></a>
+<span class="sourceLineNo">368</span> try {<a name="line.368"></a>
+<span class="sourceLineNo">369</span> this.conn.close();<a name="line.369"></a>
+<span class="sourceLineNo">370</span> this.conn = null;<a name="line.370"></a>
+<span class="sourceLineNo">371</span> } catch (IOException e) {<a name="line.371"></a>
+<span class="sourceLineNo">372</span> LOG.warn("Failed to close the connection");<a name="line.372"></a>
+<span class="sourceLineNo">373</span> }<a name="line.373"></a>
+<span class="sourceLineNo">374</span> }<a name="line.374"></a>
+<span class="sourceLineNo">375</span> // Allow currently running replication tasks to finish<a name="line.375"></a>
+<span class="sourceLineNo">376</span> exec.shutdown();<a name="line.376"></a>
+<span class="sourceLineNo">377</span> try {<a name="line.377"></a>
+<span class="sourceLineNo">378</span> exec.awaitTermination(maxTerminationWait, TimeUnit.MILLISECONDS);<a name="line.378"></a>
+<span class="sourceLineNo">379</span> } catch (InterruptedException e) {<a name="line.379"></a>
+<span class="sourceLineNo">380</span> }<a name="line.380"></a>
+<span class="sourceLineNo">381</span> // Abort if the tasks did not terminate in time<a name="line.381"></a>
+<span class="sourceLineNo">382</span> if (!exec.isTerminated()) {<a name="line.382"></a>
+<span class="sourceLineNo">383</span> String errMsg = "HBaseInterClusterReplicationEndpoint termination failed. The " +<a name="line.383"></a>
+<span class="sourceLineNo">384</span> "ThreadPoolExecutor failed to finish all tasks within " + maxTerminationWait + "ms. " +<a name="line.384"></a>
+<span class="sourceLineNo">385</span> "Aborting to prevent Replication from deadlocking. See HBASE-16081.";<a name="line.385"></a>
+<span class="sourceLineNo">386</span> abortable.abort(errMsg, new IOException(errMsg));<a name="line.386"></a>
+<span class="sourceLineNo">387</span> }<a name="line.387"></a>
+<span class="sourceLineNo">388</span> notifyStopped();<a name="line.388"></a>
+<span class="sourceLineNo">389</span> }<a name="line.389"></a>
+<span class="sourceLineNo">390</span><a name="line.390"></a>
+<span class="sourceLineNo">391</span> // is this needed? Nobody else will call doStop() otherwise<a name="line.391"></a>
+<span class="sourceLineNo">392</span> @Override<a name="line.392"></a>
+<span class="sourceLineNo">393</span> public State stopAndWait() {<a name="line.393"></a>
+<span class="sourceLineNo">394</span> doStop();<a name="line.394"></a>
+<span class="sourceLineNo">395</span> return super.stopAndWait();<a name="line.395"></a>
+<span class="sourceLineNo">396</span> }<a name="line.396"></a>
+<span class="sourceLineNo">397</span><a name="line.397"></a>
+<span class="sourceLineNo">398</span> @VisibleForTesting<a name="line.398"></a>
+<span class="sourceLineNo">399</span> protected Replicator createReplicator(List<Entry> entries, int ordinal) {<a name="line.399"></a>
+<span class="sourceLineNo">400</span> return new Replicator(entries, ordinal);<a name="line.400"></a>
+<span class="sourceLineNo">401</span> }<a name="line.401"></a>
+<span class="sourceLineNo">402</span><a name="line.402"></a>
+<span class="sourceLineNo">403</span> @VisibleForTesting<a name="line.403"></a>
+<span class="sourceLineNo">404</span> protected class Replicator implements Callable<Integer> {<a name="line.404"></a>
+<span class="sourceLineNo">405</span> private List<Entry> entries;<a name="line.405"></a>
+<span class="sourceLineNo">406</span> private int ordinal;<a name="line.406"></a>
+<span class="sourceLineNo">407</span> public Replicator(List<Entry> entries, int ordinal) {<a name="line.407"></a>
+<span class="sourceLineNo">408</span> this.entries = entries;<a name="line.408"></a>
+<span class="sourceLineNo">409</span> this.ordinal = ordinal;<a name="line.409"></a>
+<span class="sourceLineNo">410</span> }<a name="line.410"></a>
+<span class="sourceLineNo">411</span><a name="line.411"></a>
+<span class="sourceLineNo">412</span> protected void replicateEntries(BlockingInterface rrs, final List<Entry> batch,<a name="line.412"></a>
+<span class="sourceLineNo">413</span> String replicationClusterId, Path baseNamespaceDir, Path hfileArchiveDir)<a name="line.413"></a>
+<span class="sourceLineNo">414</span> throws IOException {<a name="line.414"></a>
+<span class="sourceLineNo">415</span> if (LOG.isTraceEnabled()) {<a name="line.415"></a>
+<span class="sourceLineNo">416</span> long size = 0;<a name="line.416"></a>
+<span class="sourceLineNo">417</span> for (Entry e: entries) {<a name="line.417"></a>
+<span class="sourceLineNo">418</span> size += e.getKey().estimatedSerializedSizeOf();<a name="line.418"></a>
+<span class="sourceLineNo">419</span> size += e.getEdit().estimatedSerializedSizeOf();<a name="line.419"></a>
+<span class="sourceLineNo">420</span> }<a name="line.420"></a>
+<span class="sourceLineNo">421</span> LOG.trace("Replicating batch " + System.identityHashCode(entries) + " of " +<a name="line.421"></a>
+<span class="sourceLineNo">422</span> entries.size() + " entries with total size " + size + " bytes to " +<a name="line.422"></a>
+<span class="sourceLineNo">423</span> replicationClusterId);<a name="line.423"></a>
+<span class="sourceLineNo">424</span> }<a name="line.424"></a>
+<span class="sourceLineNo">425</span> try {<a name="line.425"></a>
+<span class="sourceLineNo">426</span> ReplicationProtbufUtil.replicateWALEntry(rrs, batch.toArray(new Entry[batch.size()]),<a name="line.426"></a>
+<span class="sourceLineNo">427</span> replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.427"></a>
+<span class="sourceLineNo">428</span> if (LOG.isTraceEnabled()) {<a name="line.428"></a>
+<span class="sourceLineNo">429</span> LOG.trace("Completed replicating batch " + System.identityHashCode(entries));<a name="line.429"></a>
+<span class="sourceLineNo">430</span> }<a name="line.430"></a>
+<span class="sourceLineNo">431</span> } catch (IOException e) {<a name="line.431"></a>
+<span class="sourceLineNo">432</span> if (LOG.isTraceEnabled()) {<a name="line.432"></a>
+<span class="sourceLineNo">433</span> LOG.trace("Failed replicating batch " + System.identityHashCode(entries), e);<a name="line.433"></a>
+<span class="sourceLineNo">434</span> }<a name="line.434"></a>
+<span class="sourceLineNo">435</span> throw e;<a name="line.435"></a>
+<span class="sourceLineNo">436</span> }<a name="line.436"></a>
+<span class="sourceLineNo">437</span> }<a name="line.437"></a>
+<span class="sourceLineNo">438</span><a name="line.438"></a>
+<span class="sourceLineNo">439</span> @Override<a name="line.439"></a>
+<span class="sourceLineNo">440</span> public Integer call() throws IOException {<a name="line.440"></a>
+<span class="sourceLineNo">441</span> SinkPeer sinkPeer = null;<a name="line.441"></a>
+<span class="sourceLineNo">442</span> try {<a name="line.442"></a>
+<span class="sourceLineNo">443</span> sinkPeer = replicationSinkMgr.getReplicationSink();<a name="line.443"></a>
+<span class="sourceLineNo">444</span> BlockingInterface rrs = sinkPeer.getRegionServer();<a name="line.444"></a>
+<span class="sourceLineNo">445</span> replicateEntries(rrs, entries, replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.445"></a>
+<span class="sourceLineNo">446</span> replicationSinkMgr.reportSinkSuccess(sinkPeer);<a name="line.446"></a>
+<span class="sourceLineNo">447</span> return ordinal;<a name="line.447"></a>
+<span class="sourceLineNo">448</span> } catch (IOException ioe) {<a name="line.448"></a>
+<span class="sourceLineNo">449</span> if (sinkPeer != null) {<a name="line.449"></a>
+<span class="sourceLineNo">450</span> replicationSinkMgr.reportBadSink(sinkPeer);<a name="line.450"></a>
+<span class="sourceLineNo">451</span> }<a name="line.451"></a>
+<span class="sourceLineNo">452</span> throw ioe;<a name="line.452"></a>
+<span class="sourceLineNo">453</span> }<a name="line.453"></a>
+<span class="sourceLineNo">454</span> }<a name="line.454"></a>
+<span class="sourceLineNo">455</span> }<a name="line.455"></a>
+<span class="sourceLineNo">456</span>}<a name="line.456"></a>
http://git-wip-us.apache.org/repos/asf/hbase-site/blob/7e7efd3e/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html
----------------------------------------------------------------------
diff --git a/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html b/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html
index 8ffd272..58b1653 100644
--- a/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html
+++ b/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html
@@ -233,220 +233,235 @@
<span class="sourceLineNo">225</span> return entryLists;<a name="line.225"></a>
<span class="sourceLineNo">226</span> }<a name="line.226"></a>
<span class="sourceLineNo">227</span><a name="line.227"></a>
-<span class="sourceLineNo">228</span> /**<a name="line.228"></a>
-<span class="sourceLineNo">229</span> * Do the shipping logic<a name="line.229"></a>
-<span class="sourceLineNo">230</span> */<a name="line.230"></a>
-<span class="sourceLineNo">231</span> @Override<a name="line.231"></a>
-<span class="sourceLineNo">232</span> public boolean replicate(ReplicateContext replicateContext) {<a name="line.232"></a>
-<span class="sourceLineNo">233</span> CompletionService<Integer> pool = new ExecutorCompletionService<>(this.exec);<a name="line.233"></a>
-<span class="sourceLineNo">234</span> List<List<Entry>> batches;<a name="line.234"></a>
-<span class="sourceLineNo">235</span> String walGroupId = replicateContext.getWalGroupId();<a name="line.235"></a>
-<span class="sourceLineNo">236</span> int sleepMultiplier = 1;<a name="line.236"></a>
-<span class="sourceLineNo">237</span><a name="line.237"></a>
-<span class="sourceLineNo">238</span> if (!peersSelected && this.isRunning()) {<a name="line.238"></a>
-<span class="sourceLineNo">239</span> connectToPeers();<a name="line.239"></a>
-<span class="sourceLineNo">240</span> peersSelected = true;<a name="line.240"></a>
-<span class="sourceLineNo">241</span> }<a name="line.241"></a>
-<span class="sourceLineNo">242</span><a name="line.242"></a>
-<span class="sourceLineNo">243</span> int numSinks = replicationSinkMgr.getNumSinks();<a name="line.243"></a>
-<span class="sourceLineNo">244</span> if (numSinks == 0) {<a name="line.244"></a>
-<span class="sourceLineNo">245</span> LOG.warn("No replication sinks found, returning without replicating. The source should retry"<a name="line.245"></a>
-<span class="sourceLineNo">246</span> + " with the same set of edits.");<a name="line.246"></a>
-<span class="sourceLineNo">247</span> return false;<a name="line.247"></a>
-<span class="sourceLineNo">248</span> }<a name="line.248"></a>
+<span class="sourceLineNo">228</span> private void reconnectToPeerCluster() {<a name="line.228"></a>
+<span class="sourceLineNo">229</span> ClusterConnection connection = null;<a name="line.229"></a>
+<span class="sourceLineNo">230</span> try {<a name="line.230"></a>
+<span class="sourceLineNo">231</span> connection = (ClusterConnection) ConnectionFactory.createConnection(this.conf);<a name="line.231"></a>
+<span class="sourceLineNo">232</span> } catch (IOException ioe) {<a name="line.232"></a>
+<span class="sourceLineNo">233</span> LOG.warn("Failed to create connection for peer cluster", ioe);<a name="line.233"></a>
+<span class="sourceLineNo">234</span> }<a name="line.234"></a>
+<span class="sourceLineNo">235</span> if (connection != null) {<a name="line.235"></a>
+<span class="sourceLineNo">236</span> this.conn = connection;<a name="line.236"></a>
+<span class="sourceLineNo">237</span> }<a name="line.237"></a>
+<span class="sourceLineNo">238</span> }<a name="line.238"></a>
+<span class="sourceLineNo">239</span><a name="line.239"></a>
+<span class="sourceLineNo">240</span> /**<a name="line.240"></a>
+<span class="sourceLineNo">241</span> * Do the shipping logic<a name="line.241"></a>
+<span class="sourceLineNo">242</span> */<a name="line.242"></a>
+<span class="sourceLineNo">243</span> @Override<a name="line.243"></a>
+<span class="sourceLineNo">244</span> public boolean replicate(ReplicateContext replicateContext) {<a name="line.244"></a>
+<span class="sourceLineNo">245</span> CompletionService<Integer> pool = new ExecutorCompletionService<>(this.exec);<a name="line.245"></a>
+<span class="sourceLineNo">246</span> List<List<Entry>> batches;<a name="line.246"></a>
+<span class="sourceLineNo">247</span> String walGroupId = replicateContext.getWalGroupId();<a name="line.247"></a>
+<span class="sourceLineNo">248</span> int sleepMultiplier = 1;<a name="line.248"></a>
<span class="sourceLineNo">249</span><a name="line.249"></a>
-<span class="sourceLineNo">250</span> batches = createBatches(replicateContext.getEntries());<a name="line.250"></a>
-<span class="sourceLineNo">251</span><a name="line.251"></a>
-<span class="sourceLineNo">252</span> while (this.isRunning() && !exec.isShutdown()) {<a name="line.252"></a>
-<span class="sourceLineNo">253</span> if (!isPeerEnabled()) {<a name="line.253"></a>
-<span class="sourceLineNo">254</span> if (sleepForRetries("Replication is disabled", sleepMultiplier)) {<a name="line.254"></a>
-<span class="sourceLineNo">255</span> sleepMultiplier++;<a name="line.255"></a>
-<span class="sourceLineNo">256</span> }<a name="line.256"></a>
-<span class="sourceLineNo">257</span> continue;<a name="line.257"></a>
-<span class="sourceLineNo">258</span> }<a name="line.258"></a>
-<span class="sourceLineNo">259</span> try {<a name="line.259"></a>
-<span class="sourceLineNo">260</span> int futures = 0;<a name="line.260"></a>
-<span class="sourceLineNo">261</span> for (int i=0; i<batches.size(); i++) {<a name="line.261"></a>
-<span class="sourceLineNo">262</span> List<Entry> entries = batches.get(i);<a name="line.262"></a>
-<span class="sourceLineNo">263</span> if (!entries.isEmpty()) {<a name="line.263"></a>
-<span class="sourceLineNo">264</span> if (LOG.isTraceEnabled()) {<a name="line.264"></a>
-<span class="sourceLineNo">265</span> LOG.trace("Submitting " + entries.size() +<a name="line.265"></a>
-<span class="sourceLineNo">266</span> " entries of total size " + replicateContext.getSize());<a name="line.266"></a>
-<span class="sourceLineNo">267</span> }<a name="line.267"></a>
-<span class="sourceLineNo">268</span> // RuntimeExceptions encountered here bubble up and are handled in ReplicationSource<a name="line.268"></a>
-<span class="sourceLineNo">269</span> pool.submit(createReplicator(entries, i));<a name="line.269"></a>
-<span class="sourceLineNo">270</span> futures++;<a name="line.270"></a>
-<span class="sourceLineNo">271</span> }<a name="line.271"></a>
-<span class="sourceLineNo">272</span> }<a name="line.272"></a>
-<span class="sourceLineNo">273</span> IOException iox = null;<a name="line.273"></a>
-<span class="sourceLineNo">274</span><a name="line.274"></a>
-<span class="sourceLineNo">275</span> long lastWriteTime = 0;<a name="line.275"></a>
-<span class="sourceLineNo">276</span> for (int i=0; i<futures; i++) {<a name="line.276"></a>
-<span class="sourceLineNo">277</span> try {<a name="line.277"></a>
-<span class="sourceLineNo">278</span> // wait for all futures, remove successful parts<a name="line.278"></a>
-<span class="sourceLineNo">279</span> // (only the remaining parts will be retried)<a name="line.279"></a>
-<span class="sourceLineNo">280</span> Future<Integer> f = pool.take();<a name="line.280"></a>
-<span class="sourceLineNo">281</span> int index = f.get().intValue();<a name="line.281"></a>
-<span class="sourceLineNo">282</span> List<Entry> batch = batches.get(index);<a name="line.282"></a>
-<span class="sourceLineNo">283</span> batches.set(index, Collections.<Entry>emptyList()); // remove successful batch<a name="line.283"></a>
-<span class="sourceLineNo">284</span> // Find the most recent write time in the batch<a name="line.284"></a>
-<span class="sourceLineNo">285</span> long writeTime = batch.get(batch.size() - 1).getKey().getWriteTime();<a name="line.285"></a>
-<span class="sourceLineNo">286</span> if (writeTime > lastWriteTime) {<a name="line.286"></a>
-<span class="sourceLineNo">287</span> lastWriteTime = writeTime;<a name="line.287"></a>
-<span class="sourceLineNo">288</span> }<a name="line.288"></a>
-<span class="sourceLineNo">289</span> } catch (InterruptedException ie) {<a name="line.289"></a>
-<span class="sourceLineNo">290</span> iox = new IOException(ie);<a name="line.290"></a>
-<span class="sourceLineNo">291</span> } catch (ExecutionException ee) {<a name="line.291"></a>
-<span class="sourceLineNo">292</span> // cause must be an IOException<a name="line.292"></a>
-<span class="sourceLineNo">293</span> iox = (IOException)ee.getCause();<a name="line.293"></a>
-<span class="sourceLineNo">294</span> }<a name="line.294"></a>
-<span class="sourceLineNo">295</span> }<a name="line.295"></a>
-<span class="sourceLineNo">296</span> if (iox != null) {<a name="line.296"></a>
-<span class="sourceLineNo">297</span> // if we had any exceptions, try again<a name="line.297"></a>
-<span class="sourceLineNo">298</span> throw iox;<a name="line.298"></a>
-<span class="sourceLineNo">299</span> }<a name="line.299"></a>
-<span class="sourceLineNo">300</span> // update metrics<a name="line.300"></a>
-<span class="sourceLineNo">301</span> if (lastWriteTime > 0) {<a name="line.301"></a>
-<span class="sourceLineNo">302</span> this.metrics.setAgeOfLastShippedOp(lastWriteTime, walGroupId);<a name="line.302"></a>
-<span class="sourceLineNo">303</span> }<a name="line.303"></a>
-<span class="sourceLineNo">304</span> return true;<a name="line.304"></a>
-<span class="sourceLineNo">305</span><a name="line.305"></a>
-<span class="sourceLineNo">306</span> } catch (IOException ioe) {<a name="line.306"></a>
-<span class="sourceLineNo">307</span> // Didn't ship anything, but must still age the last time we did<a name="line.307"></a>
-<span class="sourceLineNo">308</span> this.metrics.refreshAgeOfLastShippedOp(walGroupId);<a name="line.308"></a>
-<span class="sourceLineNo">309</span> if (ioe instanceof RemoteException) {<a name="line.309"></a>
-<span class="sourceLineNo">310</span> ioe = ((RemoteException) ioe).unwrapRemoteException();<a name="line.310"></a>
-<span class="sourceLineNo">311</span> LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);<a name="line.311"></a>
-<span class="sourceLineNo">312</span> if (ioe instanceof TableNotFoundException) {<a name="line.312"></a>
-<span class="sourceLineNo">313</span> if (sleepForRetries("A table is missing in the peer cluster. "<a name="line.313"></a>
-<span class="sourceLineNo">314</span> + "Replication cannot proceed without losing data.", sleepMultiplier)) {<a name="line.314"></a>
-<span class="sourceLineNo">315</span> sleepMultiplier++;<a name="line.315"></a>
-<span class="sourceLineNo">316</span> }<a name="line.316"></a>
-<span class="sourceLineNo">317</span> } else {<a name="line.317"></a>
-<span class="sourceLineNo">318</span> LOG.warn("Peer encountered RemoteException, rechecking all sinks: ", ioe);<a name="line.318"></a>
-<span class="sourceLineNo">319</span> replicationSinkMgr.chooseSinks();<a name="line.319"></a>
-<span class="sourceLineNo">320</span> }<a name="line.320"></a>
-<span class="sourceLineNo">321</span> } else {<a name="line.321"></a>
-<span class="sourceLineNo">322</span> if (ioe instanceof SocketTimeoutException) {<a name="line.322"></a>
-<span class="sourceLineNo">323</span> // This exception means we waited for more than 60s and nothing<a name="line.323"></a>
-<span class="sourceLineNo">324</span> // happened, the cluster is alive and calling it right away<a name="line.324"></a>
-<span class="sourceLineNo">325</span> // even for a test just makes things worse.<a name="line.325"></a>
-<span class="sourceLineNo">326</span> sleepForRetries("Encountered a SocketTimeoutException. Since the " +<a name="line.326"></a>
-<span class="sourceLineNo">327</span> "call to the remote cluster timed out, which is usually " +<a name="line.327"></a>
-<span class="sourceLineNo">328</span> "caused by a machine failure or a massive slowdown",<a name="line.328"></a>
-<span class="sourceLineNo">329</span> this.socketTimeoutMultiplier);<a name="line.329"></a>
-<span class="sourceLineNo">330</span> } else if (ioe instanceof ConnectException) {<a name="line.330"></a>
-<span class="sourceLineNo">331</span> LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);<a name="line.331"></a>
-<span class="sourceLineNo">332</span> replicationSinkMgr.chooseSinks();<a name="line.332"></a>
-<span class="sourceLineNo">333</span> } else {<a name="line.333"></a>
-<span class="sourceLineNo">334</span> LOG.warn("Can't replicate because of a local or network error: ", ioe);<a name="line.334"></a>
+<span class="sourceLineNo">250</span> if (!peersSelected && this.isRunning()) {<a name="line.250"></a>
+<span class="sourceLineNo">251</span> connectToPeers();<a name="line.251"></a>
+<span class="sourceLineNo">252</span> peersSelected = true;<a name="line.252"></a>
+<span class="sourceLineNo">253</span> }<a name="line.253"></a>
+<span class="sourceLineNo">254</span><a name="line.254"></a>
+<span class="sourceLineNo">255</span> int numSinks = replicationSinkMgr.getNumSinks();<a name="line.255"></a>
+<span class="sourceLineNo">256</span> if (numSinks == 0) {<a name="line.256"></a>
+<span class="sourceLineNo">257</span> LOG.warn("No replication sinks found, returning without replicating. The source should retry"<a name="line.257"></a>
+<span class="sourceLineNo">258</span> + " with the same set of edits.");<a name="line.258"></a>
+<span class="sourceLineNo">259</span> return false;<a name="line.259"></a>
+<span class="sourceLineNo">260</span> }<a name="line.260"></a>
+<span class="sourceLineNo">261</span><a name="line.261"></a>
+<span class="sourceLineNo">262</span> batches = createBatches(replicateContext.getEntries());<a name="line.262"></a>
+<span class="sourceLineNo">263</span><a name="line.263"></a>
+<span class="sourceLineNo">264</span> while (this.isRunning() && !exec.isShutdown()) {<a name="line.264"></a>
+<span class="sourceLineNo">265</span> if (!isPeerEnabled()) {<a name="line.265"></a>
+<span class="sourceLineNo">266</span> if (sleepForRetries("Replication is disabled", sleepMultiplier)) {<a name="line.266"></a>
+<span class="sourceLineNo">267</span> sleepMultiplier++;<a name="line.267"></a>
+<span class="sourceLineNo">268</span> }<a name="line.268"></a>
+<span class="sourceLineNo">269</span> continue;<a name="line.269"></a>
+<span class="sourceLineNo">270</span> }<a name="line.270"></a>
+<span class="sourceLineNo">271</span> if (this.conn == null || this.conn.isClosed()) {<a name="line.271"></a>
+<span class="sourceLineNo">272</span> reconnectToPeerCluster();<a name="line.272"></a>
+<span class="sourceLineNo">273</span> }<a name="line.273"></a>
+<span class="sourceLineNo">274</span> try {<a name="line.274"></a>
+<span class="sourceLineNo">275</span> int futures = 0;<a name="line.275"></a>
+<span class="sourceLineNo">276</span> for (int i=0; i<batches.size(); i++) {<a name="line.276"></a>
+<span class="sourceLineNo">277</span> List<Entry> entries = batches.get(i);<a name="line.277"></a>
+<span class="sourceLineNo">278</span> if (!entries.isEmpty()) {<a name="line.278"></a>
+<span class="sourceLineNo">279</span> if (LOG.isTraceEnabled()) {<a name="line.279"></a>
+<span class="sourceLineNo">280</span> LOG.trace("Submitting " + entries.size() +<a name="line.280"></a>
+<span class="sourceLineNo">281</span> " entries of total size " + replicateContext.getSize());<a name="line.281"></a>
+<span class="sourceLineNo">282</span> }<a name="line.282"></a>
+<span class="sourceLineNo">283</span> // RuntimeExceptions encountered here bubble up and are handled in ReplicationSource<a name="line.283"></a>
+<span class="sourceLineNo">284</span> pool.submit(createReplicator(entries, i));<a name="line.284"></a>
+<span class="sourceLineNo">285</span> futures++;<a name="line.285"></a>
+<span class="sourceLineNo">286</span> }<a name="line.286"></a>
+<span class="sourceLineNo">287</span> }<a name="line.287"></a>
+<span class="sourceLineNo">288</span> IOException iox = null;<a name="line.288"></a>
+<span class="sourceLineNo">289</span><a name="line.289"></a>
+<span class="sourceLineNo">290</span> long lastWriteTime = 0;<a name="line.290"></a>
+<span class="sourceLineNo">291</span> for (int i=0; i<futures; i++) {<a name="line.291"></a>
+<span class="sourceLineNo">292</span> try {<a name="line.292"></a>
+<span class="sourceLineNo">293</span> // wait for all futures, remove successful parts<a name="line.293"></a>
+<span class="sourceLineNo">294</span> // (only the remaining parts will be retried)<a name="line.294"></a>
+<span class="sourceLineNo">295</span> Future<Integer> f = pool.take();<a name="line.295"></a>
+<span class="sourceLineNo">296</span> int index = f.get().intValue();<a name="line.296"></a>
+<span class="sourceLineNo">297</span> List<Entry> batch = batches.get(index);<a name="line.297"></a>
+<span class="sourceLineNo">298</span> batches.set(index, Collections.<Entry>emptyList()); // remove successful batch<a name="line.298"></a>
+<span class="sourceLineNo">299</span> // Find the most recent write time in the batch<a name="line.299"></a>
+<span class="sourceLineNo">300</span> long writeTime = batch.get(batch.size() - 1).getKey().getWriteTime();<a name="line.300"></a>
+<span class="sourceLineNo">301</span> if (writeTime > lastWriteTime) {<a name="line.301"></a>
+<span class="sourceLineNo">302</span> lastWriteTime = writeTime;<a name="line.302"></a>
+<span class="sourceLineNo">303</span> }<a name="line.303"></a>
+<span class="sourceLineNo">304</span> } catch (InterruptedException ie) {<a name="line.304"></a>
+<span class="sourceLineNo">305</span> iox = new IOException(ie);<a name="line.305"></a>
+<span class="sourceLineNo">306</span> } catch (ExecutionException ee) {<a name="line.306"></a>
+<span class="sourceLineNo">307</span> // cause must be an IOException<a name="line.307"></a>
+<span class="sourceLineNo">308</span> iox = (IOException)ee.getCause();<a name="line.308"></a>
+<span class="sourceLineNo">309</span> }<a name="line.309"></a>
+<span class="sourceLineNo">310</span> }<a name="line.310"></a>
+<span class="sourceLineNo">311</span> if (iox != null) {<a name="line.311"></a>
+<span class="sourceLineNo">312</span> // if we had any exceptions, try again<a name="line.312"></a>
+<span class="sourceLineNo">313</span> throw iox;<a name="line.313"></a>
+<span class="sourceLineNo">314</span> }<a name="line.314"></a>
+<span class="sourceLineNo">315</span> // update metrics<a name="line.315"></a>
+<span class="sourceLineNo">316</span> if (lastWriteTime > 0) {<a name="line.316"></a>
+<span class="sourceLineNo">317</span> this.metrics.setAgeOfLastShippedOp(lastWriteTime, walGroupId);<a name="line.317"></a>
+<span class="sourceLineNo">318</span> }<a name="line.318"></a>
+<span class="sourceLineNo">319</span> return true;<a name="line.319"></a>
+<span class="sourceLineNo">320</span><a name="line.320"></a>
+<span class="sourceLineNo">321</span> } catch (IOException ioe) {<a name="line.321"></a>
+<span class="sourceLineNo">322</span> // Didn't ship anything, but must still age the last time we did<a name="line.322"></a>
+<span class="sourceLineNo">323</span> this.metrics.refreshAgeOfLastShippedOp(walGroupId);<a name="line.323"></a>
+<span class="sourceLineNo">324</span> if (ioe instanceof RemoteException) {<a name="line.324"></a>
+<span class="sourceLineNo">325</span> ioe = ((RemoteException) ioe).unwrapRemoteException();<a name="line.325"></a>
+<span class="sourceLineNo">326</span> LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);<a name="line.326"></a>
+<span class="sourceLineNo">327</span> if (ioe instanceof TableNotFoundException) {<a name="line.327"></a>
+<span class="sourceLineNo">328</span> if (sleepForRetries("A table is missing in the peer cluster. "<a name="line.328"></a>
+<span class="sourceLineNo">329</span> + "Replication cannot proceed without losing data.", sleepMultiplier)) {<a name="line.329"></a>
+<span class="sourceLineNo">330</span> sleepMultiplier++;<a name="line.330"></a>
+<span class="sourceLineNo">331</span> }<a name="line.331"></a>
+<span class="sourceLineNo">332</span> } else {<a name="line.332"></a>
+<span class="sourceLineNo">333</span> LOG.warn("Peer encountered RemoteException, rechecking all sinks: ", ioe);<a name="line.333"></a>
+<span class="sourceLineNo">334</span> replicationSinkMgr.chooseSinks();<a name="line.334"></a>
<span class="sourceLineNo">335</span> }<a name="line.335"></a>
-<span class="sourceLineNo">336</span> }<a name="line.336"></a>
-<span class="sourceLineNo">337</span> if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {<a name="line.337"></a>
-<span class="sourceLineNo">338</span> sleepMultiplier++;<a name="line.338"></a>
-<span class="sourceLineNo">339</span> }<a name="line.339"></a>
-<span class="sourceLineNo">340</span> }<a name="line.340"></a>
-<span class="sourceLineNo">341</span> }<a name="line.341"></a>
-<span class="sourceLineNo">342</span> return false; // in case we exited before replicating<a name="line.342"></a>
-<span class="sourceLineNo">343</span> }<a name="line.343"></a>
-<span class="sourceLineNo">344</span><a name="line.344"></a>
-<span class="sourceLineNo">345</span> protected boolean isPeerEnabled() {<a name="line.345"></a>
-<span class="sourceLineNo">346</span> return ctx.getReplicationPeer().getPeerState() == PeerState.ENABLED;<a name="line.346"></a>
-<span class="sourceLineNo">347</span> }<a name="line.347"></a>
-<span class="sourceLineNo">348</span><a name="line.348"></a>
-<span class="sourceLineNo">349</span> @Override<a name="line.349"></a>
-<span class="sourceLineNo">350</span> protected void doStop() {<a name="line.350"></a>
-<span class="sourceLineNo">351</span> disconnect(); //don't call super.doStop()<a name="line.351"></a>
-<span class="sourceLineNo">352</span> if (this.conn != null) {<a name="line.352"></a>
-<span class="sourceLineNo">353</span> try {<a name="line.353"></a>
-<span class="sourceLineNo">354</span> this.conn.close();<a name="line.354"></a>
-<span class="sourceLineNo">355</span> this.conn = null;<a name="line.355"></a>
-<span class="sourceLineNo">356</span> } catch (IOException e) {<a name="line.356"></a>
-<span class="sourceLineNo">357</span> LOG.warn("Failed to close the connection");<a name="line.357"></a>
-<span class="sourceLineNo">358</span> }<a name="line.358"></a>
-<span class="sourceLineNo">359</span> }<a name="line.359"></a>
-<span class="sourceLineNo">360</span> // Allow currently running replication tasks to finish<a name="line.360"></a>
-<span class="sourceLineNo">361</span> exec.shutdown();<a name="line.361"></a>
-<span class="sourceLineNo">362</span> try {<a name="line.362"></a>
-<span class="sourceLineNo">363</span> exec.awaitTermination(maxTerminationWait, TimeUnit.MILLISECONDS);<a name="line.363"></a>
-<span class="sourceLineNo">364</span> } catch (InterruptedException e) {<a name="line.364"></a>
-<span class="sourceLineNo">365</span> }<a name="line.365"></a>
-<span class="sourceLineNo">366</span> // Abort if the tasks did not terminate in time<a name="line.366"></a>
-<span class="sourceLineNo">367</span> if (!exec.isTerminated()) {<a name="line.367"></a>
-<span class="sourceLineNo">368</span> String errMsg = "HBaseInterClusterReplicationEndpoint termination failed. The " +<a name="line.368"></a>
-<span class="sourceLineNo">369</span> "ThreadPoolExecutor failed to finish all tasks within " + maxTerminationWait + "ms. " +<a name="line.369"></a>
-<span class="sourceLineNo">370</span> "Aborting to prevent Replication from deadlocking. See HBASE-16081.";<a name="line.370"></a>
-<span class="sourceLineNo">371</span> abortable.abort(errMsg, new IOException(errMsg));<a name="line.371"></a>
-<span class="sourceLineNo">372</span> }<a name="line.372"></a>
-<span class="sourceLineNo">373</span> notifyStopped();<a name="line.373"></a>
-<span class="sourceLineNo">374</span> }<a name="line.374"></a>
-<span class="sourceLineNo">375</span><a name="line.375"></a>
-<span class="sourceLineNo">376</span> // is this needed? Nobody else will call doStop() otherwise<a name="line.376"></a>
-<span class="sourceLineNo">377</span> @Override<a name="line.377"></a>
-<span class="sourceLineNo">378</span> public State stopAndWait() {<a name="line.378"></a>
-<span class="sourceLineNo">379</span> doStop();<a name="line.379"></a>
-<span class="sourceLineNo">380</span> return super.stopAndWait();<a name="line.380"></a>
-<span class="sourceLineNo">381</span> }<a name="line.381"></a>
-<span class="sourceLineNo">382</span><a name="line.382"></a>
-<span class="sourceLineNo">383</span> @VisibleForTesting<a name="line.383"></a>
-<span class="sourceLineNo">384</span> protected Replicator createReplicator(List<Entry> entries, int ordinal) {<a name="line.384"></a>
-<span class="sourceLineNo">385</span> return new Replicator(entries, ordinal);<a name="line.385"></a>
-<span class="sourceLineNo">386</span> }<a name="line.386"></a>
-<span class="sourceLineNo">387</span><a name="line.387"></a>
-<span class="sourceLineNo">388</span> @VisibleForTesting<a name="line.388"></a>
-<span class="sourceLineNo">389</span> protected class Replicator implements Callable<Integer> {<a name="line.389"></a>
-<span class="sourceLineNo">390</span> private List<Entry> entries;<a name="line.390"></a>
-<span class="sourceLineNo">391</span> private int ordinal;<a name="line.391"></a>
-<span class="sourceLineNo">392</span> public Replicator(List<Entry> entries, int ordinal) {<a name="line.392"></a>
-<span class="sourceLineNo">393</span> this.entries = entries;<a name="line.393"></a>
-<span class="sourceLineNo">394</span> this.ordinal = ordinal;<a name="line.394"></a>
-<span class="sourceLineNo">395</span> }<a name="line.395"></a>
-<span class="sourceLineNo">396</span><a name="line.396"></a>
-<span class="sourceLineNo">397</span> protected void replicateEntries(BlockingInterface rrs, final List<Entry> batch,<a name="line.397"></a>
-<span class="sourceLineNo">398</span> String replicationClusterId, Path baseNamespaceDir, Path hfileArchiveDir)<a name="line.398"></a>
-<span class="sourceLineNo">399</span> throws IOException {<a name="line.399"></a>
-<span class="sourceLineNo">400</span> if (LOG.isTraceEnabled()) {<a name="line.400"></a>
-<span class="sourceLineNo">401</span> long size = 0;<a name="line.401"></a>
-<span class="sourceLineNo">402</span> for (Entry e: entries) {<a name="line.402"></a>
-<span class="sourceLineNo">403</span> size += e.getKey().estimatedSerializedSizeOf();<a name="line.403"></a>
-<span class="sourceLineNo">404</span> size += e.getEdit().estimatedSerializedSizeOf();<a name="line.404"></a>
-<span class="sourceLineNo">405</span> }<a name="line.405"></a>
-<span class="sourceLineNo">406</span> LOG.trace("Replicating batch " + System.identityHashCode(entries) + " of " +<a name="line.406"></a>
-<span class="sourceLineNo">407</span> entries.size() + " entries with total size " + size + " bytes to " +<a name="line.407"></a>
-<span class="sourceLineNo">408</span> replicationClusterId);<a name="line.408"></a>
-<span class="sourceLineNo">409</span> }<a name="line.409"></a>
-<span class="sourceLineNo">410</span> try {<a name="line.410"></a>
-<span class="sourceLineNo">411</span> ReplicationProtbufUtil.replicateWALEntry(rrs, batch.toArray(new Entry[batch.size()]),<a name="line.411"></a>
-<span class="sourceLineNo">412</span> replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.412"></a>
-<span class="sourceLineNo">413</span> if (LOG.isTraceEnabled()) {<a name="line.413"></a>
-<span class="sourceLineNo">414</span> LOG.trace("Completed replicating batch " + System.identityHashCode(entries));<a name="line.414"></a>
-<span class="sourceLineNo">415</span> }<a name="line.415"></a>
-<span class="sourceLineNo">416</span> } catch (IOException e) {<a name="line.416"></a>
-<span class="sourceLineNo">417</span> if (LOG.isTraceEnabled()) {<a name="line.417"></a>
-<span class="sourceLineNo">418</span> LOG.trace("Failed replicating batch " + System.identityHashCode(entries), e);<a name="line.418"></a>
-<span class="sourceLineNo">419</span> }<a name="line.419"></a>
-<span class="sourceLineNo">420</span> throw e;<a name="line.420"></a>
-<span class="sourceLineNo">421</span> }<a name="line.421"></a>
-<span class="sourceLineNo">422</span> }<a name="line.422"></a>
-<span class="sourceLineNo">423</span><a name="line.423"></a>
-<span class="sourceLineNo">424</span> @Override<a name="line.424"></a>
-<span class="sourceLineNo">425</span> public Integer call() throws IOException {<a name="line.425"></a>
-<span class="sourceLineNo">426</span> SinkPeer sinkPeer = null;<a name="line.426"></a>
-<span class="sourceLineNo">427</span> try {<a name="line.427"></a>
-<span class="sourceLineNo">428</span> sinkPeer = replicationSinkMgr.getReplicationSink();<a name="line.428"></a>
-<span class="sourceLineNo">429</span> BlockingInterface rrs = sinkPeer.getRegionServer();<a name="line.429"></a>
-<span class="sourceLineNo">430</span> replicateEntries(rrs, entries, replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.430"></a>
-<span class="sourceLineNo">431</span> replicationSinkMgr.reportSinkSuccess(sinkPeer);<a name="line.431"></a>
-<span class="sourceLineNo">432</span> return ordinal;<a name="line.432"></a>
-<span class="sourceLineNo">433</span> } catch (IOException ioe) {<a name="line.433"></a>
-<span class="sourceLineNo">434</span> if (sinkPeer != null) {<a name="line.434"></a>
-<span class="sourceLineNo">435</span> replicationSinkMgr.reportBadSink(sinkPeer);<a name="line.435"></a>
-<span class="sourceLineNo">436</span> }<a name="line.436"></a>
-<span class="sourceLineNo">437</span> throw ioe;<a name="line.437"></a>
-<span class="sourceLineNo">438</span> }<a name="line.438"></a>
-<span class="sourceLineNo">439</span> }<a name="line.439"></a>
-<span class="sourceLineNo">440</span> }<a name="line.440"></a>
-<span class="sourceLineNo">441</span>}<a name="line.441"></a>
+<span class="sourceLineNo">336</span> } else {<a name="line.336"></a>
+<span class="sourceLineNo">337</span> if (ioe instanceof SocketTimeoutException) {<a name="line.337"></a>
+<span class="sourceLineNo">338</span> // This exception means we waited for more than 60s and nothing<a name="line.338"></a>
+<span class="sourceLineNo">339</span> // happened, the cluster is alive and calling it right away<a name="line.339"></a>
+<span class="sourceLineNo">340</span> // even for a test just makes things worse.<a name="line.340"></a>
+<span class="sourceLineNo">341</span> sleepForRetries("Encountered a SocketTimeoutException. Since the " +<a name="line.341"></a>
+<span class="sourceLineNo">342</span> "call to the remote cluster timed out, which is usually " +<a name="line.342"></a>
+<span class="sourceLineNo">343</span> "caused by a machine failure or a massive slowdown",<a name="line.343"></a>
+<span class="sourceLineNo">344</span> this.socketTimeoutMultiplier);<a name="line.344"></a>
+<span class="sourceLineNo">345</span> } else if (ioe instanceof ConnectException) {<a name="line.345"></a>
+<span class="sourceLineNo">346</span> LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);<a name="line.346"></a>
+<span class="sourceLineNo">347</span> replicationSinkMgr.chooseSinks();<a name="line.347"></a>
+<span class="sourceLineNo">348</span> } else {<a name="line.348"></a>
+<span class="sourceLineNo">349</span> LOG.warn("Can't replicate because of a local or network error: ", ioe);<a name="line.349"></a>
+<span class="sourceLineNo">350</span> }<a name="line.350"></a>
+<span class="sourceLineNo">351</span> }<a name="line.351"></a>
+<span class="sourceLineNo">352</span> if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {<a name="line.352"></a>
+<span class="sourceLineNo">353</span> sleepMultiplier++;<a name="line.353"></a>
+<span class="sourceLineNo">354</span> }<a name="line.354"></a>
+<span class="sourceLineNo">355</span> }<a name="line.355"></a>
+<span class="sourceLineNo">356</span> }<a name="line.356"></a>
+<span class="sourceLineNo">357</span> return false; // in case we exited before replicating<a name="line.357"></a>
+<span class="sourceLineNo">358</span> }<a name="line.358"></a>
+<span class="sourceLineNo">359</span><a name="line.359"></a>
+<span class="sourceLineNo">360</span> protected boolean isPeerEnabled() {<a name="line.360"></a>
+<span class="sourceLineNo">361</span> return ctx.getReplicationPeer().getPeerState() == PeerState.ENABLED;<a name="line.361"></a>
+<span class="sourceLineNo">362</span> }<a name="line.362"></a>
+<span class="sourceLineNo">363</span><a name="line.363"></a>
+<span class="sourceLineNo">364</span> @Override<a name="line.364"></a>
+<span class="sourceLineNo">365</span> protected void doStop() {<a name="line.365"></a>
+<span class="sourceLineNo">366</span> disconnect(); //don't call super.doStop()<a name="line.366"></a>
+<span class="sourceLineNo">367</span> if (this.conn != null) {<a name="line.367"></a>
+<span class="sourceLineNo">368</span> try {<a name="line.368"></a>
+<span class="sourceLineNo">369</span> this.conn.close();<a name="line.369"></a>
+<span class="sourceLineNo">370</span> this.conn = null;<a name="line.370"></a>
+<span class="sourceLineNo">371</span> } catch (IOException e) {<a name="line.371"></a>
+<span class="sourceLineNo">372</span> LOG.warn("Failed to close the connection");<a name="line.372"></a>
+<span class="sourceLineNo">373</span> }<a name="line.373"></a>
+<span class="sourceLineNo">374</span> }<a name="line.374"></a>
+<span class="sourceLineNo">375</span> // Allow currently running replication tasks to finish<a name="line.375"></a>
+<span class="sourceLineNo">376</span> exec.shutdown();<a name="line.376"></a>
+<span class="sourceLineNo">377</span> try {<a name="line.377"></a>
+<span class="sourceLineNo">378</span> exec.awaitTermination(maxTerminationWait, TimeUnit.MILLISECONDS);<a name="line.378"></a>
+<span class="sourceLineNo">379</span> } catch (InterruptedException e) {<a name="line.379"></a>
+<span class="sourceLineNo">380</span> }<a name="line.380"></a>
+<span class="sourceLineNo">381</span> // Abort if the tasks did not terminate in time<a name="line.381"></a>
+<span class="sourceLineNo">382</span> if (!exec.isTerminated()) {<a name="line.382"></a>
+<span class="sourceLineNo">383</span> String errMsg = "HBaseInterClusterReplicationEndpoint termination failed. The " +<a name="line.383"></a>
+<span class="sourceLineNo">384</span> "ThreadPoolExecutor failed to finish all tasks within " + maxTerminationWait + "ms. " +<a name="line.384"></a>
+<span class="sourceLineNo">385</span> "Aborting to prevent Replication from deadlocking. See HBASE-16081.";<a name="line.385"></a>
+<span class="sourceLineNo">386</span> abortable.abort(errMsg, new IOException(errMsg));<a name="line.386"></a>
+<span class="sourceLineNo">387</span> }<a name="line.387"></a>
+<span class="sourceLineNo">388</span> notifyStopped();<a name="line.388"></a>
+<span class="sourceLineNo">389</span> }<a name="line.389"></a>
+<span class="sourceLineNo">390</span><a name="line.390"></a>
+<span class="sourceLineNo">391</span> // is this needed? Nobody else will call doStop() otherwise<a name="line.391"></a>
+<span class="sourceLineNo">392</span> @Override<a name="line.392"></a>
+<span class="sourceLineNo">393</span> public State stopAndWait() {<a name="line.393"></a>
+<span class="sourceLineNo">394</span> doStop();<a name="line.394"></a>
+<span class="sourceLineNo">395</span> return super.stopAndWait();<a name="line.395"></a>
+<span class="sourceLineNo">396</span> }<a name="line.396"></a>
+<span class="sourceLineNo">397</span><a name="line.397"></a>
+<span class="sourceLineNo">398</span> @VisibleForTesting<a name="line.398"></a>
+<span class="sourceLineNo">399</span> protected Replicator createReplicator(List<Entry> entries, int ordinal) {<a name="line.399"></a>
+<span class="sourceLineNo">400</span> return new Replicator(entries, ordinal);<a name="line.400"></a>
+<span class="sourceLineNo">401</span> }<a name="line.401"></a>
+<span class="sourceLineNo">402</span><a name="line.402"></a>
+<span class="sourceLineNo">403</span> @VisibleForTesting<a name="line.403"></a>
+<span class="sourceLineNo">404</span> protected class Replicator implements Callable<Integer> {<a name="line.404"></a>
+<span class="sourceLineNo">405</span> private List<Entry> entries;<a name="line.405"></a>
+<span class="sourceLineNo">406</span> private int ordinal;<a name="line.406"></a>
+<span class="sourceLineNo">407</span> public Replicator(List<Entry> entries, int ordinal) {<a name="line.407"></a>
+<span class="sourceLineNo">408</span> this.entries = entries;<a name="line.408"></a>
+<span class="sourceLineNo">409</span> this.ordinal = ordinal;<a name="line.409"></a>
+<span class="sourceLineNo">410</span> }<a name="line.410"></a>
+<span class="sourceLineNo">411</span><a name="line.411"></a>
+<span class="sourceLineNo">412</span> protected void replicateEntries(BlockingInterface rrs, final List<Entry> batch,<a name="line.412"></a>
+<span class="sourceLineNo">413</span> String replicationClusterId, Path baseNamespaceDir, Path hfileArchiveDir)<a name="line.413"></a>
+<span class="sourceLineNo">414</span> throws IOException {<a name="line.414"></a>
+<span class="sourceLineNo">415</span> if (LOG.isTraceEnabled()) {<a name="line.415"></a>
+<span class="sourceLineNo">416</span> long size = 0;<a name="line.416"></a>
+<span class="sourceLineNo">417</span> for (Entry e: entries) {<a name="line.417"></a>
+<span class="sourceLineNo">418</span> size += e.getKey().estimatedSerializedSizeOf();<a name="line.418"></a>
+<span class="sourceLineNo">419</span> size += e.getEdit().estimatedSerializedSizeOf();<a name="line.419"></a>
+<span class="sourceLineNo">420</span> }<a name="line.420"></a>
+<span class="sourceLineNo">421</span> LOG.trace("Replicating batch " + System.identityHashCode(entries) + " of " +<a name="line.421"></a>
+<span class="sourceLineNo">422</span> entries.size() + " entries with total size " + size + " bytes to " +<a name="line.422"></a>
+<span class="sourceLineNo">423</span> replicationClusterId);<a name="line.423"></a>
+<span class="sourceLineNo">424</span> }<a name="line.424"></a>
+<span class="sourceLineNo">425</span> try {<a name="line.425"></a>
+<span class="sourceLineNo">426</span> ReplicationProtbufUtil.replicateWALEntry(rrs, batch.toArray(new Entry[batch.size()]),<a name="line.426"></a>
+<span class="sourceLineNo">427</span> replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.427"></a>
+<span class="sourceLineNo">428</span> if (LOG.isTraceEnabled()) {<a name="line.428"></a>
+<span class="sourceLineNo">429</span> LOG.trace("Completed replicating batch " + System.identityHashCode(entries));<a name="line.429"></a>
+<span class="sourceLineNo">430</span> }<a name="line.430"></a>
+<span class="sourceLineNo">431</span> } catch (IOException e) {<a name="line.431"></a>
+<span class="sourceLineNo">432</span> if (LOG.isTraceEnabled()) {<a name="line.432"></a>
+<span class="sourceLineNo">433</span> LOG.trace("Failed replicating batch " + System.identityHashCode(entries), e);<a name="line.433"></a>
+<span class="sourceLineNo">434</span> }<a name="line.434"></a>
+<span class="sourceLineNo">435</span> throw e;<a name="line.435"></a>
+<span class="sourceLineNo">436</span> }<a name="line.436"></a>
+<span class="sourceLineNo">437</span> }<a name="line.437"></a>
+<span class="sourceLineNo">438</span><a name="line.438"></a>
+<span class="sourceLineNo">439</span> @Override<a name="line.439"></a>
+<span class="sourceLineNo">440</span> public Integer call() throws IOException {<a name="line.440"></a>
+<span class="sourceLineNo">441</span> SinkPeer sinkPeer = null;<a name="line.441"></a>
+<span class="sourceLineNo">442</span> try {<a name="line.442"></a>
+<span class="sourceLineNo">443</span> sinkPeer = replicationSinkMgr.getReplicationSink();<a name="line.443"></a>
+<span class="sourceLineNo">444</span> BlockingInterface rrs = sinkPeer.getRegionServer();<a name="line.444"></a>
+<span class="sourceLineNo">445</span> replicateEntries(rrs, entries, replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.445"></a>
+<span class="sourceLineNo">446</span> replicationSinkMgr.reportSinkSuccess(sinkPeer);<a name="line.446"></a>
+<span class="sourceLineNo">447</span> return ordinal;<a name="line.447"></a>
+<span class="sourceLineNo">448</span> } catch (IOException ioe) {<a name="line.448"></a>
+<span class="sourceLineNo">449</span> if (sinkPeer != null) {<a name="line.449"></a>
+<span class="sourceLineNo">450</span> replicationSinkMgr.reportBadSink(sinkPeer);<a name="line.450"></a>
+<span class="sourceLineNo">451</span> }<a name="line.451"></a>
+<span class="sourceLineNo">452</span> throw ioe;<a name="line.452"></a>
+<span class="sourceLineNo">453</span> }<a name="line.453"></a>
+<span class="sourceLineNo">454</span> }<a name="line.454"></a>
+<span class="sourceLineNo">455</span> }<a name="line.455"></a>
+<span class="sourceLineNo">456</span>}<a name="line.456"></a>