You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by gi...@apache.org on 2017/06/02 15:00:13 UTC

[12/32] hbase-site git commit: Published site at 82d554e3783372cc6b05489452c815b57c06f6cd.

http://git-wip-us.apache.org/repos/asf/hbase-site/blob/7e7efd3e/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html
----------------------------------------------------------------------
diff --git a/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html b/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html
index 8ffd272..58b1653 100644
--- a/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html
+++ b/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.Replicator.html
@@ -233,220 +233,235 @@
 <span class="sourceLineNo">225</span>    return entryLists;<a name="line.225"></a>
 <span class="sourceLineNo">226</span>  }<a name="line.226"></a>
 <span class="sourceLineNo">227</span><a name="line.227"></a>
-<span class="sourceLineNo">228</span>  /**<a name="line.228"></a>
-<span class="sourceLineNo">229</span>   * Do the shipping logic<a name="line.229"></a>
-<span class="sourceLineNo">230</span>   */<a name="line.230"></a>
-<span class="sourceLineNo">231</span>  @Override<a name="line.231"></a>
-<span class="sourceLineNo">232</span>  public boolean replicate(ReplicateContext replicateContext) {<a name="line.232"></a>
-<span class="sourceLineNo">233</span>    CompletionService&lt;Integer&gt; pool = new ExecutorCompletionService&lt;&gt;(this.exec);<a name="line.233"></a>
-<span class="sourceLineNo">234</span>    List&lt;List&lt;Entry&gt;&gt; batches;<a name="line.234"></a>
-<span class="sourceLineNo">235</span>    String walGroupId = replicateContext.getWalGroupId();<a name="line.235"></a>
-<span class="sourceLineNo">236</span>    int sleepMultiplier = 1;<a name="line.236"></a>
-<span class="sourceLineNo">237</span><a name="line.237"></a>
-<span class="sourceLineNo">238</span>    if (!peersSelected &amp;&amp; this.isRunning()) {<a name="line.238"></a>
-<span class="sourceLineNo">239</span>      connectToPeers();<a name="line.239"></a>
-<span class="sourceLineNo">240</span>      peersSelected = true;<a name="line.240"></a>
-<span class="sourceLineNo">241</span>    }<a name="line.241"></a>
-<span class="sourceLineNo">242</span><a name="line.242"></a>
-<span class="sourceLineNo">243</span>    int numSinks = replicationSinkMgr.getNumSinks();<a name="line.243"></a>
-<span class="sourceLineNo">244</span>    if (numSinks == 0) {<a name="line.244"></a>
-<span class="sourceLineNo">245</span>      LOG.warn("No replication sinks found, returning without replicating. The source should retry"<a name="line.245"></a>
-<span class="sourceLineNo">246</span>          + " with the same set of edits.");<a name="line.246"></a>
-<span class="sourceLineNo">247</span>      return false;<a name="line.247"></a>
-<span class="sourceLineNo">248</span>    }<a name="line.248"></a>
+<span class="sourceLineNo">228</span>  private void reconnectToPeerCluster() {<a name="line.228"></a>
+<span class="sourceLineNo">229</span>    ClusterConnection connection = null;<a name="line.229"></a>
+<span class="sourceLineNo">230</span>    try {<a name="line.230"></a>
+<span class="sourceLineNo">231</span>      connection = (ClusterConnection) ConnectionFactory.createConnection(this.conf);<a name="line.231"></a>
+<span class="sourceLineNo">232</span>    } catch (IOException ioe) {<a name="line.232"></a>
+<span class="sourceLineNo">233</span>      LOG.warn("Failed to create connection for peer cluster", ioe);<a name="line.233"></a>
+<span class="sourceLineNo">234</span>    }<a name="line.234"></a>
+<span class="sourceLineNo">235</span>    if (connection != null) {<a name="line.235"></a>
+<span class="sourceLineNo">236</span>      this.conn = connection;<a name="line.236"></a>
+<span class="sourceLineNo">237</span>    }<a name="line.237"></a>
+<span class="sourceLineNo">238</span>  }<a name="line.238"></a>
+<span class="sourceLineNo">239</span><a name="line.239"></a>
+<span class="sourceLineNo">240</span>  /**<a name="line.240"></a>
+<span class="sourceLineNo">241</span>   * Do the shipping logic<a name="line.241"></a>
+<span class="sourceLineNo">242</span>   */<a name="line.242"></a>
+<span class="sourceLineNo">243</span>  @Override<a name="line.243"></a>
+<span class="sourceLineNo">244</span>  public boolean replicate(ReplicateContext replicateContext) {<a name="line.244"></a>
+<span class="sourceLineNo">245</span>    CompletionService&lt;Integer&gt; pool = new ExecutorCompletionService&lt;&gt;(this.exec);<a name="line.245"></a>
+<span class="sourceLineNo">246</span>    List&lt;List&lt;Entry&gt;&gt; batches;<a name="line.246"></a>
+<span class="sourceLineNo">247</span>    String walGroupId = replicateContext.getWalGroupId();<a name="line.247"></a>
+<span class="sourceLineNo">248</span>    int sleepMultiplier = 1;<a name="line.248"></a>
 <span class="sourceLineNo">249</span><a name="line.249"></a>
-<span class="sourceLineNo">250</span>    batches = createBatches(replicateContext.getEntries());<a name="line.250"></a>
-<span class="sourceLineNo">251</span><a name="line.251"></a>
-<span class="sourceLineNo">252</span>    while (this.isRunning() &amp;&amp; !exec.isShutdown()) {<a name="line.252"></a>
-<span class="sourceLineNo">253</span>      if (!isPeerEnabled()) {<a name="line.253"></a>
-<span class="sourceLineNo">254</span>        if (sleepForRetries("Replication is disabled", sleepMultiplier)) {<a name="line.254"></a>
-<span class="sourceLineNo">255</span>          sleepMultiplier++;<a name="line.255"></a>
-<span class="sourceLineNo">256</span>        }<a name="line.256"></a>
-<span class="sourceLineNo">257</span>        continue;<a name="line.257"></a>
-<span class="sourceLineNo">258</span>      }<a name="line.258"></a>
-<span class="sourceLineNo">259</span>      try {<a name="line.259"></a>
-<span class="sourceLineNo">260</span>        int futures = 0;<a name="line.260"></a>
-<span class="sourceLineNo">261</span>        for (int i=0; i&lt;batches.size(); i++) {<a name="line.261"></a>
-<span class="sourceLineNo">262</span>          List&lt;Entry&gt; entries = batches.get(i);<a name="line.262"></a>
-<span class="sourceLineNo">263</span>          if (!entries.isEmpty()) {<a name="line.263"></a>
-<span class="sourceLineNo">264</span>            if (LOG.isTraceEnabled()) {<a name="line.264"></a>
-<span class="sourceLineNo">265</span>              LOG.trace("Submitting " + entries.size() +<a name="line.265"></a>
-<span class="sourceLineNo">266</span>                  " entries of total size " + replicateContext.getSize());<a name="line.266"></a>
-<span class="sourceLineNo">267</span>            }<a name="line.267"></a>
-<span class="sourceLineNo">268</span>            // RuntimeExceptions encountered here bubble up and are handled in ReplicationSource<a name="line.268"></a>
-<span class="sourceLineNo">269</span>            pool.submit(createReplicator(entries, i));<a name="line.269"></a>
-<span class="sourceLineNo">270</span>            futures++;<a name="line.270"></a>
-<span class="sourceLineNo">271</span>          }<a name="line.271"></a>
-<span class="sourceLineNo">272</span>        }<a name="line.272"></a>
-<span class="sourceLineNo">273</span>        IOException iox = null;<a name="line.273"></a>
-<span class="sourceLineNo">274</span><a name="line.274"></a>
-<span class="sourceLineNo">275</span>        long lastWriteTime = 0;<a name="line.275"></a>
-<span class="sourceLineNo">276</span>        for (int i=0; i&lt;futures; i++) {<a name="line.276"></a>
-<span class="sourceLineNo">277</span>          try {<a name="line.277"></a>
-<span class="sourceLineNo">278</span>            // wait for all futures, remove successful parts<a name="line.278"></a>
-<span class="sourceLineNo">279</span>            // (only the remaining parts will be retried)<a name="line.279"></a>
-<span class="sourceLineNo">280</span>            Future&lt;Integer&gt; f = pool.take();<a name="line.280"></a>
-<span class="sourceLineNo">281</span>            int index = f.get().intValue();<a name="line.281"></a>
-<span class="sourceLineNo">282</span>            List&lt;Entry&gt; batch = batches.get(index);<a name="line.282"></a>
-<span class="sourceLineNo">283</span>            batches.set(index, Collections.&lt;Entry&gt;emptyList()); // remove successful batch<a name="line.283"></a>
-<span class="sourceLineNo">284</span>            // Find the most recent write time in the batch<a name="line.284"></a>
-<span class="sourceLineNo">285</span>            long writeTime = batch.get(batch.size() - 1).getKey().getWriteTime();<a name="line.285"></a>
-<span class="sourceLineNo">286</span>            if (writeTime &gt; lastWriteTime) {<a name="line.286"></a>
-<span class="sourceLineNo">287</span>              lastWriteTime = writeTime;<a name="line.287"></a>
-<span class="sourceLineNo">288</span>            }<a name="line.288"></a>
-<span class="sourceLineNo">289</span>          } catch (InterruptedException ie) {<a name="line.289"></a>
-<span class="sourceLineNo">290</span>            iox =  new IOException(ie);<a name="line.290"></a>
-<span class="sourceLineNo">291</span>          } catch (ExecutionException ee) {<a name="line.291"></a>
-<span class="sourceLineNo">292</span>            // cause must be an IOException<a name="line.292"></a>
-<span class="sourceLineNo">293</span>            iox = (IOException)ee.getCause();<a name="line.293"></a>
-<span class="sourceLineNo">294</span>          }<a name="line.294"></a>
-<span class="sourceLineNo">295</span>        }<a name="line.295"></a>
-<span class="sourceLineNo">296</span>        if (iox != null) {<a name="line.296"></a>
-<span class="sourceLineNo">297</span>          // if we had any exceptions, try again<a name="line.297"></a>
-<span class="sourceLineNo">298</span>          throw iox;<a name="line.298"></a>
-<span class="sourceLineNo">299</span>        }<a name="line.299"></a>
-<span class="sourceLineNo">300</span>        // update metrics<a name="line.300"></a>
-<span class="sourceLineNo">301</span>        if (lastWriteTime &gt; 0) {<a name="line.301"></a>
-<span class="sourceLineNo">302</span>          this.metrics.setAgeOfLastShippedOp(lastWriteTime, walGroupId);<a name="line.302"></a>
-<span class="sourceLineNo">303</span>        }<a name="line.303"></a>
-<span class="sourceLineNo">304</span>        return true;<a name="line.304"></a>
-<span class="sourceLineNo">305</span><a name="line.305"></a>
-<span class="sourceLineNo">306</span>      } catch (IOException ioe) {<a name="line.306"></a>
-<span class="sourceLineNo">307</span>        // Didn't ship anything, but must still age the last time we did<a name="line.307"></a>
-<span class="sourceLineNo">308</span>        this.metrics.refreshAgeOfLastShippedOp(walGroupId);<a name="line.308"></a>
-<span class="sourceLineNo">309</span>        if (ioe instanceof RemoteException) {<a name="line.309"></a>
-<span class="sourceLineNo">310</span>          ioe = ((RemoteException) ioe).unwrapRemoteException();<a name="line.310"></a>
-<span class="sourceLineNo">311</span>          LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);<a name="line.311"></a>
-<span class="sourceLineNo">312</span>          if (ioe instanceof TableNotFoundException) {<a name="line.312"></a>
-<span class="sourceLineNo">313</span>            if (sleepForRetries("A table is missing in the peer cluster. "<a name="line.313"></a>
-<span class="sourceLineNo">314</span>                + "Replication cannot proceed without losing data.", sleepMultiplier)) {<a name="line.314"></a>
-<span class="sourceLineNo">315</span>              sleepMultiplier++;<a name="line.315"></a>
-<span class="sourceLineNo">316</span>            }<a name="line.316"></a>
-<span class="sourceLineNo">317</span>          } else {<a name="line.317"></a>
-<span class="sourceLineNo">318</span>            LOG.warn("Peer encountered RemoteException, rechecking all sinks: ", ioe);<a name="line.318"></a>
-<span class="sourceLineNo">319</span>            replicationSinkMgr.chooseSinks();<a name="line.319"></a>
-<span class="sourceLineNo">320</span>          }<a name="line.320"></a>
-<span class="sourceLineNo">321</span>        } else {<a name="line.321"></a>
-<span class="sourceLineNo">322</span>          if (ioe instanceof SocketTimeoutException) {<a name="line.322"></a>
-<span class="sourceLineNo">323</span>            // This exception means we waited for more than 60s and nothing<a name="line.323"></a>
-<span class="sourceLineNo">324</span>            // happened, the cluster is alive and calling it right away<a name="line.324"></a>
-<span class="sourceLineNo">325</span>            // even for a test just makes things worse.<a name="line.325"></a>
-<span class="sourceLineNo">326</span>            sleepForRetries("Encountered a SocketTimeoutException. Since the " +<a name="line.326"></a>
-<span class="sourceLineNo">327</span>              "call to the remote cluster timed out, which is usually " +<a name="line.327"></a>
-<span class="sourceLineNo">328</span>              "caused by a machine failure or a massive slowdown",<a name="line.328"></a>
-<span class="sourceLineNo">329</span>              this.socketTimeoutMultiplier);<a name="line.329"></a>
-<span class="sourceLineNo">330</span>          } else if (ioe instanceof ConnectException) {<a name="line.330"></a>
-<span class="sourceLineNo">331</span>            LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);<a name="line.331"></a>
-<span class="sourceLineNo">332</span>            replicationSinkMgr.chooseSinks();<a name="line.332"></a>
-<span class="sourceLineNo">333</span>          } else {<a name="line.333"></a>
-<span class="sourceLineNo">334</span>            LOG.warn("Can't replicate because of a local or network error: ", ioe);<a name="line.334"></a>
+<span class="sourceLineNo">250</span>    if (!peersSelected &amp;&amp; this.isRunning()) {<a name="line.250"></a>
+<span class="sourceLineNo">251</span>      connectToPeers();<a name="line.251"></a>
+<span class="sourceLineNo">252</span>      peersSelected = true;<a name="line.252"></a>
+<span class="sourceLineNo">253</span>    }<a name="line.253"></a>
+<span class="sourceLineNo">254</span><a name="line.254"></a>
+<span class="sourceLineNo">255</span>    int numSinks = replicationSinkMgr.getNumSinks();<a name="line.255"></a>
+<span class="sourceLineNo">256</span>    if (numSinks == 0) {<a name="line.256"></a>
+<span class="sourceLineNo">257</span>      LOG.warn("No replication sinks found, returning without replicating. The source should retry"<a name="line.257"></a>
+<span class="sourceLineNo">258</span>          + " with the same set of edits.");<a name="line.258"></a>
+<span class="sourceLineNo">259</span>      return false;<a name="line.259"></a>
+<span class="sourceLineNo">260</span>    }<a name="line.260"></a>
+<span class="sourceLineNo">261</span><a name="line.261"></a>
+<span class="sourceLineNo">262</span>    batches = createBatches(replicateContext.getEntries());<a name="line.262"></a>
+<span class="sourceLineNo">263</span><a name="line.263"></a>
+<span class="sourceLineNo">264</span>    while (this.isRunning() &amp;&amp; !exec.isShutdown()) {<a name="line.264"></a>
+<span class="sourceLineNo">265</span>      if (!isPeerEnabled()) {<a name="line.265"></a>
+<span class="sourceLineNo">266</span>        if (sleepForRetries("Replication is disabled", sleepMultiplier)) {<a name="line.266"></a>
+<span class="sourceLineNo">267</span>          sleepMultiplier++;<a name="line.267"></a>
+<span class="sourceLineNo">268</span>        }<a name="line.268"></a>
+<span class="sourceLineNo">269</span>        continue;<a name="line.269"></a>
+<span class="sourceLineNo">270</span>      }<a name="line.270"></a>
+<span class="sourceLineNo">271</span>      if (this.conn == null || this.conn.isClosed()) {<a name="line.271"></a>
+<span class="sourceLineNo">272</span>        reconnectToPeerCluster();<a name="line.272"></a>
+<span class="sourceLineNo">273</span>      }<a name="line.273"></a>
+<span class="sourceLineNo">274</span>      try {<a name="line.274"></a>
+<span class="sourceLineNo">275</span>        int futures = 0;<a name="line.275"></a>
+<span class="sourceLineNo">276</span>        for (int i=0; i&lt;batches.size(); i++) {<a name="line.276"></a>
+<span class="sourceLineNo">277</span>          List&lt;Entry&gt; entries = batches.get(i);<a name="line.277"></a>
+<span class="sourceLineNo">278</span>          if (!entries.isEmpty()) {<a name="line.278"></a>
+<span class="sourceLineNo">279</span>            if (LOG.isTraceEnabled()) {<a name="line.279"></a>
+<span class="sourceLineNo">280</span>              LOG.trace("Submitting " + entries.size() +<a name="line.280"></a>
+<span class="sourceLineNo">281</span>                  " entries of total size " + replicateContext.getSize());<a name="line.281"></a>
+<span class="sourceLineNo">282</span>            }<a name="line.282"></a>
+<span class="sourceLineNo">283</span>            // RuntimeExceptions encountered here bubble up and are handled in ReplicationSource<a name="line.283"></a>
+<span class="sourceLineNo">284</span>            pool.submit(createReplicator(entries, i));<a name="line.284"></a>
+<span class="sourceLineNo">285</span>            futures++;<a name="line.285"></a>
+<span class="sourceLineNo">286</span>          }<a name="line.286"></a>
+<span class="sourceLineNo">287</span>        }<a name="line.287"></a>
+<span class="sourceLineNo">288</span>        IOException iox = null;<a name="line.288"></a>
+<span class="sourceLineNo">289</span><a name="line.289"></a>
+<span class="sourceLineNo">290</span>        long lastWriteTime = 0;<a name="line.290"></a>
+<span class="sourceLineNo">291</span>        for (int i=0; i&lt;futures; i++) {<a name="line.291"></a>
+<span class="sourceLineNo">292</span>          try {<a name="line.292"></a>
+<span class="sourceLineNo">293</span>            // wait for all futures, remove successful parts<a name="line.293"></a>
+<span class="sourceLineNo">294</span>            // (only the remaining parts will be retried)<a name="line.294"></a>
+<span class="sourceLineNo">295</span>            Future&lt;Integer&gt; f = pool.take();<a name="line.295"></a>
+<span class="sourceLineNo">296</span>            int index = f.get().intValue();<a name="line.296"></a>
+<span class="sourceLineNo">297</span>            List&lt;Entry&gt; batch = batches.get(index);<a name="line.297"></a>
+<span class="sourceLineNo">298</span>            batches.set(index, Collections.&lt;Entry&gt;emptyList()); // remove successful batch<a name="line.298"></a>
+<span class="sourceLineNo">299</span>            // Find the most recent write time in the batch<a name="line.299"></a>
+<span class="sourceLineNo">300</span>            long writeTime = batch.get(batch.size() - 1).getKey().getWriteTime();<a name="line.300"></a>
+<span class="sourceLineNo">301</span>            if (writeTime &gt; lastWriteTime) {<a name="line.301"></a>
+<span class="sourceLineNo">302</span>              lastWriteTime = writeTime;<a name="line.302"></a>
+<span class="sourceLineNo">303</span>            }<a name="line.303"></a>
+<span class="sourceLineNo">304</span>          } catch (InterruptedException ie) {<a name="line.304"></a>
+<span class="sourceLineNo">305</span>            iox =  new IOException(ie);<a name="line.305"></a>
+<span class="sourceLineNo">306</span>          } catch (ExecutionException ee) {<a name="line.306"></a>
+<span class="sourceLineNo">307</span>            // cause must be an IOException<a name="line.307"></a>
+<span class="sourceLineNo">308</span>            iox = (IOException)ee.getCause();<a name="line.308"></a>
+<span class="sourceLineNo">309</span>          }<a name="line.309"></a>
+<span class="sourceLineNo">310</span>        }<a name="line.310"></a>
+<span class="sourceLineNo">311</span>        if (iox != null) {<a name="line.311"></a>
+<span class="sourceLineNo">312</span>          // if we had any exceptions, try again<a name="line.312"></a>
+<span class="sourceLineNo">313</span>          throw iox;<a name="line.313"></a>
+<span class="sourceLineNo">314</span>        }<a name="line.314"></a>
+<span class="sourceLineNo">315</span>        // update metrics<a name="line.315"></a>
+<span class="sourceLineNo">316</span>        if (lastWriteTime &gt; 0) {<a name="line.316"></a>
+<span class="sourceLineNo">317</span>          this.metrics.setAgeOfLastShippedOp(lastWriteTime, walGroupId);<a name="line.317"></a>
+<span class="sourceLineNo">318</span>        }<a name="line.318"></a>
+<span class="sourceLineNo">319</span>        return true;<a name="line.319"></a>
+<span class="sourceLineNo">320</span><a name="line.320"></a>
+<span class="sourceLineNo">321</span>      } catch (IOException ioe) {<a name="line.321"></a>
+<span class="sourceLineNo">322</span>        // Didn't ship anything, but must still age the last time we did<a name="line.322"></a>
+<span class="sourceLineNo">323</span>        this.metrics.refreshAgeOfLastShippedOp(walGroupId);<a name="line.323"></a>
+<span class="sourceLineNo">324</span>        if (ioe instanceof RemoteException) {<a name="line.324"></a>
+<span class="sourceLineNo">325</span>          ioe = ((RemoteException) ioe).unwrapRemoteException();<a name="line.325"></a>
+<span class="sourceLineNo">326</span>          LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);<a name="line.326"></a>
+<span class="sourceLineNo">327</span>          if (ioe instanceof TableNotFoundException) {<a name="line.327"></a>
+<span class="sourceLineNo">328</span>            if (sleepForRetries("A table is missing in the peer cluster. "<a name="line.328"></a>
+<span class="sourceLineNo">329</span>                + "Replication cannot proceed without losing data.", sleepMultiplier)) {<a name="line.329"></a>
+<span class="sourceLineNo">330</span>              sleepMultiplier++;<a name="line.330"></a>
+<span class="sourceLineNo">331</span>            }<a name="line.331"></a>
+<span class="sourceLineNo">332</span>          } else {<a name="line.332"></a>
+<span class="sourceLineNo">333</span>            LOG.warn("Peer encountered RemoteException, rechecking all sinks: ", ioe);<a name="line.333"></a>
+<span class="sourceLineNo">334</span>            replicationSinkMgr.chooseSinks();<a name="line.334"></a>
 <span class="sourceLineNo">335</span>          }<a name="line.335"></a>
-<span class="sourceLineNo">336</span>        }<a name="line.336"></a>
-<span class="sourceLineNo">337</span>        if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {<a name="line.337"></a>
-<span class="sourceLineNo">338</span>          sleepMultiplier++;<a name="line.338"></a>
-<span class="sourceLineNo">339</span>        }<a name="line.339"></a>
-<span class="sourceLineNo">340</span>      }<a name="line.340"></a>
-<span class="sourceLineNo">341</span>    }<a name="line.341"></a>
-<span class="sourceLineNo">342</span>    return false; // in case we exited before replicating<a name="line.342"></a>
-<span class="sourceLineNo">343</span>  }<a name="line.343"></a>
-<span class="sourceLineNo">344</span><a name="line.344"></a>
-<span class="sourceLineNo">345</span>  protected boolean isPeerEnabled() {<a name="line.345"></a>
-<span class="sourceLineNo">346</span>    return ctx.getReplicationPeer().getPeerState() == PeerState.ENABLED;<a name="line.346"></a>
-<span class="sourceLineNo">347</span>  }<a name="line.347"></a>
-<span class="sourceLineNo">348</span><a name="line.348"></a>
-<span class="sourceLineNo">349</span>  @Override<a name="line.349"></a>
-<span class="sourceLineNo">350</span>  protected void doStop() {<a name="line.350"></a>
-<span class="sourceLineNo">351</span>    disconnect(); //don't call super.doStop()<a name="line.351"></a>
-<span class="sourceLineNo">352</span>    if (this.conn != null) {<a name="line.352"></a>
-<span class="sourceLineNo">353</span>      try {<a name="line.353"></a>
-<span class="sourceLineNo">354</span>        this.conn.close();<a name="line.354"></a>
-<span class="sourceLineNo">355</span>        this.conn = null;<a name="line.355"></a>
-<span class="sourceLineNo">356</span>      } catch (IOException e) {<a name="line.356"></a>
-<span class="sourceLineNo">357</span>        LOG.warn("Failed to close the connection");<a name="line.357"></a>
-<span class="sourceLineNo">358</span>      }<a name="line.358"></a>
-<span class="sourceLineNo">359</span>    }<a name="line.359"></a>
-<span class="sourceLineNo">360</span>    // Allow currently running replication tasks to finish<a name="line.360"></a>
-<span class="sourceLineNo">361</span>    exec.shutdown();<a name="line.361"></a>
-<span class="sourceLineNo">362</span>    try {<a name="line.362"></a>
-<span class="sourceLineNo">363</span>      exec.awaitTermination(maxTerminationWait, TimeUnit.MILLISECONDS);<a name="line.363"></a>
-<span class="sourceLineNo">364</span>    } catch (InterruptedException e) {<a name="line.364"></a>
-<span class="sourceLineNo">365</span>    }<a name="line.365"></a>
-<span class="sourceLineNo">366</span>    // Abort if the tasks did not terminate in time<a name="line.366"></a>
-<span class="sourceLineNo">367</span>    if (!exec.isTerminated()) {<a name="line.367"></a>
-<span class="sourceLineNo">368</span>      String errMsg = "HBaseInterClusterReplicationEndpoint termination failed. The " +<a name="line.368"></a>
-<span class="sourceLineNo">369</span>          "ThreadPoolExecutor failed to finish all tasks within " + maxTerminationWait + "ms. " +<a name="line.369"></a>
-<span class="sourceLineNo">370</span>          "Aborting to prevent Replication from deadlocking. See HBASE-16081.";<a name="line.370"></a>
-<span class="sourceLineNo">371</span>      abortable.abort(errMsg, new IOException(errMsg));<a name="line.371"></a>
-<span class="sourceLineNo">372</span>    }<a name="line.372"></a>
-<span class="sourceLineNo">373</span>    notifyStopped();<a name="line.373"></a>
-<span class="sourceLineNo">374</span>  }<a name="line.374"></a>
-<span class="sourceLineNo">375</span><a name="line.375"></a>
-<span class="sourceLineNo">376</span>  // is this needed? Nobody else will call doStop() otherwise<a name="line.376"></a>
-<span class="sourceLineNo">377</span>  @Override<a name="line.377"></a>
-<span class="sourceLineNo">378</span>  public State stopAndWait() {<a name="line.378"></a>
-<span class="sourceLineNo">379</span>    doStop();<a name="line.379"></a>
-<span class="sourceLineNo">380</span>    return super.stopAndWait();<a name="line.380"></a>
-<span class="sourceLineNo">381</span>  }<a name="line.381"></a>
-<span class="sourceLineNo">382</span><a name="line.382"></a>
-<span class="sourceLineNo">383</span>  @VisibleForTesting<a name="line.383"></a>
-<span class="sourceLineNo">384</span>  protected Replicator createReplicator(List&lt;Entry&gt; entries, int ordinal) {<a name="line.384"></a>
-<span class="sourceLineNo">385</span>    return new Replicator(entries, ordinal);<a name="line.385"></a>
-<span class="sourceLineNo">386</span>  }<a name="line.386"></a>
-<span class="sourceLineNo">387</span><a name="line.387"></a>
-<span class="sourceLineNo">388</span>  @VisibleForTesting<a name="line.388"></a>
-<span class="sourceLineNo">389</span>  protected class Replicator implements Callable&lt;Integer&gt; {<a name="line.389"></a>
-<span class="sourceLineNo">390</span>    private List&lt;Entry&gt; entries;<a name="line.390"></a>
-<span class="sourceLineNo">391</span>    private int ordinal;<a name="line.391"></a>
-<span class="sourceLineNo">392</span>    public Replicator(List&lt;Entry&gt; entries, int ordinal) {<a name="line.392"></a>
-<span class="sourceLineNo">393</span>      this.entries = entries;<a name="line.393"></a>
-<span class="sourceLineNo">394</span>      this.ordinal = ordinal;<a name="line.394"></a>
-<span class="sourceLineNo">395</span>    }<a name="line.395"></a>
-<span class="sourceLineNo">396</span><a name="line.396"></a>
-<span class="sourceLineNo">397</span>    protected void replicateEntries(BlockingInterface rrs, final List&lt;Entry&gt; batch,<a name="line.397"></a>
-<span class="sourceLineNo">398</span>        String replicationClusterId, Path baseNamespaceDir, Path hfileArchiveDir)<a name="line.398"></a>
-<span class="sourceLineNo">399</span>        throws IOException {<a name="line.399"></a>
-<span class="sourceLineNo">400</span>      if (LOG.isTraceEnabled()) {<a name="line.400"></a>
-<span class="sourceLineNo">401</span>        long size = 0;<a name="line.401"></a>
-<span class="sourceLineNo">402</span>        for (Entry e: entries) {<a name="line.402"></a>
-<span class="sourceLineNo">403</span>          size += e.getKey().estimatedSerializedSizeOf();<a name="line.403"></a>
-<span class="sourceLineNo">404</span>          size += e.getEdit().estimatedSerializedSizeOf();<a name="line.404"></a>
-<span class="sourceLineNo">405</span>        }<a name="line.405"></a>
-<span class="sourceLineNo">406</span>        LOG.trace("Replicating batch " + System.identityHashCode(entries) + " of " +<a name="line.406"></a>
-<span class="sourceLineNo">407</span>            entries.size() + " entries with total size " + size + " bytes to " +<a name="line.407"></a>
-<span class="sourceLineNo">408</span>            replicationClusterId);<a name="line.408"></a>
-<span class="sourceLineNo">409</span>      }<a name="line.409"></a>
-<span class="sourceLineNo">410</span>      try {<a name="line.410"></a>
-<span class="sourceLineNo">411</span>        ReplicationProtbufUtil.replicateWALEntry(rrs, batch.toArray(new Entry[batch.size()]),<a name="line.411"></a>
-<span class="sourceLineNo">412</span>          replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.412"></a>
-<span class="sourceLineNo">413</span>        if (LOG.isTraceEnabled()) {<a name="line.413"></a>
-<span class="sourceLineNo">414</span>          LOG.trace("Completed replicating batch " + System.identityHashCode(entries));<a name="line.414"></a>
-<span class="sourceLineNo">415</span>        }<a name="line.415"></a>
-<span class="sourceLineNo">416</span>      } catch (IOException e) {<a name="line.416"></a>
-<span class="sourceLineNo">417</span>        if (LOG.isTraceEnabled()) {<a name="line.417"></a>
-<span class="sourceLineNo">418</span>          LOG.trace("Failed replicating batch " + System.identityHashCode(entries), e);<a name="line.418"></a>
-<span class="sourceLineNo">419</span>        }<a name="line.419"></a>
-<span class="sourceLineNo">420</span>        throw e;<a name="line.420"></a>
-<span class="sourceLineNo">421</span>      }<a name="line.421"></a>
-<span class="sourceLineNo">422</span>    }<a name="line.422"></a>
-<span class="sourceLineNo">423</span><a name="line.423"></a>
-<span class="sourceLineNo">424</span>    @Override<a name="line.424"></a>
-<span class="sourceLineNo">425</span>    public Integer call() throws IOException {<a name="line.425"></a>
-<span class="sourceLineNo">426</span>      SinkPeer sinkPeer = null;<a name="line.426"></a>
-<span class="sourceLineNo">427</span>      try {<a name="line.427"></a>
-<span class="sourceLineNo">428</span>        sinkPeer = replicationSinkMgr.getReplicationSink();<a name="line.428"></a>
-<span class="sourceLineNo">429</span>        BlockingInterface rrs = sinkPeer.getRegionServer();<a name="line.429"></a>
-<span class="sourceLineNo">430</span>        replicateEntries(rrs, entries, replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.430"></a>
-<span class="sourceLineNo">431</span>        replicationSinkMgr.reportSinkSuccess(sinkPeer);<a name="line.431"></a>
-<span class="sourceLineNo">432</span>        return ordinal;<a name="line.432"></a>
-<span class="sourceLineNo">433</span>      } catch (IOException ioe) {<a name="line.433"></a>
-<span class="sourceLineNo">434</span>        if (sinkPeer != null) {<a name="line.434"></a>
-<span class="sourceLineNo">435</span>          replicationSinkMgr.reportBadSink(sinkPeer);<a name="line.435"></a>
-<span class="sourceLineNo">436</span>        }<a name="line.436"></a>
-<span class="sourceLineNo">437</span>        throw ioe;<a name="line.437"></a>
-<span class="sourceLineNo">438</span>      }<a name="line.438"></a>
-<span class="sourceLineNo">439</span>    }<a name="line.439"></a>
-<span class="sourceLineNo">440</span>  }<a name="line.440"></a>
-<span class="sourceLineNo">441</span>}<a name="line.441"></a>
+<span class="sourceLineNo">336</span>        } else {<a name="line.336"></a>
+<span class="sourceLineNo">337</span>          if (ioe instanceof SocketTimeoutException) {<a name="line.337"></a>
+<span class="sourceLineNo">338</span>            // This exception means we waited for more than 60s and nothing<a name="line.338"></a>
+<span class="sourceLineNo">339</span>            // happened, the cluster is alive and calling it right away<a name="line.339"></a>
+<span class="sourceLineNo">340</span>            // even for a test just makes things worse.<a name="line.340"></a>
+<span class="sourceLineNo">341</span>            sleepForRetries("Encountered a SocketTimeoutException. Since the " +<a name="line.341"></a>
+<span class="sourceLineNo">342</span>              "call to the remote cluster timed out, which is usually " +<a name="line.342"></a>
+<span class="sourceLineNo">343</span>              "caused by a machine failure or a massive slowdown",<a name="line.343"></a>
+<span class="sourceLineNo">344</span>              this.socketTimeoutMultiplier);<a name="line.344"></a>
+<span class="sourceLineNo">345</span>          } else if (ioe instanceof ConnectException) {<a name="line.345"></a>
+<span class="sourceLineNo">346</span>            LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);<a name="line.346"></a>
+<span class="sourceLineNo">347</span>            replicationSinkMgr.chooseSinks();<a name="line.347"></a>
+<span class="sourceLineNo">348</span>          } else {<a name="line.348"></a>
+<span class="sourceLineNo">349</span>            LOG.warn("Can't replicate because of a local or network error: ", ioe);<a name="line.349"></a>
+<span class="sourceLineNo">350</span>          }<a name="line.350"></a>
+<span class="sourceLineNo">351</span>        }<a name="line.351"></a>
+<span class="sourceLineNo">352</span>        if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {<a name="line.352"></a>
+<span class="sourceLineNo">353</span>          sleepMultiplier++;<a name="line.353"></a>
+<span class="sourceLineNo">354</span>        }<a name="line.354"></a>
+<span class="sourceLineNo">355</span>      }<a name="line.355"></a>
+<span class="sourceLineNo">356</span>    }<a name="line.356"></a>
+<span class="sourceLineNo">357</span>    return false; // in case we exited before replicating<a name="line.357"></a>
+<span class="sourceLineNo">358</span>  }<a name="line.358"></a>
+<span class="sourceLineNo">359</span><a name="line.359"></a>
+<span class="sourceLineNo">360</span>  protected boolean isPeerEnabled() {<a name="line.360"></a>
+<span class="sourceLineNo">361</span>    return ctx.getReplicationPeer().getPeerState() == PeerState.ENABLED;<a name="line.361"></a>
+<span class="sourceLineNo">362</span>  }<a name="line.362"></a>
+<span class="sourceLineNo">363</span><a name="line.363"></a>
+<span class="sourceLineNo">364</span>  @Override<a name="line.364"></a>
+<span class="sourceLineNo">365</span>  protected void doStop() {<a name="line.365"></a>
+<span class="sourceLineNo">366</span>    disconnect(); //don't call super.doStop()<a name="line.366"></a>
+<span class="sourceLineNo">367</span>    if (this.conn != null) {<a name="line.367"></a>
+<span class="sourceLineNo">368</span>      try {<a name="line.368"></a>
+<span class="sourceLineNo">369</span>        this.conn.close();<a name="line.369"></a>
+<span class="sourceLineNo">370</span>        this.conn = null;<a name="line.370"></a>
+<span class="sourceLineNo">371</span>      } catch (IOException e) {<a name="line.371"></a>
+<span class="sourceLineNo">372</span>        LOG.warn("Failed to close the connection");<a name="line.372"></a>
+<span class="sourceLineNo">373</span>      }<a name="line.373"></a>
+<span class="sourceLineNo">374</span>    }<a name="line.374"></a>
+<span class="sourceLineNo">375</span>    // Allow currently running replication tasks to finish<a name="line.375"></a>
+<span class="sourceLineNo">376</span>    exec.shutdown();<a name="line.376"></a>
+<span class="sourceLineNo">377</span>    try {<a name="line.377"></a>
+<span class="sourceLineNo">378</span>      exec.awaitTermination(maxTerminationWait, TimeUnit.MILLISECONDS);<a name="line.378"></a>
+<span class="sourceLineNo">379</span>    } catch (InterruptedException e) {<a name="line.379"></a>
+<span class="sourceLineNo">380</span>    }<a name="line.380"></a>
+<span class="sourceLineNo">381</span>    // Abort if the tasks did not terminate in time<a name="line.381"></a>
+<span class="sourceLineNo">382</span>    if (!exec.isTerminated()) {<a name="line.382"></a>
+<span class="sourceLineNo">383</span>      String errMsg = "HBaseInterClusterReplicationEndpoint termination failed. The " +<a name="line.383"></a>
+<span class="sourceLineNo">384</span>          "ThreadPoolExecutor failed to finish all tasks within " + maxTerminationWait + "ms. " +<a name="line.384"></a>
+<span class="sourceLineNo">385</span>          "Aborting to prevent Replication from deadlocking. See HBASE-16081.";<a name="line.385"></a>
+<span class="sourceLineNo">386</span>      abortable.abort(errMsg, new IOException(errMsg));<a name="line.386"></a>
+<span class="sourceLineNo">387</span>    }<a name="line.387"></a>
+<span class="sourceLineNo">388</span>    notifyStopped();<a name="line.388"></a>
+<span class="sourceLineNo">389</span>  }<a name="line.389"></a>
+<span class="sourceLineNo">390</span><a name="line.390"></a>
+<span class="sourceLineNo">391</span>  // is this needed? Nobody else will call doStop() otherwise<a name="line.391"></a>
+<span class="sourceLineNo">392</span>  @Override<a name="line.392"></a>
+<span class="sourceLineNo">393</span>  public State stopAndWait() {<a name="line.393"></a>
+<span class="sourceLineNo">394</span>    doStop();<a name="line.394"></a>
+<span class="sourceLineNo">395</span>    return super.stopAndWait();<a name="line.395"></a>
+<span class="sourceLineNo">396</span>  }<a name="line.396"></a>
+<span class="sourceLineNo">397</span><a name="line.397"></a>
+<span class="sourceLineNo">398</span>  @VisibleForTesting<a name="line.398"></a>
+<span class="sourceLineNo">399</span>  protected Replicator createReplicator(List&lt;Entry&gt; entries, int ordinal) {<a name="line.399"></a>
+<span class="sourceLineNo">400</span>    return new Replicator(entries, ordinal);<a name="line.400"></a>
+<span class="sourceLineNo">401</span>  }<a name="line.401"></a>
+<span class="sourceLineNo">402</span><a name="line.402"></a>
+<span class="sourceLineNo">403</span>  @VisibleForTesting<a name="line.403"></a>
+<span class="sourceLineNo">404</span>  protected class Replicator implements Callable&lt;Integer&gt; {<a name="line.404"></a>
+<span class="sourceLineNo">405</span>    private List&lt;Entry&gt; entries;<a name="line.405"></a>
+<span class="sourceLineNo">406</span>    private int ordinal;<a name="line.406"></a>
+<span class="sourceLineNo">407</span>    public Replicator(List&lt;Entry&gt; entries, int ordinal) {<a name="line.407"></a>
+<span class="sourceLineNo">408</span>      this.entries = entries;<a name="line.408"></a>
+<span class="sourceLineNo">409</span>      this.ordinal = ordinal;<a name="line.409"></a>
+<span class="sourceLineNo">410</span>    }<a name="line.410"></a>
+<span class="sourceLineNo">411</span><a name="line.411"></a>
+<span class="sourceLineNo">412</span>    protected void replicateEntries(BlockingInterface rrs, final List&lt;Entry&gt; batch,<a name="line.412"></a>
+<span class="sourceLineNo">413</span>        String replicationClusterId, Path baseNamespaceDir, Path hfileArchiveDir)<a name="line.413"></a>
+<span class="sourceLineNo">414</span>        throws IOException {<a name="line.414"></a>
+<span class="sourceLineNo">415</span>      if (LOG.isTraceEnabled()) {<a name="line.415"></a>
+<span class="sourceLineNo">416</span>        long size = 0;<a name="line.416"></a>
+<span class="sourceLineNo">417</span>        for (Entry e: entries) {<a name="line.417"></a>
+<span class="sourceLineNo">418</span>          size += e.getKey().estimatedSerializedSizeOf();<a name="line.418"></a>
+<span class="sourceLineNo">419</span>          size += e.getEdit().estimatedSerializedSizeOf();<a name="line.419"></a>
+<span class="sourceLineNo">420</span>        }<a name="line.420"></a>
+<span class="sourceLineNo">421</span>        LOG.trace("Replicating batch " + System.identityHashCode(entries) + " of " +<a name="line.421"></a>
+<span class="sourceLineNo">422</span>            entries.size() + " entries with total size " + size + " bytes to " +<a name="line.422"></a>
+<span class="sourceLineNo">423</span>            replicationClusterId);<a name="line.423"></a>
+<span class="sourceLineNo">424</span>      }<a name="line.424"></a>
+<span class="sourceLineNo">425</span>      try {<a name="line.425"></a>
+<span class="sourceLineNo">426</span>        ReplicationProtbufUtil.replicateWALEntry(rrs, batch.toArray(new Entry[batch.size()]),<a name="line.426"></a>
+<span class="sourceLineNo">427</span>          replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.427"></a>
+<span class="sourceLineNo">428</span>        if (LOG.isTraceEnabled()) {<a name="line.428"></a>
+<span class="sourceLineNo">429</span>          LOG.trace("Completed replicating batch " + System.identityHashCode(entries));<a name="line.429"></a>
+<span class="sourceLineNo">430</span>        }<a name="line.430"></a>
+<span class="sourceLineNo">431</span>      } catch (IOException e) {<a name="line.431"></a>
+<span class="sourceLineNo">432</span>        if (LOG.isTraceEnabled()) {<a name="line.432"></a>
+<span class="sourceLineNo">433</span>          LOG.trace("Failed replicating batch " + System.identityHashCode(entries), e);<a name="line.433"></a>
+<span class="sourceLineNo">434</span>        }<a name="line.434"></a>
+<span class="sourceLineNo">435</span>        throw e;<a name="line.435"></a>
+<span class="sourceLineNo">436</span>      }<a name="line.436"></a>
+<span class="sourceLineNo">437</span>    }<a name="line.437"></a>
+<span class="sourceLineNo">438</span><a name="line.438"></a>
+<span class="sourceLineNo">439</span>    @Override<a name="line.439"></a>
+<span class="sourceLineNo">440</span>    public Integer call() throws IOException {<a name="line.440"></a>
+<span class="sourceLineNo">441</span>      SinkPeer sinkPeer = null;<a name="line.441"></a>
+<span class="sourceLineNo">442</span>      try {<a name="line.442"></a>
+<span class="sourceLineNo">443</span>        sinkPeer = replicationSinkMgr.getReplicationSink();<a name="line.443"></a>
+<span class="sourceLineNo">444</span>        BlockingInterface rrs = sinkPeer.getRegionServer();<a name="line.444"></a>
+<span class="sourceLineNo">445</span>        replicateEntries(rrs, entries, replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.445"></a>
+<span class="sourceLineNo">446</span>        replicationSinkMgr.reportSinkSuccess(sinkPeer);<a name="line.446"></a>
+<span class="sourceLineNo">447</span>        return ordinal;<a name="line.447"></a>
+<span class="sourceLineNo">448</span>      } catch (IOException ioe) {<a name="line.448"></a>
+<span class="sourceLineNo">449</span>        if (sinkPeer != null) {<a name="line.449"></a>
+<span class="sourceLineNo">450</span>          replicationSinkMgr.reportBadSink(sinkPeer);<a name="line.450"></a>
+<span class="sourceLineNo">451</span>        }<a name="line.451"></a>
+<span class="sourceLineNo">452</span>        throw ioe;<a name="line.452"></a>
+<span class="sourceLineNo">453</span>      }<a name="line.453"></a>
+<span class="sourceLineNo">454</span>    }<a name="line.454"></a>
+<span class="sourceLineNo">455</span>  }<a name="line.455"></a>
+<span class="sourceLineNo">456</span>}<a name="line.456"></a>
 
 
 

http://git-wip-us.apache.org/repos/asf/hbase-site/blob/7e7efd3e/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html
----------------------------------------------------------------------
diff --git a/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html b/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html
index 8ffd272..58b1653 100644
--- a/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html
+++ b/devapidocs/src-html/org/apache/hadoop/hbase/replication/regionserver/HBaseInterClusterReplicationEndpoint.html
@@ -233,220 +233,235 @@
 <span class="sourceLineNo">225</span>    return entryLists;<a name="line.225"></a>
 <span class="sourceLineNo">226</span>  }<a name="line.226"></a>
 <span class="sourceLineNo">227</span><a name="line.227"></a>
-<span class="sourceLineNo">228</span>  /**<a name="line.228"></a>
-<span class="sourceLineNo">229</span>   * Do the shipping logic<a name="line.229"></a>
-<span class="sourceLineNo">230</span>   */<a name="line.230"></a>
-<span class="sourceLineNo">231</span>  @Override<a name="line.231"></a>
-<span class="sourceLineNo">232</span>  public boolean replicate(ReplicateContext replicateContext) {<a name="line.232"></a>
-<span class="sourceLineNo">233</span>    CompletionService&lt;Integer&gt; pool = new ExecutorCompletionService&lt;&gt;(this.exec);<a name="line.233"></a>
-<span class="sourceLineNo">234</span>    List&lt;List&lt;Entry&gt;&gt; batches;<a name="line.234"></a>
-<span class="sourceLineNo">235</span>    String walGroupId = replicateContext.getWalGroupId();<a name="line.235"></a>
-<span class="sourceLineNo">236</span>    int sleepMultiplier = 1;<a name="line.236"></a>
-<span class="sourceLineNo">237</span><a name="line.237"></a>
-<span class="sourceLineNo">238</span>    if (!peersSelected &amp;&amp; this.isRunning()) {<a name="line.238"></a>
-<span class="sourceLineNo">239</span>      connectToPeers();<a name="line.239"></a>
-<span class="sourceLineNo">240</span>      peersSelected = true;<a name="line.240"></a>
-<span class="sourceLineNo">241</span>    }<a name="line.241"></a>
-<span class="sourceLineNo">242</span><a name="line.242"></a>
-<span class="sourceLineNo">243</span>    int numSinks = replicationSinkMgr.getNumSinks();<a name="line.243"></a>
-<span class="sourceLineNo">244</span>    if (numSinks == 0) {<a name="line.244"></a>
-<span class="sourceLineNo">245</span>      LOG.warn("No replication sinks found, returning without replicating. The source should retry"<a name="line.245"></a>
-<span class="sourceLineNo">246</span>          + " with the same set of edits.");<a name="line.246"></a>
-<span class="sourceLineNo">247</span>      return false;<a name="line.247"></a>
-<span class="sourceLineNo">248</span>    }<a name="line.248"></a>
+<span class="sourceLineNo">228</span>  private void reconnectToPeerCluster() {<a name="line.228"></a>
+<span class="sourceLineNo">229</span>    ClusterConnection connection = null;<a name="line.229"></a>
+<span class="sourceLineNo">230</span>    try {<a name="line.230"></a>
+<span class="sourceLineNo">231</span>      connection = (ClusterConnection) ConnectionFactory.createConnection(this.conf);<a name="line.231"></a>
+<span class="sourceLineNo">232</span>    } catch (IOException ioe) {<a name="line.232"></a>
+<span class="sourceLineNo">233</span>      LOG.warn("Failed to create connection for peer cluster", ioe);<a name="line.233"></a>
+<span class="sourceLineNo">234</span>    }<a name="line.234"></a>
+<span class="sourceLineNo">235</span>    if (connection != null) {<a name="line.235"></a>
+<span class="sourceLineNo">236</span>      this.conn = connection;<a name="line.236"></a>
+<span class="sourceLineNo">237</span>    }<a name="line.237"></a>
+<span class="sourceLineNo">238</span>  }<a name="line.238"></a>
+<span class="sourceLineNo">239</span><a name="line.239"></a>
+<span class="sourceLineNo">240</span>  /**<a name="line.240"></a>
+<span class="sourceLineNo">241</span>   * Do the shipping logic<a name="line.241"></a>
+<span class="sourceLineNo">242</span>   */<a name="line.242"></a>
+<span class="sourceLineNo">243</span>  @Override<a name="line.243"></a>
+<span class="sourceLineNo">244</span>  public boolean replicate(ReplicateContext replicateContext) {<a name="line.244"></a>
+<span class="sourceLineNo">245</span>    CompletionService&lt;Integer&gt; pool = new ExecutorCompletionService&lt;&gt;(this.exec);<a name="line.245"></a>
+<span class="sourceLineNo">246</span>    List&lt;List&lt;Entry&gt;&gt; batches;<a name="line.246"></a>
+<span class="sourceLineNo">247</span>    String walGroupId = replicateContext.getWalGroupId();<a name="line.247"></a>
+<span class="sourceLineNo">248</span>    int sleepMultiplier = 1;<a name="line.248"></a>
 <span class="sourceLineNo">249</span><a name="line.249"></a>
-<span class="sourceLineNo">250</span>    batches = createBatches(replicateContext.getEntries());<a name="line.250"></a>
-<span class="sourceLineNo">251</span><a name="line.251"></a>
-<span class="sourceLineNo">252</span>    while (this.isRunning() &amp;&amp; !exec.isShutdown()) {<a name="line.252"></a>
-<span class="sourceLineNo">253</span>      if (!isPeerEnabled()) {<a name="line.253"></a>
-<span class="sourceLineNo">254</span>        if (sleepForRetries("Replication is disabled", sleepMultiplier)) {<a name="line.254"></a>
-<span class="sourceLineNo">255</span>          sleepMultiplier++;<a name="line.255"></a>
-<span class="sourceLineNo">256</span>        }<a name="line.256"></a>
-<span class="sourceLineNo">257</span>        continue;<a name="line.257"></a>
-<span class="sourceLineNo">258</span>      }<a name="line.258"></a>
-<span class="sourceLineNo">259</span>      try {<a name="line.259"></a>
-<span class="sourceLineNo">260</span>        int futures = 0;<a name="line.260"></a>
-<span class="sourceLineNo">261</span>        for (int i=0; i&lt;batches.size(); i++) {<a name="line.261"></a>
-<span class="sourceLineNo">262</span>          List&lt;Entry&gt; entries = batches.get(i);<a name="line.262"></a>
-<span class="sourceLineNo">263</span>          if (!entries.isEmpty()) {<a name="line.263"></a>
-<span class="sourceLineNo">264</span>            if (LOG.isTraceEnabled()) {<a name="line.264"></a>
-<span class="sourceLineNo">265</span>              LOG.trace("Submitting " + entries.size() +<a name="line.265"></a>
-<span class="sourceLineNo">266</span>                  " entries of total size " + replicateContext.getSize());<a name="line.266"></a>
-<span class="sourceLineNo">267</span>            }<a name="line.267"></a>
-<span class="sourceLineNo">268</span>            // RuntimeExceptions encountered here bubble up and are handled in ReplicationSource<a name="line.268"></a>
-<span class="sourceLineNo">269</span>            pool.submit(createReplicator(entries, i));<a name="line.269"></a>
-<span class="sourceLineNo">270</span>            futures++;<a name="line.270"></a>
-<span class="sourceLineNo">271</span>          }<a name="line.271"></a>
-<span class="sourceLineNo">272</span>        }<a name="line.272"></a>
-<span class="sourceLineNo">273</span>        IOException iox = null;<a name="line.273"></a>
-<span class="sourceLineNo">274</span><a name="line.274"></a>
-<span class="sourceLineNo">275</span>        long lastWriteTime = 0;<a name="line.275"></a>
-<span class="sourceLineNo">276</span>        for (int i=0; i&lt;futures; i++) {<a name="line.276"></a>
-<span class="sourceLineNo">277</span>          try {<a name="line.277"></a>
-<span class="sourceLineNo">278</span>            // wait for all futures, remove successful parts<a name="line.278"></a>
-<span class="sourceLineNo">279</span>            // (only the remaining parts will be retried)<a name="line.279"></a>
-<span class="sourceLineNo">280</span>            Future&lt;Integer&gt; f = pool.take();<a name="line.280"></a>
-<span class="sourceLineNo">281</span>            int index = f.get().intValue();<a name="line.281"></a>
-<span class="sourceLineNo">282</span>            List&lt;Entry&gt; batch = batches.get(index);<a name="line.282"></a>
-<span class="sourceLineNo">283</span>            batches.set(index, Collections.&lt;Entry&gt;emptyList()); // remove successful batch<a name="line.283"></a>
-<span class="sourceLineNo">284</span>            // Find the most recent write time in the batch<a name="line.284"></a>
-<span class="sourceLineNo">285</span>            long writeTime = batch.get(batch.size() - 1).getKey().getWriteTime();<a name="line.285"></a>
-<span class="sourceLineNo">286</span>            if (writeTime &gt; lastWriteTime) {<a name="line.286"></a>
-<span class="sourceLineNo">287</span>              lastWriteTime = writeTime;<a name="line.287"></a>
-<span class="sourceLineNo">288</span>            }<a name="line.288"></a>
-<span class="sourceLineNo">289</span>          } catch (InterruptedException ie) {<a name="line.289"></a>
-<span class="sourceLineNo">290</span>            iox =  new IOException(ie);<a name="line.290"></a>
-<span class="sourceLineNo">291</span>          } catch (ExecutionException ee) {<a name="line.291"></a>
-<span class="sourceLineNo">292</span>            // cause must be an IOException<a name="line.292"></a>
-<span class="sourceLineNo">293</span>            iox = (IOException)ee.getCause();<a name="line.293"></a>
-<span class="sourceLineNo">294</span>          }<a name="line.294"></a>
-<span class="sourceLineNo">295</span>        }<a name="line.295"></a>
-<span class="sourceLineNo">296</span>        if (iox != null) {<a name="line.296"></a>
-<span class="sourceLineNo">297</span>          // if we had any exceptions, try again<a name="line.297"></a>
-<span class="sourceLineNo">298</span>          throw iox;<a name="line.298"></a>
-<span class="sourceLineNo">299</span>        }<a name="line.299"></a>
-<span class="sourceLineNo">300</span>        // update metrics<a name="line.300"></a>
-<span class="sourceLineNo">301</span>        if (lastWriteTime &gt; 0) {<a name="line.301"></a>
-<span class="sourceLineNo">302</span>          this.metrics.setAgeOfLastShippedOp(lastWriteTime, walGroupId);<a name="line.302"></a>
-<span class="sourceLineNo">303</span>        }<a name="line.303"></a>
-<span class="sourceLineNo">304</span>        return true;<a name="line.304"></a>
-<span class="sourceLineNo">305</span><a name="line.305"></a>
-<span class="sourceLineNo">306</span>      } catch (IOException ioe) {<a name="line.306"></a>
-<span class="sourceLineNo">307</span>        // Didn't ship anything, but must still age the last time we did<a name="line.307"></a>
-<span class="sourceLineNo">308</span>        this.metrics.refreshAgeOfLastShippedOp(walGroupId);<a name="line.308"></a>
-<span class="sourceLineNo">309</span>        if (ioe instanceof RemoteException) {<a name="line.309"></a>
-<span class="sourceLineNo">310</span>          ioe = ((RemoteException) ioe).unwrapRemoteException();<a name="line.310"></a>
-<span class="sourceLineNo">311</span>          LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);<a name="line.311"></a>
-<span class="sourceLineNo">312</span>          if (ioe instanceof TableNotFoundException) {<a name="line.312"></a>
-<span class="sourceLineNo">313</span>            if (sleepForRetries("A table is missing in the peer cluster. "<a name="line.313"></a>
-<span class="sourceLineNo">314</span>                + "Replication cannot proceed without losing data.", sleepMultiplier)) {<a name="line.314"></a>
-<span class="sourceLineNo">315</span>              sleepMultiplier++;<a name="line.315"></a>
-<span class="sourceLineNo">316</span>            }<a name="line.316"></a>
-<span class="sourceLineNo">317</span>          } else {<a name="line.317"></a>
-<span class="sourceLineNo">318</span>            LOG.warn("Peer encountered RemoteException, rechecking all sinks: ", ioe);<a name="line.318"></a>
-<span class="sourceLineNo">319</span>            replicationSinkMgr.chooseSinks();<a name="line.319"></a>
-<span class="sourceLineNo">320</span>          }<a name="line.320"></a>
-<span class="sourceLineNo">321</span>        } else {<a name="line.321"></a>
-<span class="sourceLineNo">322</span>          if (ioe instanceof SocketTimeoutException) {<a name="line.322"></a>
-<span class="sourceLineNo">323</span>            // This exception means we waited for more than 60s and nothing<a name="line.323"></a>
-<span class="sourceLineNo">324</span>            // happened, the cluster is alive and calling it right away<a name="line.324"></a>
-<span class="sourceLineNo">325</span>            // even for a test just makes things worse.<a name="line.325"></a>
-<span class="sourceLineNo">326</span>            sleepForRetries("Encountered a SocketTimeoutException. Since the " +<a name="line.326"></a>
-<span class="sourceLineNo">327</span>              "call to the remote cluster timed out, which is usually " +<a name="line.327"></a>
-<span class="sourceLineNo">328</span>              "caused by a machine failure or a massive slowdown",<a name="line.328"></a>
-<span class="sourceLineNo">329</span>              this.socketTimeoutMultiplier);<a name="line.329"></a>
-<span class="sourceLineNo">330</span>          } else if (ioe instanceof ConnectException) {<a name="line.330"></a>
-<span class="sourceLineNo">331</span>            LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);<a name="line.331"></a>
-<span class="sourceLineNo">332</span>            replicationSinkMgr.chooseSinks();<a name="line.332"></a>
-<span class="sourceLineNo">333</span>          } else {<a name="line.333"></a>
-<span class="sourceLineNo">334</span>            LOG.warn("Can't replicate because of a local or network error: ", ioe);<a name="line.334"></a>
+<span class="sourceLineNo">250</span>    if (!peersSelected &amp;&amp; this.isRunning()) {<a name="line.250"></a>
+<span class="sourceLineNo">251</span>      connectToPeers();<a name="line.251"></a>
+<span class="sourceLineNo">252</span>      peersSelected = true;<a name="line.252"></a>
+<span class="sourceLineNo">253</span>    }<a name="line.253"></a>
+<span class="sourceLineNo">254</span><a name="line.254"></a>
+<span class="sourceLineNo">255</span>    int numSinks = replicationSinkMgr.getNumSinks();<a name="line.255"></a>
+<span class="sourceLineNo">256</span>    if (numSinks == 0) {<a name="line.256"></a>
+<span class="sourceLineNo">257</span>      LOG.warn("No replication sinks found, returning without replicating. The source should retry"<a name="line.257"></a>
+<span class="sourceLineNo">258</span>          + " with the same set of edits.");<a name="line.258"></a>
+<span class="sourceLineNo">259</span>      return false;<a name="line.259"></a>
+<span class="sourceLineNo">260</span>    }<a name="line.260"></a>
+<span class="sourceLineNo">261</span><a name="line.261"></a>
+<span class="sourceLineNo">262</span>    batches = createBatches(replicateContext.getEntries());<a name="line.262"></a>
+<span class="sourceLineNo">263</span><a name="line.263"></a>
+<span class="sourceLineNo">264</span>    while (this.isRunning() &amp;&amp; !exec.isShutdown()) {<a name="line.264"></a>
+<span class="sourceLineNo">265</span>      if (!isPeerEnabled()) {<a name="line.265"></a>
+<span class="sourceLineNo">266</span>        if (sleepForRetries("Replication is disabled", sleepMultiplier)) {<a name="line.266"></a>
+<span class="sourceLineNo">267</span>          sleepMultiplier++;<a name="line.267"></a>
+<span class="sourceLineNo">268</span>        }<a name="line.268"></a>
+<span class="sourceLineNo">269</span>        continue;<a name="line.269"></a>
+<span class="sourceLineNo">270</span>      }<a name="line.270"></a>
+<span class="sourceLineNo">271</span>      if (this.conn == null || this.conn.isClosed()) {<a name="line.271"></a>
+<span class="sourceLineNo">272</span>        reconnectToPeerCluster();<a name="line.272"></a>
+<span class="sourceLineNo">273</span>      }<a name="line.273"></a>
+<span class="sourceLineNo">274</span>      try {<a name="line.274"></a>
+<span class="sourceLineNo">275</span>        int futures = 0;<a name="line.275"></a>
+<span class="sourceLineNo">276</span>        for (int i=0; i&lt;batches.size(); i++) {<a name="line.276"></a>
+<span class="sourceLineNo">277</span>          List&lt;Entry&gt; entries = batches.get(i);<a name="line.277"></a>
+<span class="sourceLineNo">278</span>          if (!entries.isEmpty()) {<a name="line.278"></a>
+<span class="sourceLineNo">279</span>            if (LOG.isTraceEnabled()) {<a name="line.279"></a>
+<span class="sourceLineNo">280</span>              LOG.trace("Submitting " + entries.size() +<a name="line.280"></a>
+<span class="sourceLineNo">281</span>                  " entries of total size " + replicateContext.getSize());<a name="line.281"></a>
+<span class="sourceLineNo">282</span>            }<a name="line.282"></a>
+<span class="sourceLineNo">283</span>            // RuntimeExceptions encountered here bubble up and are handled in ReplicationSource<a name="line.283"></a>
+<span class="sourceLineNo">284</span>            pool.submit(createReplicator(entries, i));<a name="line.284"></a>
+<span class="sourceLineNo">285</span>            futures++;<a name="line.285"></a>
+<span class="sourceLineNo">286</span>          }<a name="line.286"></a>
+<span class="sourceLineNo">287</span>        }<a name="line.287"></a>
+<span class="sourceLineNo">288</span>        IOException iox = null;<a name="line.288"></a>
+<span class="sourceLineNo">289</span><a name="line.289"></a>
+<span class="sourceLineNo">290</span>        long lastWriteTime = 0;<a name="line.290"></a>
+<span class="sourceLineNo">291</span>        for (int i=0; i&lt;futures; i++) {<a name="line.291"></a>
+<span class="sourceLineNo">292</span>          try {<a name="line.292"></a>
+<span class="sourceLineNo">293</span>            // wait for all futures, remove successful parts<a name="line.293"></a>
+<span class="sourceLineNo">294</span>            // (only the remaining parts will be retried)<a name="line.294"></a>
+<span class="sourceLineNo">295</span>            Future&lt;Integer&gt; f = pool.take();<a name="line.295"></a>
+<span class="sourceLineNo">296</span>            int index = f.get().intValue();<a name="line.296"></a>
+<span class="sourceLineNo">297</span>            List&lt;Entry&gt; batch = batches.get(index);<a name="line.297"></a>
+<span class="sourceLineNo">298</span>            batches.set(index, Collections.&lt;Entry&gt;emptyList()); // remove successful batch<a name="line.298"></a>
+<span class="sourceLineNo">299</span>            // Find the most recent write time in the batch<a name="line.299"></a>
+<span class="sourceLineNo">300</span>            long writeTime = batch.get(batch.size() - 1).getKey().getWriteTime();<a name="line.300"></a>
+<span class="sourceLineNo">301</span>            if (writeTime &gt; lastWriteTime) {<a name="line.301"></a>
+<span class="sourceLineNo">302</span>              lastWriteTime = writeTime;<a name="line.302"></a>
+<span class="sourceLineNo">303</span>            }<a name="line.303"></a>
+<span class="sourceLineNo">304</span>          } catch (InterruptedException ie) {<a name="line.304"></a>
+<span class="sourceLineNo">305</span>            iox =  new IOException(ie);<a name="line.305"></a>
+<span class="sourceLineNo">306</span>          } catch (ExecutionException ee) {<a name="line.306"></a>
+<span class="sourceLineNo">307</span>            // cause must be an IOException<a name="line.307"></a>
+<span class="sourceLineNo">308</span>            iox = (IOException)ee.getCause();<a name="line.308"></a>
+<span class="sourceLineNo">309</span>          }<a name="line.309"></a>
+<span class="sourceLineNo">310</span>        }<a name="line.310"></a>
+<span class="sourceLineNo">311</span>        if (iox != null) {<a name="line.311"></a>
+<span class="sourceLineNo">312</span>          // if we had any exceptions, try again<a name="line.312"></a>
+<span class="sourceLineNo">313</span>          throw iox;<a name="line.313"></a>
+<span class="sourceLineNo">314</span>        }<a name="line.314"></a>
+<span class="sourceLineNo">315</span>        // update metrics<a name="line.315"></a>
+<span class="sourceLineNo">316</span>        if (lastWriteTime &gt; 0) {<a name="line.316"></a>
+<span class="sourceLineNo">317</span>          this.metrics.setAgeOfLastShippedOp(lastWriteTime, walGroupId);<a name="line.317"></a>
+<span class="sourceLineNo">318</span>        }<a name="line.318"></a>
+<span class="sourceLineNo">319</span>        return true;<a name="line.319"></a>
+<span class="sourceLineNo">320</span><a name="line.320"></a>
+<span class="sourceLineNo">321</span>      } catch (IOException ioe) {<a name="line.321"></a>
+<span class="sourceLineNo">322</span>        // Didn't ship anything, but must still age the last time we did<a name="line.322"></a>
+<span class="sourceLineNo">323</span>        this.metrics.refreshAgeOfLastShippedOp(walGroupId);<a name="line.323"></a>
+<span class="sourceLineNo">324</span>        if (ioe instanceof RemoteException) {<a name="line.324"></a>
+<span class="sourceLineNo">325</span>          ioe = ((RemoteException) ioe).unwrapRemoteException();<a name="line.325"></a>
+<span class="sourceLineNo">326</span>          LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);<a name="line.326"></a>
+<span class="sourceLineNo">327</span>          if (ioe instanceof TableNotFoundException) {<a name="line.327"></a>
+<span class="sourceLineNo">328</span>            if (sleepForRetries("A table is missing in the peer cluster. "<a name="line.328"></a>
+<span class="sourceLineNo">329</span>                + "Replication cannot proceed without losing data.", sleepMultiplier)) {<a name="line.329"></a>
+<span class="sourceLineNo">330</span>              sleepMultiplier++;<a name="line.330"></a>
+<span class="sourceLineNo">331</span>            }<a name="line.331"></a>
+<span class="sourceLineNo">332</span>          } else {<a name="line.332"></a>
+<span class="sourceLineNo">333</span>            LOG.warn("Peer encountered RemoteException, rechecking all sinks: ", ioe);<a name="line.333"></a>
+<span class="sourceLineNo">334</span>            replicationSinkMgr.chooseSinks();<a name="line.334"></a>
 <span class="sourceLineNo">335</span>          }<a name="line.335"></a>
-<span class="sourceLineNo">336</span>        }<a name="line.336"></a>
-<span class="sourceLineNo">337</span>        if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {<a name="line.337"></a>
-<span class="sourceLineNo">338</span>          sleepMultiplier++;<a name="line.338"></a>
-<span class="sourceLineNo">339</span>        }<a name="line.339"></a>
-<span class="sourceLineNo">340</span>      }<a name="line.340"></a>
-<span class="sourceLineNo">341</span>    }<a name="line.341"></a>
-<span class="sourceLineNo">342</span>    return false; // in case we exited before replicating<a name="line.342"></a>
-<span class="sourceLineNo">343</span>  }<a name="line.343"></a>
-<span class="sourceLineNo">344</span><a name="line.344"></a>
-<span class="sourceLineNo">345</span>  protected boolean isPeerEnabled() {<a name="line.345"></a>
-<span class="sourceLineNo">346</span>    return ctx.getReplicationPeer().getPeerState() == PeerState.ENABLED;<a name="line.346"></a>
-<span class="sourceLineNo">347</span>  }<a name="line.347"></a>
-<span class="sourceLineNo">348</span><a name="line.348"></a>
-<span class="sourceLineNo">349</span>  @Override<a name="line.349"></a>
-<span class="sourceLineNo">350</span>  protected void doStop() {<a name="line.350"></a>
-<span class="sourceLineNo">351</span>    disconnect(); //don't call super.doStop()<a name="line.351"></a>
-<span class="sourceLineNo">352</span>    if (this.conn != null) {<a name="line.352"></a>
-<span class="sourceLineNo">353</span>      try {<a name="line.353"></a>
-<span class="sourceLineNo">354</span>        this.conn.close();<a name="line.354"></a>
-<span class="sourceLineNo">355</span>        this.conn = null;<a name="line.355"></a>
-<span class="sourceLineNo">356</span>      } catch (IOException e) {<a name="line.356"></a>
-<span class="sourceLineNo">357</span>        LOG.warn("Failed to close the connection");<a name="line.357"></a>
-<span class="sourceLineNo">358</span>      }<a name="line.358"></a>
-<span class="sourceLineNo">359</span>    }<a name="line.359"></a>
-<span class="sourceLineNo">360</span>    // Allow currently running replication tasks to finish<a name="line.360"></a>
-<span class="sourceLineNo">361</span>    exec.shutdown();<a name="line.361"></a>
-<span class="sourceLineNo">362</span>    try {<a name="line.362"></a>
-<span class="sourceLineNo">363</span>      exec.awaitTermination(maxTerminationWait, TimeUnit.MILLISECONDS);<a name="line.363"></a>
-<span class="sourceLineNo">364</span>    } catch (InterruptedException e) {<a name="line.364"></a>
-<span class="sourceLineNo">365</span>    }<a name="line.365"></a>
-<span class="sourceLineNo">366</span>    // Abort if the tasks did not terminate in time<a name="line.366"></a>
-<span class="sourceLineNo">367</span>    if (!exec.isTerminated()) {<a name="line.367"></a>
-<span class="sourceLineNo">368</span>      String errMsg = "HBaseInterClusterReplicationEndpoint termination failed. The " +<a name="line.368"></a>
-<span class="sourceLineNo">369</span>          "ThreadPoolExecutor failed to finish all tasks within " + maxTerminationWait + "ms. " +<a name="line.369"></a>
-<span class="sourceLineNo">370</span>          "Aborting to prevent Replication from deadlocking. See HBASE-16081.";<a name="line.370"></a>
-<span class="sourceLineNo">371</span>      abortable.abort(errMsg, new IOException(errMsg));<a name="line.371"></a>
-<span class="sourceLineNo">372</span>    }<a name="line.372"></a>
-<span class="sourceLineNo">373</span>    notifyStopped();<a name="line.373"></a>
-<span class="sourceLineNo">374</span>  }<a name="line.374"></a>
-<span class="sourceLineNo">375</span><a name="line.375"></a>
-<span class="sourceLineNo">376</span>  // is this needed? Nobody else will call doStop() otherwise<a name="line.376"></a>
-<span class="sourceLineNo">377</span>  @Override<a name="line.377"></a>
-<span class="sourceLineNo">378</span>  public State stopAndWait() {<a name="line.378"></a>
-<span class="sourceLineNo">379</span>    doStop();<a name="line.379"></a>
-<span class="sourceLineNo">380</span>    return super.stopAndWait();<a name="line.380"></a>
-<span class="sourceLineNo">381</span>  }<a name="line.381"></a>
-<span class="sourceLineNo">382</span><a name="line.382"></a>
-<span class="sourceLineNo">383</span>  @VisibleForTesting<a name="line.383"></a>
-<span class="sourceLineNo">384</span>  protected Replicator createReplicator(List&lt;Entry&gt; entries, int ordinal) {<a name="line.384"></a>
-<span class="sourceLineNo">385</span>    return new Replicator(entries, ordinal);<a name="line.385"></a>
-<span class="sourceLineNo">386</span>  }<a name="line.386"></a>
-<span class="sourceLineNo">387</span><a name="line.387"></a>
-<span class="sourceLineNo">388</span>  @VisibleForTesting<a name="line.388"></a>
-<span class="sourceLineNo">389</span>  protected class Replicator implements Callable&lt;Integer&gt; {<a name="line.389"></a>
-<span class="sourceLineNo">390</span>    private List&lt;Entry&gt; entries;<a name="line.390"></a>
-<span class="sourceLineNo">391</span>    private int ordinal;<a name="line.391"></a>
-<span class="sourceLineNo">392</span>    public Replicator(List&lt;Entry&gt; entries, int ordinal) {<a name="line.392"></a>
-<span class="sourceLineNo">393</span>      this.entries = entries;<a name="line.393"></a>
-<span class="sourceLineNo">394</span>      this.ordinal = ordinal;<a name="line.394"></a>
-<span class="sourceLineNo">395</span>    }<a name="line.395"></a>
-<span class="sourceLineNo">396</span><a name="line.396"></a>
-<span class="sourceLineNo">397</span>    protected void replicateEntries(BlockingInterface rrs, final List&lt;Entry&gt; batch,<a name="line.397"></a>
-<span class="sourceLineNo">398</span>        String replicationClusterId, Path baseNamespaceDir, Path hfileArchiveDir)<a name="line.398"></a>
-<span class="sourceLineNo">399</span>        throws IOException {<a name="line.399"></a>
-<span class="sourceLineNo">400</span>      if (LOG.isTraceEnabled()) {<a name="line.400"></a>
-<span class="sourceLineNo">401</span>        long size = 0;<a name="line.401"></a>
-<span class="sourceLineNo">402</span>        for (Entry e: entries) {<a name="line.402"></a>
-<span class="sourceLineNo">403</span>          size += e.getKey().estimatedSerializedSizeOf();<a name="line.403"></a>
-<span class="sourceLineNo">404</span>          size += e.getEdit().estimatedSerializedSizeOf();<a name="line.404"></a>
-<span class="sourceLineNo">405</span>        }<a name="line.405"></a>
-<span class="sourceLineNo">406</span>        LOG.trace("Replicating batch " + System.identityHashCode(entries) + " of " +<a name="line.406"></a>
-<span class="sourceLineNo">407</span>            entries.size() + " entries with total size " + size + " bytes to " +<a name="line.407"></a>
-<span class="sourceLineNo">408</span>            replicationClusterId);<a name="line.408"></a>
-<span class="sourceLineNo">409</span>      }<a name="line.409"></a>
-<span class="sourceLineNo">410</span>      try {<a name="line.410"></a>
-<span class="sourceLineNo">411</span>        ReplicationProtbufUtil.replicateWALEntry(rrs, batch.toArray(new Entry[batch.size()]),<a name="line.411"></a>
-<span class="sourceLineNo">412</span>          replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.412"></a>
-<span class="sourceLineNo">413</span>        if (LOG.isTraceEnabled()) {<a name="line.413"></a>
-<span class="sourceLineNo">414</span>          LOG.trace("Completed replicating batch " + System.identityHashCode(entries));<a name="line.414"></a>
-<span class="sourceLineNo">415</span>        }<a name="line.415"></a>
-<span class="sourceLineNo">416</span>      } catch (IOException e) {<a name="line.416"></a>
-<span class="sourceLineNo">417</span>        if (LOG.isTraceEnabled()) {<a name="line.417"></a>
-<span class="sourceLineNo">418</span>          LOG.trace("Failed replicating batch " + System.identityHashCode(entries), e);<a name="line.418"></a>
-<span class="sourceLineNo">419</span>        }<a name="line.419"></a>
-<span class="sourceLineNo">420</span>        throw e;<a name="line.420"></a>
-<span class="sourceLineNo">421</span>      }<a name="line.421"></a>
-<span class="sourceLineNo">422</span>    }<a name="line.422"></a>
-<span class="sourceLineNo">423</span><a name="line.423"></a>
-<span class="sourceLineNo">424</span>    @Override<a name="line.424"></a>
-<span class="sourceLineNo">425</span>    public Integer call() throws IOException {<a name="line.425"></a>
-<span class="sourceLineNo">426</span>      SinkPeer sinkPeer = null;<a name="line.426"></a>
-<span class="sourceLineNo">427</span>      try {<a name="line.427"></a>
-<span class="sourceLineNo">428</span>        sinkPeer = replicationSinkMgr.getReplicationSink();<a name="line.428"></a>
-<span class="sourceLineNo">429</span>        BlockingInterface rrs = sinkPeer.getRegionServer();<a name="line.429"></a>
-<span class="sourceLineNo">430</span>        replicateEntries(rrs, entries, replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.430"></a>
-<span class="sourceLineNo">431</span>        replicationSinkMgr.reportSinkSuccess(sinkPeer);<a name="line.431"></a>
-<span class="sourceLineNo">432</span>        return ordinal;<a name="line.432"></a>
-<span class="sourceLineNo">433</span>      } catch (IOException ioe) {<a name="line.433"></a>
-<span class="sourceLineNo">434</span>        if (sinkPeer != null) {<a name="line.434"></a>
-<span class="sourceLineNo">435</span>          replicationSinkMgr.reportBadSink(sinkPeer);<a name="line.435"></a>
-<span class="sourceLineNo">436</span>        }<a name="line.436"></a>
-<span class="sourceLineNo">437</span>        throw ioe;<a name="line.437"></a>
-<span class="sourceLineNo">438</span>      }<a name="line.438"></a>
-<span class="sourceLineNo">439</span>    }<a name="line.439"></a>
-<span class="sourceLineNo">440</span>  }<a name="line.440"></a>
-<span class="sourceLineNo">441</span>}<a name="line.441"></a>
+<span class="sourceLineNo">336</span>        } else {<a name="line.336"></a>
+<span class="sourceLineNo">337</span>          if (ioe instanceof SocketTimeoutException) {<a name="line.337"></a>
+<span class="sourceLineNo">338</span>            // This exception means we waited for more than 60s and nothing<a name="line.338"></a>
+<span class="sourceLineNo">339</span>            // happened, the cluster is alive and calling it right away<a name="line.339"></a>
+<span class="sourceLineNo">340</span>            // even for a test just makes things worse.<a name="line.340"></a>
+<span class="sourceLineNo">341</span>            sleepForRetries("Encountered a SocketTimeoutException. Since the " +<a name="line.341"></a>
+<span class="sourceLineNo">342</span>              "call to the remote cluster timed out, which is usually " +<a name="line.342"></a>
+<span class="sourceLineNo">343</span>              "caused by a machine failure or a massive slowdown",<a name="line.343"></a>
+<span class="sourceLineNo">344</span>              this.socketTimeoutMultiplier);<a name="line.344"></a>
+<span class="sourceLineNo">345</span>          } else if (ioe instanceof ConnectException) {<a name="line.345"></a>
+<span class="sourceLineNo">346</span>            LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);<a name="line.346"></a>
+<span class="sourceLineNo">347</span>            replicationSinkMgr.chooseSinks();<a name="line.347"></a>
+<span class="sourceLineNo">348</span>          } else {<a name="line.348"></a>
+<span class="sourceLineNo">349</span>            LOG.warn("Can't replicate because of a local or network error: ", ioe);<a name="line.349"></a>
+<span class="sourceLineNo">350</span>          }<a name="line.350"></a>
+<span class="sourceLineNo">351</span>        }<a name="line.351"></a>
+<span class="sourceLineNo">352</span>        if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {<a name="line.352"></a>
+<span class="sourceLineNo">353</span>          sleepMultiplier++;<a name="line.353"></a>
+<span class="sourceLineNo">354</span>        }<a name="line.354"></a>
+<span class="sourceLineNo">355</span>      }<a name="line.355"></a>
+<span class="sourceLineNo">356</span>    }<a name="line.356"></a>
+<span class="sourceLineNo">357</span>    return false; // in case we exited before replicating<a name="line.357"></a>
+<span class="sourceLineNo">358</span>  }<a name="line.358"></a>
+<span class="sourceLineNo">359</span><a name="line.359"></a>
+<span class="sourceLineNo">360</span>  protected boolean isPeerEnabled() {<a name="line.360"></a>
+<span class="sourceLineNo">361</span>    return ctx.getReplicationPeer().getPeerState() == PeerState.ENABLED;<a name="line.361"></a>
+<span class="sourceLineNo">362</span>  }<a name="line.362"></a>
+<span class="sourceLineNo">363</span><a name="line.363"></a>
+<span class="sourceLineNo">364</span>  @Override<a name="line.364"></a>
+<span class="sourceLineNo">365</span>  protected void doStop() {<a name="line.365"></a>
+<span class="sourceLineNo">366</span>    disconnect(); //don't call super.doStop()<a name="line.366"></a>
+<span class="sourceLineNo">367</span>    if (this.conn != null) {<a name="line.367"></a>
+<span class="sourceLineNo">368</span>      try {<a name="line.368"></a>
+<span class="sourceLineNo">369</span>        this.conn.close();<a name="line.369"></a>
+<span class="sourceLineNo">370</span>        this.conn = null;<a name="line.370"></a>
+<span class="sourceLineNo">371</span>      } catch (IOException e) {<a name="line.371"></a>
+<span class="sourceLineNo">372</span>        LOG.warn("Failed to close the connection");<a name="line.372"></a>
+<span class="sourceLineNo">373</span>      }<a name="line.373"></a>
+<span class="sourceLineNo">374</span>    }<a name="line.374"></a>
+<span class="sourceLineNo">375</span>    // Allow currently running replication tasks to finish<a name="line.375"></a>
+<span class="sourceLineNo">376</span>    exec.shutdown();<a name="line.376"></a>
+<span class="sourceLineNo">377</span>    try {<a name="line.377"></a>
+<span class="sourceLineNo">378</span>      exec.awaitTermination(maxTerminationWait, TimeUnit.MILLISECONDS);<a name="line.378"></a>
+<span class="sourceLineNo">379</span>    } catch (InterruptedException e) {<a name="line.379"></a>
+<span class="sourceLineNo">380</span>    }<a name="line.380"></a>
+<span class="sourceLineNo">381</span>    // Abort if the tasks did not terminate in time<a name="line.381"></a>
+<span class="sourceLineNo">382</span>    if (!exec.isTerminated()) {<a name="line.382"></a>
+<span class="sourceLineNo">383</span>      String errMsg = "HBaseInterClusterReplicationEndpoint termination failed. The " +<a name="line.383"></a>
+<span class="sourceLineNo">384</span>          "ThreadPoolExecutor failed to finish all tasks within " + maxTerminationWait + "ms. " +<a name="line.384"></a>
+<span class="sourceLineNo">385</span>          "Aborting to prevent Replication from deadlocking. See HBASE-16081.";<a name="line.385"></a>
+<span class="sourceLineNo">386</span>      abortable.abort(errMsg, new IOException(errMsg));<a name="line.386"></a>
+<span class="sourceLineNo">387</span>    }<a name="line.387"></a>
+<span class="sourceLineNo">388</span>    notifyStopped();<a name="line.388"></a>
+<span class="sourceLineNo">389</span>  }<a name="line.389"></a>
+<span class="sourceLineNo">390</span><a name="line.390"></a>
+<span class="sourceLineNo">391</span>  // is this needed? Nobody else will call doStop() otherwise<a name="line.391"></a>
+<span class="sourceLineNo">392</span>  @Override<a name="line.392"></a>
+<span class="sourceLineNo">393</span>  public State stopAndWait() {<a name="line.393"></a>
+<span class="sourceLineNo">394</span>    doStop();<a name="line.394"></a>
+<span class="sourceLineNo">395</span>    return super.stopAndWait();<a name="line.395"></a>
+<span class="sourceLineNo">396</span>  }<a name="line.396"></a>
+<span class="sourceLineNo">397</span><a name="line.397"></a>
+<span class="sourceLineNo">398</span>  @VisibleForTesting<a name="line.398"></a>
+<span class="sourceLineNo">399</span>  protected Replicator createReplicator(List&lt;Entry&gt; entries, int ordinal) {<a name="line.399"></a>
+<span class="sourceLineNo">400</span>    return new Replicator(entries, ordinal);<a name="line.400"></a>
+<span class="sourceLineNo">401</span>  }<a name="line.401"></a>
+<span class="sourceLineNo">402</span><a name="line.402"></a>
+<span class="sourceLineNo">403</span>  @VisibleForTesting<a name="line.403"></a>
+<span class="sourceLineNo">404</span>  protected class Replicator implements Callable&lt;Integer&gt; {<a name="line.404"></a>
+<span class="sourceLineNo">405</span>    private List&lt;Entry&gt; entries;<a name="line.405"></a>
+<span class="sourceLineNo">406</span>    private int ordinal;<a name="line.406"></a>
+<span class="sourceLineNo">407</span>    public Replicator(List&lt;Entry&gt; entries, int ordinal) {<a name="line.407"></a>
+<span class="sourceLineNo">408</span>      this.entries = entries;<a name="line.408"></a>
+<span class="sourceLineNo">409</span>      this.ordinal = ordinal;<a name="line.409"></a>
+<span class="sourceLineNo">410</span>    }<a name="line.410"></a>
+<span class="sourceLineNo">411</span><a name="line.411"></a>
+<span class="sourceLineNo">412</span>    protected void replicateEntries(BlockingInterface rrs, final List&lt;Entry&gt; batch,<a name="line.412"></a>
+<span class="sourceLineNo">413</span>        String replicationClusterId, Path baseNamespaceDir, Path hfileArchiveDir)<a name="line.413"></a>
+<span class="sourceLineNo">414</span>        throws IOException {<a name="line.414"></a>
+<span class="sourceLineNo">415</span>      if (LOG.isTraceEnabled()) {<a name="line.415"></a>
+<span class="sourceLineNo">416</span>        long size = 0;<a name="line.416"></a>
+<span class="sourceLineNo">417</span>        for (Entry e: entries) {<a name="line.417"></a>
+<span class="sourceLineNo">418</span>          size += e.getKey().estimatedSerializedSizeOf();<a name="line.418"></a>
+<span class="sourceLineNo">419</span>          size += e.getEdit().estimatedSerializedSizeOf();<a name="line.419"></a>
+<span class="sourceLineNo">420</span>        }<a name="line.420"></a>
+<span class="sourceLineNo">421</span>        LOG.trace("Replicating batch " + System.identityHashCode(entries) + " of " +<a name="line.421"></a>
+<span class="sourceLineNo">422</span>            entries.size() + " entries with total size " + size + " bytes to " +<a name="line.422"></a>
+<span class="sourceLineNo">423</span>            replicationClusterId);<a name="line.423"></a>
+<span class="sourceLineNo">424</span>      }<a name="line.424"></a>
+<span class="sourceLineNo">425</span>      try {<a name="line.425"></a>
+<span class="sourceLineNo">426</span>        ReplicationProtbufUtil.replicateWALEntry(rrs, batch.toArray(new Entry[batch.size()]),<a name="line.426"></a>
+<span class="sourceLineNo">427</span>          replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.427"></a>
+<span class="sourceLineNo">428</span>        if (LOG.isTraceEnabled()) {<a name="line.428"></a>
+<span class="sourceLineNo">429</span>          LOG.trace("Completed replicating batch " + System.identityHashCode(entries));<a name="line.429"></a>
+<span class="sourceLineNo">430</span>        }<a name="line.430"></a>
+<span class="sourceLineNo">431</span>      } catch (IOException e) {<a name="line.431"></a>
+<span class="sourceLineNo">432</span>        if (LOG.isTraceEnabled()) {<a name="line.432"></a>
+<span class="sourceLineNo">433</span>          LOG.trace("Failed replicating batch " + System.identityHashCode(entries), e);<a name="line.433"></a>
+<span class="sourceLineNo">434</span>        }<a name="line.434"></a>
+<span class="sourceLineNo">435</span>        throw e;<a name="line.435"></a>
+<span class="sourceLineNo">436</span>      }<a name="line.436"></a>
+<span class="sourceLineNo">437</span>    }<a name="line.437"></a>
+<span class="sourceLineNo">438</span><a name="line.438"></a>
+<span class="sourceLineNo">439</span>    @Override<a name="line.439"></a>
+<span class="sourceLineNo">440</span>    public Integer call() throws IOException {<a name="line.440"></a>
+<span class="sourceLineNo">441</span>      SinkPeer sinkPeer = null;<a name="line.441"></a>
+<span class="sourceLineNo">442</span>      try {<a name="line.442"></a>
+<span class="sourceLineNo">443</span>        sinkPeer = replicationSinkMgr.getReplicationSink();<a name="line.443"></a>
+<span class="sourceLineNo">444</span>        BlockingInterface rrs = sinkPeer.getRegionServer();<a name="line.444"></a>
+<span class="sourceLineNo">445</span>        replicateEntries(rrs, entries, replicationClusterId, baseNamespaceDir, hfileArchiveDir);<a name="line.445"></a>
+<span class="sourceLineNo">446</span>        replicationSinkMgr.reportSinkSuccess(sinkPeer);<a name="line.446"></a>
+<span class="sourceLineNo">447</span>        return ordinal;<a name="line.447"></a>
+<span class="sourceLineNo">448</span>      } catch (IOException ioe) {<a name="line.448"></a>
+<span class="sourceLineNo">449</span>        if (sinkPeer != null) {<a name="line.449"></a>
+<span class="sourceLineNo">450</span>          replicationSinkMgr.reportBadSink(sinkPeer);<a name="line.450"></a>
+<span class="sourceLineNo">451</span>        }<a name="line.451"></a>
+<span class="sourceLineNo">452</span>        throw ioe;<a name="line.452"></a>
+<span class="sourceLineNo">453</span>      }<a name="line.453"></a>
+<span class="sourceLineNo">454</span>    }<a name="line.454"></a>
+<span class="sourceLineNo">455</span>  }<a name="line.455"></a>
+<span class="sourceLineNo">456</span>}<a name="line.456"></a>