You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tinkerpop.apache.org by sp...@apache.org on 2021/04/20 14:32:19 UTC

[tinkerpop] branch TINKERPOP-2550 created (now b954df8)

This is an automated email from the ASF dual-hosted git repository.

spmallette pushed a change to branch TINKERPOP-2550
in repository https://gitbox.apache.org/repos/asf/tinkerpop.git.


      at b954df8  TINKERPOP-2550 Fixed bug in java Client initialization

This branch includes the following new commits:

     new b954df8  TINKERPOP-2550 Fixed bug in java Client initialization

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[tinkerpop] 01/01: TINKERPOP-2550 Fixed bug in java Client initialization

Posted by sp...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

spmallette pushed a commit to branch TINKERPOP-2550
in repository https://gitbox.apache.org/repos/asf/tinkerpop.git

commit b954df8e123f3362641deacf9822459ac741fe98
Author: Stephen Mallette <st...@amazon.com>
AuthorDate: Tue Apr 20 10:31:08 2021 -0400

    TINKERPOP-2550 Fixed bug in java Client initialization
---
 CHANGELOG.asciidoc                                 |  1 +
 .../apache/tinkerpop/gremlin/driver/Client.java    | 25 +++++++---
 .../tinkerpop/gremlin/driver/ConnectionPool.java   | 17 ++-----
 .../gremlin/driver/SimpleSocketServer.java         |  2 +-
 .../WebSocketClientBehaviorIntegrateTest.java      | 58 +++++++++++++++++++++-
 5 files changed, 81 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc
index b5c5e21..aa0dee6 100644
--- a/CHANGELOG.asciidoc
+++ b/CHANGELOG.asciidoc
@@ -34,6 +34,7 @@ image::https://raw.githubusercontent.com/apache/tinkerpop/master/docs/static/ima
 * Exposed barrier size with getter for `NoOpBarrierStep`.
 * Bumped to Netty 4.1.61.
 * Added `max_content_length` as a Python driver setting.
+* Fixed bug in Java `Client` initialization where certain configurations might produce a deadlock.
 * Ensured that `barrier()` additions by strategies were controlled solely by `LazyBarrierStrategy`.
 * Fixed `NullPointerException` in `ResponseMessage` deserialization for GraphSON.
 * Enabled the Gremlin.Net driver to repair its connection pool after the server was temporarily unavailable.
diff --git a/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/Client.java b/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/Client.java
index af4ab31..7e23eed 100644
--- a/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/Client.java
+++ b/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/Client.java
@@ -18,6 +18,8 @@
  */
 package org.apache.tinkerpop.gremlin.driver;
 
+import org.apache.commons.lang.exception.ExceptionUtils;
+import org.apache.commons.lang3.concurrent.BasicThreadFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.tinkerpop.gremlin.driver.exception.ConnectionException;
@@ -41,6 +43,8 @@ import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.CompletionException;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicReference;
@@ -511,19 +515,24 @@ public abstract class Client {
          */
         @Override
         protected void initializeImplementation() {
+            // use a special executor here to initialize the Host instances as the worker thread pool may be
+            // insufficiently sized for this task and the parallel initialization of the ConnectionPool. if too small
+            // tasks may be schedule in such a way as to produce a deadlock: TINKERPOP-2550
+            //
+            // the cost of this single threaded executor here should be fairly small because it is only used once at
+            // initialization and shutdown. since users will typically construct a Client once for the life of their
+            // application there shouldn't be tons of thread pools being created and destroyed.
+            final BasicThreadFactory threadFactory = new BasicThreadFactory.Builder().namingPattern("gremlin-driver-init-%d").build();
+            final ExecutorService hostExecutor = Executors.newSingleThreadExecutor(threadFactory);
             try {
                 CompletableFuture.allOf(cluster.allHosts().stream()
-                        .map(host -> CompletableFuture.runAsync(() -> initializeConnectionSetupForHost.accept(host), cluster.executor()))
+                        .map(host -> CompletableFuture.runAsync(() -> initializeConnectionSetupForHost.accept(host), hostExecutor))
                         .toArray(CompletableFuture[]::new))
                         .join();
             } catch (CompletionException ex) {
-                Throwable cause = null;
-                Throwable result = ex;
-                if (null != (cause = ex.getCause())) {
-                    result = cause;
-                }
-
-                logger.error("", result);
+                logger.error("", ExceptionUtils.getRootCause(ex));
+            } finally {
+                hostExecutor.shutdown();
             }
         }
 
diff --git a/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/ConnectionPool.java b/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/ConnectionPool.java
index 295c9d0..48e6093 100644
--- a/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/ConnectionPool.java
+++ b/gremlin-driver/src/main/java/org/apache/tinkerpop/gremlin/driver/ConnectionPool.java
@@ -18,6 +18,7 @@
  */
 package org.apache.tinkerpop.gremlin.driver;
 
+import org.apache.commons.lang.exception.ExceptionUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.tinkerpop.gremlin.driver.exception.ConnectionException;
@@ -95,9 +96,9 @@ final class ConnectionPool {
         this.connections = new CopyOnWriteArrayList<>();
 
         try {
-            final List<CompletableFuture<Void>> connCreationFutures = new ArrayList<>();
+            final List<CompletableFuture<Void>> connectionCreationFutures = new ArrayList<>();
             for (int i = 0; i < minPoolSize; i++) {
-                connCreationFutures.add(CompletableFuture.runAsync(() -> {
+                connectionCreationFutures.add(CompletableFuture.runAsync(() -> {
                     try {
                         this.connections.add(new Connection(host.getHostUri(), this, settings.maxInProcessPerConnection));
                     } catch (ConnectionException e) {
@@ -106,7 +107,7 @@ final class ConnectionPool {
                 }, cluster.executor()));
             }
 
-            CompletableFuture.allOf(connCreationFutures.toArray(new CompletableFuture[0])).join();
+            CompletableFuture.allOf(connectionCreationFutures.toArray(new CompletableFuture[0])).join();
         } catch (CancellationException ce) {
             logger.warn("Initialization of connections cancelled for {}", getPoolInfo(), ce);
             throw ce;
@@ -118,15 +119,7 @@ final class ConnectionPool {
                     " Successful connections=" + this.connections.size() +
                     ". Closing the connection pool.";
 
-
-            Throwable cause = null;
-            Throwable result = ce;
-
-            if (null != (cause = result.getCause())) {
-                result = cause;
-            }
-
-            throw new CompletionException(errMsg, result);
+            throw new CompletionException(errMsg, ExceptionUtils.getRootCause(ce));
         }
 
         this.open = new AtomicInteger(connections.size());
diff --git a/gremlin-driver/src/test/java/org/apache/tinkerpop/gremlin/driver/SimpleSocketServer.java b/gremlin-driver/src/test/java/org/apache/tinkerpop/gremlin/driver/SimpleSocketServer.java
index d604fad..84eaf4a 100644
--- a/gremlin-driver/src/test/java/org/apache/tinkerpop/gremlin/driver/SimpleSocketServer.java
+++ b/gremlin-driver/src/test/java/org/apache/tinkerpop/gremlin/driver/SimpleSocketServer.java
@@ -36,7 +36,7 @@ public class SimpleSocketServer {
     private EventLoopGroup bossGroup;
     private EventLoopGroup workerGroup;
 
-    public Channel start(ChannelInitializer<SocketChannel>  channelInitializer) throws InterruptedException {
+    public Channel start(final ChannelInitializer<SocketChannel> channelInitializer) throws InterruptedException {
         bossGroup = new NioEventLoopGroup(1);
         workerGroup = new NioEventLoopGroup();
         final ServerBootstrap b = new ServerBootstrap();
diff --git a/gremlin-driver/src/test/java/org/apache/tinkerpop/gremlin/driver/WebSocketClientBehaviorIntegrateTest.java b/gremlin-driver/src/test/java/org/apache/tinkerpop/gremlin/driver/WebSocketClientBehaviorIntegrateTest.java
index ad4219d..0eb34f6 100644
--- a/gremlin-driver/src/test/java/org/apache/tinkerpop/gremlin/driver/WebSocketClientBehaviorIntegrateTest.java
+++ b/gremlin-driver/src/test/java/org/apache/tinkerpop/gremlin/driver/WebSocketClientBehaviorIntegrateTest.java
@@ -32,9 +32,13 @@ import org.apache.tinkerpop.gremlin.util.Log4jRecordingAppender;
 
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import static org.hamcrest.core.Is.is;
-import static org.junit.Assert.assertThat;
+import static org.hamcrest.MatcherAssert.assertThat;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 
@@ -85,6 +89,58 @@ public class WebSocketClientBehaviorIntegrateTest {
     }
 
     /**
+     * Constructs a deadlock situation when initializing a {@link Client} object in sessionless form that leads to
+     * hanging behavior in low resource environments (TINKERPOP-2504) and for certain configurations of the
+     * {@link Cluster} object where there are simply not enough threads to properly allow the {@link Host} and its
+     * related {@link ConnectionPool} objects to spin up properly - see TINKERPOP-2550.
+     */
+    @Test
+    public void shouldNotDeadlockOnInitialization() throws Exception {
+        // it seems you cah add the same host more than once so while kinda weird it is helpful in faithfully
+        // recreating the deadlock situation, though it can/will happen with just one host. workerPoolSize at
+        // "1" also helps faithfully reproduce the problem though it can happen at larger pool sizes depending
+        // on the timing/interleaving of tasks. the larger connection pool sizes may not be required given the
+        // other settings at play but again, just trying to make sure the deadlock state is consistently produced
+        // and a larger pool size will mean more time to elapse scheduling connection creation tasks which may
+        // further improve chances of scheduling conflicts that produce the deadlock.
+        //
+        // to force this test to a fail state, change ClusteredClient.initializeImplementation() to use the
+        // standard Cluster.executor rather than the hostExecutor (which is a single threaded independent thread
+        // pool used just for the purpose of initializing the hosts).
+        final Cluster cluster = Cluster.build("localhost").
+                addContactPoint("localhost").
+                addContactPoint("localhost").port(SimpleSocketServer.PORT).
+                workerPoolSize(1).
+                minConnectionPoolSize(32).maxConnectionPoolSize(32).create();
+
+        final AtomicBoolean failed = new AtomicBoolean(false);
+        final ExecutorService executor = Executors.newSingleThreadExecutor();
+        executor.submit(() -> {
+            try {
+                final Client client = cluster.connect();
+
+                // test will hang in init() where the Host and ConnectionPool are started up
+                client.init();
+            } catch (Exception ex) {
+                // should not "fail" - just hang and then timeout during the executor shutdown as there is
+                // a deadlock state, but we have this here just in case. a failed assertion of this value
+                // below could be interesting
+                logger.error("Client initialization failed with exception which was unexpected", ex);
+                failed.set(true);
+            } finally {
+                cluster.close();
+            }
+        });
+
+        executor.shutdown();
+
+        // 30 seconds should be ample time, even for travis. the deadlock state happens quite immediately in
+        // testing and in most situations this test should zip by in subsecond pace
+        assertThat(executor.awaitTermination(30, TimeUnit.SECONDS), is(true));
+        assertThat(failed.get(), is(false));
+    }
+
+    /**
      * Test a scenario when server closes a connection which does not have any active requests. Such connection
      * should be destroyed and replaced by another connection on next request.
      */