You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bs...@apache.org on 2018/11/28 21:29:43 UTC

[geode] branch develop updated: GEODE-6035 Increase backlog for peer-to-peer connection formation

This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/develop by this push:
     new 0eb9444  GEODE-6035 Increase backlog for peer-to-peer connection formation
0eb9444 is described below

commit 0eb9444fe7f294ed9796920fe5df0cca802cd634
Author: Bruce Schuchardt <bs...@pivotal.io>
AuthorDate: Wed Nov 28 13:27:32 2018 -0800

    GEODE-6035 Increase backlog for peer-to-peer connection formation
    
    I've made a sweep through the code to make TCP/IP backlog defaults
    consistent.  All of them now default to 1280 but all are limited
    by the operating system limit.  On Linux this is usually 128 and
    is configured with the somaxconn setting.
    
    The figure 1280 was recommended by Pivotal field engineers.
    
    This also adds the following LinuxSystemStats:
    
        tcpExtSynCookiesRecv    The number of TCP/IP SYN cookies received due to a full server socket backlog.
                                If this is non-zero consider disabling SYN cookies because they form sub-optimal connections.
                                units: cookies received
        tcpExtSynCookiesSent    The number of TCP/IP SYN cookies sent due to a full server socket backlog.
                                If this is non-zero consider disabling SYN cookies because they form sub-optimal connections.
                                units: cookies sent
        tcpExtListenDrops       The number of TCP/IP connection requests that have been dropped due to a full backlog.
                                If this is large increase the OS SOMAXCONN setting and increase socket backlog settings.
                                units: requests
        tcpExtListenOverflows   The number of TCP/IP connection requests that could not be queued due to a small backlog.
                                These are either dropped (tcpExtListenDrops) or handled via cookies (tcpSynCookiesSent).
                                In either case you should consider increasing SOMAXCONN and increasing backlog settings.
                                units: requests
        soMaxConn               Maximum TCP/IP server socket connection request backlog
                                units: connection requests
---
 .../GemFireStatSamplerIntegrationTest.java         |  9 ++--
 .../distributed/internal/tcpserver/TcpServer.java  |  2 -
 .../apache/geode/internal/cache/properties.html    | 24 +++++++--
 .../internal/cache/tier/sockets/AcceptorImpl.java  |  2 +-
 .../statistics/platform/LinuxProcFsStatistics.java | 58 ++++++++++++++++++++++
 .../statistics/platform/LinuxSystemStats.java      | 31 ++++++++++++
 .../org/apache/geode/internal/tcp/TCPConduit.java  | 44 ++++++++++------
 7 files changed, 142 insertions(+), 28 deletions(-)

diff --git a/geode-core/src/integrationTest/java/org/apache/geode/internal/statistics/GemFireStatSamplerIntegrationTest.java b/geode-core/src/integrationTest/java/org/apache/geode/internal/statistics/GemFireStatSamplerIntegrationTest.java
index 48f03ae..215cd35 100644
--- a/geode-core/src/integrationTest/java/org/apache/geode/internal/statistics/GemFireStatSamplerIntegrationTest.java
+++ b/geode-core/src/integrationTest/java/org/apache/geode/internal/statistics/GemFireStatSamplerIntegrationTest.java
@@ -426,13 +426,11 @@ public class GemFireStatSamplerIntegrationTest extends StatSamplerTestCase {
     System.setProperty(HostStatSampler.TEST_FILE_SIZE_LIMIT_IN_KB_PROPERTY, "true");
     Properties props = createGemFireProperties();
     props.setProperty(STATISTIC_ARCHIVE_FILE, archiveFileName);
-    props.setProperty(ARCHIVE_FILE_SIZE_LIMIT, "1");
-    props.setProperty(ARCHIVE_DISK_SPACE_LIMIT, "12");
+    props.setProperty(ARCHIVE_FILE_SIZE_LIMIT, "2");
+    props.setProperty(ARCHIVE_DISK_SPACE_LIMIT, "14");
     props.setProperty(STATISTIC_SAMPLE_RATE, String.valueOf(sampleRate));
     connect(props);
 
-    assertTrue(getGemFireStatSampler().waitForInitialization(5000));
-
     boolean exists1 = false;
     boolean exists2 = false;
     boolean exists3 = false;
@@ -441,7 +439,7 @@ public class GemFireStatSamplerIntegrationTest extends StatSamplerTestCase {
     boolean done = false;
     try {
       for (StopWatch time = new StopWatch(true); !done
-          && time.elapsedTimeMillis() < 10 * sampleRate;) {
+          && time.elapsedTimeMillis() < 15 * sampleRate;) {
         exists1 = exists1 || archiveFile1.exists();
         exists2 = exists2 || archiveFile2.exists();
         exists3 = exists3 || archiveFile3.exists();
@@ -455,6 +453,7 @@ public class GemFireStatSamplerIntegrationTest extends StatSamplerTestCase {
     } catch (InterruptedException e) {
       Thread.currentThread().interrupt();
     }
+    assertTrue(getGemFireStatSampler().waitForInitialization(5000));
     assertTrue("Waiting for archive files to exist:" + " exists1=" + exists1 + " exists2=" + exists2
         + " exists3=" + exists3 + " exists4=" + exists4 + " exists=" + exists, done);
 
diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
index 761bb3b..3abcc5c 100755
--- a/geode-core/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
+++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/tcpserver/TcpServer.java
@@ -119,8 +119,6 @@ public class TcpServer {
   // no longer static so that tests can test this system property
   private final int READ_TIMEOUT =
       Integer.getInteger(DistributionConfig.GEMFIRE_PREFIX + "TcpServer.READ_TIMEOUT", 60 * 1000);
-  // This is for backwards compatibility. The p2p.backlog flag used to be the only way to configure
-  // the locator backlog.
   private static final int P2P_BACKLOG = Integer.getInteger("p2p.backlog", 1000);
   private static final int BACKLOG =
       Integer.getInteger(DistributionConfig.GEMFIRE_PREFIX + "TcpServer.BACKLOG", P2P_BACKLOG);
diff --git a/geode-core/src/main/java/org/apache/geode/internal/cache/properties.html b/geode-core/src/main/java/org/apache/geode/internal/cache/properties.html
index dea752a..24b9782 100755
--- a/geode-core/src/main/java/org/apache/geode/internal/cache/properties.html
+++ b/geode-core/src/main/java/org/apache/geode/internal/cache/properties.html
@@ -262,12 +262,12 @@ TBA
 <dd>
 <em>Public:</em> false
 <p>
-<em>Integer</em> (default is 1000)
+<em>Integer</em> (default is 1280)
 <p>
 See <code>org.apache.geode.internal.cache.tier.sockets.AcceptorImpl</code>
 constructor.
 <p>
-This is the TCP accept backlog for the acceptor thread's listening socket.
+This is the TCP/IP "accept" backlog for client/server communications.
 </dd>
 
 <!-- -------------------------------------------------------  -->
@@ -2357,6 +2357,20 @@ TBA
 </dd>
 
 <!-- -------------------------------------------------------  -->
+    <dt><strong>gemfire.TcpServer.BACKLOG</strong></dt>
+    <dd>
+        <em>Public:</em> false
+        <p>
+            <em>Integer</em> default is 1280 but is limited by the OS somaxconn setting
+        <p>
+        <pre>
+   This property establishes a Locator's TCP/IP "accept" backlog
+   for locator communications.
+</pre>
+        <p>
+            TBA
+    </dd>
+<!-- -------------------------------------------------------  -->
 <dt><strong>gemfire.validateMessageSize</strong></dt>
 <dd>
 <em>Public:</em> false
@@ -2616,13 +2630,13 @@ TBA
 <dd>
 <em>Public:</em> false
 <p>
-<em>Integer</em> (default is 50)
+<em>Integer</em> (default is 1280 but limited by OS somaxconn setting)
 <p>
 See <code>org.apache.geode.internal.tcp.TCPConduit#BACKLOG</code>.
 <p>
 <pre>
-  backlog is the "accept" backlog configuration parameter all
-  conduits server socket */
+  backlog is the TCP/IP "accept" backlog configuration parameter for cluster
+  communications
 </pre>
 <p>
 TBA
diff --git a/geode-core/src/main/java/org/apache/geode/internal/cache/tier/sockets/AcceptorImpl.java b/geode-core/src/main/java/org/apache/geode/internal/cache/tier/sockets/AcceptorImpl.java
index 158fc68..deae928 100755
--- a/geode-core/src/main/java/org/apache/geode/internal/cache/tier/sockets/AcceptorImpl.java
+++ b/geode-core/src/main/java/org/apache/geode/internal/cache/tier/sockets/AcceptorImpl.java
@@ -229,7 +229,7 @@ public class AcceptorImpl implements Acceptor, Runnable, CommBufferPool {
   /**
    * The default value of the {@link ServerSocket} {@link #BACKLOG_PROPERTY_NAME}system property
    */
-  private static final int DEFAULT_BACKLOG = 1000;
+  private static final int DEFAULT_BACKLOG = 1280;
 
   /**
    * The system property name for setting the {@link ServerSocket}backlog
diff --git a/geode-core/src/main/java/org/apache/geode/internal/statistics/platform/LinuxProcFsStatistics.java b/geode-core/src/main/java/org/apache/geode/internal/statistics/platform/LinuxProcFsStatistics.java
index 395ce0f..4dc1639 100644
--- a/geode-core/src/main/java/org/apache/geode/internal/statistics/platform/LinuxProcFsStatistics.java
+++ b/geode-core/src/main/java/org/apache/geode/internal/statistics/platform/LinuxProcFsStatistics.java
@@ -30,6 +30,9 @@ import org.apache.logging.log4j.Logger;
 import org.apache.geode.distributed.internal.DistributionConfig;
 
 public class LinuxProcFsStatistics {
+  private static boolean soMaxConnProcessed;
+  private static int soMaxConn;
+
   private enum CPU {
     USER,
     NICE,
@@ -200,6 +203,7 @@ public class LinuxProcFsStatistics {
     getMemInfo(ints);
     getDiskStats(longs);
     getNetStats(longs);
+    getNetStatStats(longs, ints);
     if (hasProcVmStat) {
       getVmStats(longs);
     }
@@ -335,6 +339,60 @@ public class LinuxProcFsStatistics {
   }
 
   /*
+   * TcpExt:=0 SyncookiesSent=1
+   * ListenOverflows=20 ListenDrops=21
+   */
+  private static void getNetStatStats(long[] longs, int[] ints) {
+    InputStreamReader isr;
+    BufferedReader br = null;
+    try {
+      isr = new InputStreamReader(new FileInputStream("/proc/net/netstat"));
+      br = new BufferedReader(isr);
+      String line;
+      do {
+        br.readLine(); // header
+        line = br.readLine();
+      } while (line != null && !line.startsWith("TcpExt:"));
+
+      st.setString(line);
+      st.skipTokens(1);
+      long tcpSyncookiesSent = st.nextTokenAsLong();
+      long tcpSyncookiesRecv = st.nextTokenAsLong();
+      st.skipTokens(17);
+      long tcpListenOverflows = st.nextTokenAsLong();
+      long tcpListenDrops = st.nextTokenAsLong();
+
+      longs[LinuxSystemStats.tcpExtSynCookiesRecvLONG] = tcpSyncookiesRecv;
+      longs[LinuxSystemStats.tcpExtSynCookiesSentLONG] = tcpSyncookiesSent;
+      longs[LinuxSystemStats.tcpExtListenDropsLONG] = tcpListenDrops;
+      longs[LinuxSystemStats.tcpExtListenOverflowsLONG] = tcpListenOverflows;
+
+      if (!soMaxConnProcessed) {
+        br.close();
+        isr = new InputStreamReader(new FileInputStream("/proc/sys/net/core/somaxconn"));
+        br = new BufferedReader(isr);
+        line = br.readLine();
+        st.setString(line);
+        soMaxConn = st.nextTokenAsInt();
+        soMaxConnProcessed = true;
+      }
+
+      ints[LinuxSystemStats.tcpSOMaxConnINT] = soMaxConn;
+
+    } catch (NoSuchElementException nsee) {
+    } catch (IOException ioe) {
+    } finally {
+      st.releaseResources();
+      if (br != null) {
+        try {
+          br.close();
+        } catch (IOException ignore) {
+        }
+      }
+    }
+  }
+
+  /*
    * Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes
    * packets errs drop fifo colls carrier compressed lo:1908275823 326949246 0 0 0 0 0 0 1908275823
    * 326949246 0 0 0 0 0 0
diff --git a/geode-core/src/main/java/org/apache/geode/internal/statistics/platform/LinuxSystemStats.java b/geode-core/src/main/java/org/apache/geode/internal/statistics/platform/LinuxSystemStats.java
index 662c95f..9da3715 100644
--- a/geode-core/src/main/java/org/apache/geode/internal/statistics/platform/LinuxSystemStats.java
+++ b/geode-core/src/main/java/org/apache/geode/internal/statistics/platform/LinuxSystemStats.java
@@ -49,6 +49,7 @@ public class LinuxSystemStats {
   static final int dirtyMemoryINT = 17;
   static final int cpuNonUserINT = 18;
   static final int cpuStealINT = 19;
+  static final int tcpSOMaxConnINT = 20;
 
   static final int loopbackPacketsLONG = 0;
   static final int loopbackBytesLONG = 1;
@@ -78,6 +79,11 @@ public class LinuxSystemStats {
   static final int iosInProgressLONG = 25;
   static final int timeIosInProgressLONG = 26;
   static final int ioTimeLONG = 27;
+  static final int tcpExtSynCookiesRecvLONG = 28;
+  static final int tcpExtSynCookiesSentLONG = 29;
+  static final int tcpExtListenDropsLONG = 30;
+  static final int tcpExtListenOverflowsLONG = 31;
+
 
   static final int loadAverage1DOUBLE = 0;
   static final int loadAverage15DOUBLE = 1;
@@ -149,6 +155,9 @@ public class LinuxSystemStats {
             f.createIntGauge("cpuSteal",
                 "Steal time is the amount of time the operating system wanted to execute, but was not allowed to by the hypervisor.",
                 "%"),
+            f.createIntGauge("soMaxConn",
+                "Maximum TCP/IP server socket connection request backlog",
+                "connection requests"),
 
             f.createLongCounter("loopbackPackets",
                 "The number of network packets sent (or received) on the loopback interface",
@@ -219,6 +228,23 @@ public class LinuxSystemStats {
             f.createLongCounter("diskTime",
                 "The total number of milliseconds that measures both completed disk operations and any accumulating backlog of in progress ops.",
                 "milliseconds"),
+            f.createLongCounter("tcpExtSynCookiesRecv",
+                "The number of TCP/IP SYN cookies received due to a full server socket backlog.  "
+                    + "If this is non-zero consider disabling SYN cookies because they form sub-optimal connections.",
+                "cookies received"),
+            f.createLongCounter("tcpExtSynCookiesSent",
+                "The number of TCP/IP SYN cookies sent due to a full server socket backlog.  "
+                    + "If this is non-zero consider disabling SYN cookies because they form sub-optimal connections.",
+                "cookies sent"),
+            f.createLongCounter("tcpExtListenDrops",
+                "The number of TCP/IP connection requests that have been dropped due to a full backlog.  "
+                    + "If this is large increase the OS SOMAXCONN setting and increase socket backlog settings",
+                "requests"),
+            f.createLongCounter("tcpExtListenOverflows",
+                "The number of TCP/IP connection requests that could not be queued due to a small backlog.  "
+                    + "These are either dropped (tcpExtListenDrops) or handled via cookies (tcpSynCookiesSent).  "
+                    + "In either case you should consider increasing SOMAXCONN and increasing backlog settings.",
+                "requests"),
 
 
             f.createDoubleGauge("loadAverage1",
@@ -251,6 +277,7 @@ public class LinuxSystemStats {
     checkOffset("dirtyMemory", dirtyMemoryINT);
     checkOffset("cpuNonUser", cpuNonUserINT);
     checkOffset("cpuSteal", cpuStealINT);
+    checkOffset("soMaxConn", tcpSOMaxConnINT);
 
     checkOffset("loopbackPackets", loopbackPacketsLONG);
     checkOffset("loopbackBytes", loopbackBytesLONG);
@@ -280,6 +307,10 @@ public class LinuxSystemStats {
     checkOffset("diskOpsInProgress", iosInProgressLONG);
     checkOffset("diskTimeInProgress", timeIosInProgressLONG);
     checkOffset("diskTime", ioTimeLONG);
+    checkOffset("tcpExtSynCookiesRecv", tcpExtSynCookiesRecvLONG);
+    checkOffset("tcpExtSynCookiesSent", tcpExtSynCookiesSentLONG);
+    checkOffset("tcpExtListenDrops", tcpExtListenDropsLONG);
+    checkOffset("tcpExtListenOverflows", tcpExtListenOverflowsLONG);
 
     checkOffset("loadAverage1", loadAverage1DOUBLE);
     checkOffset("loadAverage15", loadAverage15DOUBLE);
diff --git a/geode-core/src/main/java/org/apache/geode/internal/tcp/TCPConduit.java b/geode-core/src/main/java/org/apache/geode/internal/tcp/TCPConduit.java
index 0b3b438..0057847 100644
--- a/geode-core/src/main/java/org/apache/geode/internal/tcp/TCPConduit.java
+++ b/geode-core/src/main/java/org/apache/geode/internal/tcp/TCPConduit.java
@@ -89,7 +89,18 @@ public class TCPConduit implements Runnable {
   private static int LISTENER_CLOSE_TIMEOUT;
 
   /**
-   * backlog is the "accept" backlog configuration parameter all conduits server socket
+   * BACKLOG is the "accept" backlog configuration parameter for the conduits server socket.
+   * In most operating systems this is limited to 128 by default and you must change
+   * the OS setting somaxconn to go beyond that limit. Note that setting this too high
+   * can have ramifications when disconnecting from the distributed system if a thread
+   * is trying to connect to another member that is not accepting connections quickly
+   * enough. Setting it too low can also have adverse effects because backlog overflows
+   * aren't handled well by most tcp/ip implementations, causing connect timeouts instead
+   * of expected ServerRefusedConnection exceptions.
+   * <p>
+   * Normally the backlog isn't that important because if it's full of connection requests
+   * a SYN "cookie" mechanism is used to bypass the backlog queue. If this is turned off
+   * though connection requests are dropped when the queue is full.
    */
   private static int BACKLOG;
 
@@ -142,8 +153,8 @@ public class TCPConduit implements Runnable {
     // only use direct buffers if we are using nio
     useDirectBuffers = USE_NIO && !Boolean.getBoolean("p2p.nodirectBuffers");
     LISTENER_CLOSE_TIMEOUT = Integer.getInteger("p2p.listenerCloseTimeout", 60000).intValue();
-    // fix for bug 37730
-    BACKLOG = Integer.getInteger("p2p.backlog", HANDSHAKE_POOL_SIZE + 1).intValue();
+    // note: bug 37730 concerned this defaulting to 50
+    BACKLOG = Integer.getInteger("p2p.backlog", 1280).intValue();
   }
 
   ///////////////// permanent conduit state
@@ -412,23 +423,24 @@ public class TCPConduit implements Runnable {
    * this.bindAddress, which must be set before invoking this method.
    */
   private void createServerSocket() {
-    int p = this.port;
-    int b = BACKLOG;
+    int serverPort = this.port;
+    int connectionRequestBacklog = BACKLOG;
     InetAddress bindAddress = this.address;
 
     try {
       if (this.useNIO) {
-        if (p <= 0) {
+        if (serverPort <= 0) {
 
-          socket = socketCreator.createServerSocketUsingPortRange(bindAddress, b, isBindAddress,
+          socket = socketCreator.createServerSocketUsingPortRange(bindAddress,
+              connectionRequestBacklog, isBindAddress,
               this.useNIO, 0, tcpPortRange);
         } else {
           ServerSocketChannel channel = ServerSocketChannel.open();
           socket = channel.socket();
 
           InetSocketAddress inetSocketAddress =
-              new InetSocketAddress(isBindAddress ? bindAddress : null, p);
-          socket.bind(inetSocketAddress, b);
+              new InetSocketAddress(isBindAddress ? bindAddress : null, serverPort);
+          socket.bind(inetSocketAddress, connectionRequestBacklog);
         }
 
         if (useNIO) {
@@ -450,11 +462,13 @@ public class TCPConduit implements Runnable {
         channel = socket.getChannel();
       } else {
         try {
-          if (p <= 0) {
-            socket = socketCreator.createServerSocketUsingPortRange(bindAddress, b, isBindAddress,
+          if (serverPort <= 0) {
+            socket = socketCreator.createServerSocketUsingPortRange(bindAddress,
+                connectionRequestBacklog, isBindAddress,
                 this.useNIO, this.tcpBufferSize, tcpPortRange);
           } else {
-            socket = socketCreator.createServerSocket(p, b, isBindAddress ? bindAddress : null,
+            socket = socketCreator.createServerSocket(serverPort, connectionRequestBacklog,
+                isBindAddress ? bindAddress : null,
                 this.tcpBufferSize);
           }
           int newSize = socket.getReceiveBufferSize();
@@ -473,7 +487,7 @@ public class TCPConduit implements Runnable {
     } catch (IOException io) {
       throw new ConnectionException(
           String.format("While creating ServerSocket on port %s with address %s",
-              new Object[] {Integer.valueOf(p), bindAddress}),
+              new Object[] {Integer.valueOf(serverPort), bindAddress}),
           io);
     }
   }
@@ -652,8 +666,6 @@ public class TCPConduit implements Runnable {
                 ex);
             break;
           }
-          othersock.setSoTimeout(0);
-          socketCreator.handshakeIfSocketIsSSL(othersock, idleConnectionTimeout);
         }
         if (stopped) {
           try {
@@ -759,6 +771,8 @@ public class TCPConduit implements Runnable {
 
   protected void basicAcceptConnection(Socket othersock) {
     try {
+      othersock.setSoTimeout(0);
+      socketCreator.handshakeIfSocketIsSSL(othersock, idleConnectionTimeout);
       getConTable().acceptConnection(othersock, new PeerConnectionFactory());
     } catch (IOException io) {
       // exception is logged by the Connection