You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ja...@apache.org on 2023/09/22 08:05:23 UTC

[solr] branch main updated: SOLR-16968: The MemoryCircuitBreaker now uses average heap usage (#1905)

This is an automated email from the ASF dual-hosted git repository.

janhoy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 8ca4a5d657f SOLR-16968: The MemoryCircuitBreaker now uses average heap usage (#1905)
8ca4a5d657f is described below

commit 8ca4a5d657f04d5467d42334d5ed7c9efac2b9ea
Author: Jan Høydahl <ja...@apache.org>
AuthorDate: Fri Sep 22 10:05:15 2023 +0200

    SOLR-16968: The MemoryCircuitBreaker now uses average heap usage (#1905)
---
 solr/CHANGES.txt                                   |  2 +
 .../src/java/org/apache/solr/core/SolrCore.java    | 17 ++++-
 .../circuitbreaker/AveragingMetricProvider.java    | 80 ++++++++++++++++++++++
 .../solr/util/circuitbreaker/CircuitBreaker.java   |  9 ++-
 .../util/circuitbreaker/CircuitBreakerManager.java | 14 ++++
 .../circuitbreaker/CircuitBreakerRegistry.java     | 75 +++++++++++++++++---
 .../util/circuitbreaker/MemoryCircuitBreaker.java  | 70 +++++++++++++------
 .../apache/solr/util/BaseTestCircuitBreaker.java   | 31 ++++++---
 8 files changed, 253 insertions(+), 45 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 70a93e99ab6..18114626a2b 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -129,6 +129,8 @@ Improvements
 
 * SOLR-16970: SOLR_OPTS is now able to override options set by the Solr control scripts, "bin/solr" and "bin/solr.cmd". (Houston Putman)
 
+* SOLR-16968: The MemoryCircuitBreaker now uses average heap usage over the last 30 seconds (janhoy, Christine Poerschke)
+
 * SOLR-14886: Suppress stack traces in query response (Isabelle Giguere via Alex Deparvu)
 
 * SOLR-16461: `/solr/coreName/replication?command=backup` now has a v2 equivalent, available at
diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java
index 944d4684016..f5e28b34aa0 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrCore.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java
@@ -1088,9 +1088,6 @@ public class SolrCore implements SolrInfoBean, Closeable {
       solrMetricsContext = coreMetricManager.getSolrMetricsContext();
       this.coreMetricManager.loadReporters();
 
-      // init pluggable circuit breakers
-      initPlugins(null, CircuitBreaker.class);
-
       if (updateHandler == null) {
         directoryFactory = initDirectoryFactory();
         recoveryStrategyBuilder = initRecoveryStrategyBuilder();
@@ -1115,6 +1112,9 @@ public class SolrCore implements SolrInfoBean, Closeable {
       // initialize core metrics
       initializeMetrics(solrMetricsContext, null);
 
+      // init pluggable circuit breakers, after metrics because some circuit breakers use metrics
+      initPlugins(null, CircuitBreaker.class);
+
       SolrFieldCacheBean solrFieldCacheBean = new SolrFieldCacheBean();
       // this is registered at the CONTAINER level because it's not core-specific - for now we
       // also register it here for back-compat
@@ -1764,6 +1764,17 @@ public class SolrCore implements SolrInfoBean, Closeable {
 
     ExecutorUtil.shutdownAndAwaitTermination(coreAsyncTaskExecutor);
 
+    // Close circuit breakers that may have background threads, before metrics because some circuit
+    // breakers use metrics
+    try {
+      getCircuitBreakerRegistry().close();
+    } catch (Throwable e) {
+      log.error("Exception closing circuit breakers", e);
+      if (e instanceof Error) {
+        throw (Error) e;
+      }
+    }
+
     // stop reporting metrics
     try {
       coreMetricManager.close();
diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/AveragingMetricProvider.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/AveragingMetricProvider.java
new file mode 100644
index 00000000000..60161e98181
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/AveragingMetricProvider.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.util.circuitbreaker;
+
+import com.google.common.util.concurrent.AtomicDouble;
+import java.io.Closeable;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
+import org.apache.solr.logging.CircularList;
+
+/** Averages the metric value over a period of time */
+public class AveragingMetricProvider implements Closeable {
+  private final CircularList<Double> samplesRingBuffer;
+  private ScheduledExecutorService executor;
+  private final AtomicDouble currentAverageValue = new AtomicDouble(-1);
+
+  /**
+   * Creates an instance with an executor that runs every sampleInterval seconds and averages over
+   * numSamples samples.
+   *
+   * @param metricProvider metric provider that will provide a value
+   * @param numSamples number of samples to calculate average for
+   * @param sampleInterval interval between each sample
+   */
+  public AveragingMetricProvider(
+      MetricProvider metricProvider, int numSamples, long sampleInterval) {
+    this.samplesRingBuffer = new CircularList<>(numSamples);
+    executor =
+        Executors.newSingleThreadScheduledExecutor(
+            new SolrNamedThreadFactory(
+                "AveragingMetricProvider-" + metricProvider.getClass().getSimpleName()));
+    executor.scheduleWithFixedDelay(
+        () -> {
+          samplesRingBuffer.add(metricProvider.getMetricValue());
+          currentAverageValue.set(
+              samplesRingBuffer.toList().stream()
+                  .mapToDouble(Double::doubleValue)
+                  .average()
+                  .orElse(-1));
+        },
+        0,
+        sampleInterval,
+        TimeUnit.SECONDS);
+  }
+
+  /**
+   * Return current average. This is a cached value, so calling this method will not incur any
+   * calculations
+   */
+  public double getMetricValue() {
+    return currentAverageValue.get();
+  }
+
+  @Override
+  public void close() {
+    ExecutorUtil.shutdownAndAwaitTermination(executor);
+  }
+
+  /** Interface to provide the metric value. */
+  public interface MetricProvider {
+    double getMetricValue();
+  }
+}
diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreaker.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreaker.java
index 4b9d24bb694..78841cceaf7 100644
--- a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreaker.java
+++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreaker.java
@@ -17,6 +17,8 @@
 
 package org.apache.solr.util.circuitbreaker;
 
+import java.io.Closeable;
+import java.io.IOException;
 import java.util.List;
 import java.util.Locale;
 import java.util.Set;
@@ -41,7 +43,7 @@ import org.apache.solr.util.plugin.NamedListInitializedPlugin;
  *
  * @lucene.experimental
  */
-public abstract class CircuitBreaker implements NamedListInitializedPlugin {
+public abstract class CircuitBreaker implements NamedListInitializedPlugin, Closeable {
   // Only query requests are checked by default
   private Set<SolrRequestType> requestTypes = Set.of(SolrRequestType.QUERY);
   private final List<SolrRequestType> SUPPORTED_TYPES =
@@ -60,6 +62,11 @@ public abstract class CircuitBreaker implements NamedListInitializedPlugin {
   /** Get error message when the circuit breaker triggers */
   public abstract String getErrorMessage();
 
+  @Override
+  public void close() throws IOException {
+    // Nothing to do by default
+  }
+
   /**
    * Set the request types for which this circuit breaker should be checked. If not called, the
    * circuit breaker will be checked for the {@link SolrRequestType#QUERY} request type only.
diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerManager.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerManager.java
index 02e3c7af676..3ca0c760a86 100644
--- a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerManager.java
+++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerManager.java
@@ -17,6 +17,7 @@
 
 package org.apache.solr.util.circuitbreaker;
 
+import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import org.apache.solr.common.util.NamedList;
 import org.slf4j.Logger;
@@ -77,6 +78,19 @@ public class CircuitBreakerManager extends CircuitBreaker {
     }
   }
 
+  @Override
+  public void close() throws IOException {
+    try {
+      if (memEnabled) {
+        memCB.close();
+      }
+    } finally {
+      if (cpuEnabled) {
+        cpuCB.close();
+      }
+    }
+  }
+
   // The methods below will be called by super class during init
   public void setMemEnabled(String enabled) {
     this.memEnabled = Boolean.getBoolean(enabled);
diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerRegistry.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerRegistry.java
index 84c2f61fb9b..a7081df96f6 100644
--- a/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerRegistry.java
+++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/CircuitBreakerRegistry.java
@@ -18,12 +18,19 @@
 package org.apache.solr.util.circuitbreaker;
 
 import com.google.common.annotations.VisibleForTesting;
+import java.io.Closeable;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.solr.client.solrj.SolrRequest.SolrRequestType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Keeps track of all registered circuit breaker instances for various request types. Responsible
@@ -32,26 +39,35 @@ import org.apache.solr.client.solrj.SolrRequest.SolrRequestType;
  * @lucene.experimental
  * @since 9.4
  */
-public class CircuitBreakerRegistry {
+public class CircuitBreakerRegistry implements Closeable {
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   private final Map<SolrRequestType, List<CircuitBreaker>> circuitBreakerMap = new HashMap<>();
 
   public CircuitBreakerRegistry() {}
 
   public void register(CircuitBreaker circuitBreaker) {
-    circuitBreaker
-        .getRequestTypes()
-        .forEach(
-            r -> {
-              List<CircuitBreaker> list =
-                  circuitBreakerMap.computeIfAbsent(r, k -> new ArrayList<>());
-              list.add(circuitBreaker);
-            });
+    synchronized (circuitBreakerMap) {
+      circuitBreaker
+          .getRequestTypes()
+          .forEach(
+              r -> {
+                List<CircuitBreaker> list =
+                    circuitBreakerMap.computeIfAbsent(r, k -> new ArrayList<>());
+                list.add(circuitBreaker);
+                if (log.isInfoEnabled()) {
+                  log.info(
+                      "Registered circuit breaker {} for request type(s) {}",
+                      circuitBreaker.getClass().getSimpleName(),
+                      r);
+                }
+              });
+    }
   }
 
   @VisibleForTesting
-  public void deregisterAll() {
-    circuitBreakerMap.clear();
+  public void deregisterAll() throws IOException {
+    this.close();
   }
 
   /**
@@ -97,4 +113,41 @@ public class CircuitBreakerRegistry {
   public boolean isEnabled(SolrRequestType requestType) {
     return circuitBreakerMap.containsKey(requestType);
   }
+
+  @Override
+  public void close() throws IOException {
+    synchronized (circuitBreakerMap) {
+      final AtomicInteger closeFailedCounter = new AtomicInteger(0);
+      circuitBreakerMap
+          .values()
+          .forEach(
+              list ->
+                  list.forEach(
+                      it -> {
+                        try {
+                          if (log.isDebugEnabled()) {
+                            log.debug(
+                                "Closed circuit breaker {} for request type(s) {}",
+                                it.getClass().getSimpleName(),
+                                it.getRequestTypes());
+                          }
+                          it.close();
+                        } catch (IOException e) {
+                          if (log.isErrorEnabled()) {
+                            log.error(
+                                String.format(
+                                    Locale.ROOT,
+                                    "Failed to close circuit breaker %s",
+                                    it.getClass().getSimpleName()),
+                                e);
+                          }
+                          closeFailedCounter.incrementAndGet();
+                        }
+                      }));
+      circuitBreakerMap.clear();
+      if (closeFailedCounter.get() > 0) {
+        throw new IOException("Failed to close " + closeFailedCounter.get() + " circuit breakers");
+      }
+    }
+  }
 }
diff --git a/solr/core/src/java/org/apache/solr/util/circuitbreaker/MemoryCircuitBreaker.java b/solr/core/src/java/org/apache/solr/util/circuitbreaker/MemoryCircuitBreaker.java
index 3004d732e4d..4a3eb3f5b9f 100644
--- a/solr/core/src/java/org/apache/solr/util/circuitbreaker/MemoryCircuitBreaker.java
+++ b/solr/core/src/java/org/apache/solr/util/circuitbreaker/MemoryCircuitBreaker.java
@@ -17,32 +17,64 @@
 
 package org.apache.solr.util.circuitbreaker;
 
+import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.lang.management.ManagementFactory;
 import java.lang.management.MemoryMXBean;
+import org.apache.solr.util.RefCounted;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Tracks the current JVM heap usage and triggers if it exceeds the defined percentage of the
- * maximum heap size allocated to the JVM. This circuit breaker is a part of the default
- * CircuitBreakerRegistry so is checked for every request -- hence it is realtime. Once the memory
- * usage goes below the threshold, it will start allowing queries again.
+ * Tracks the current JVM heap usage and triggers if a moving heap usage average over 30 seconds
+ * exceeds the defined percentage of the maximum heap size allocated to the JVM. Once the average
+ * memory usage goes below the threshold, it will start allowing queries again.
  *
  * <p>The memory threshold is defined as a percentage of the maximum memory allocated -- see
- * memThreshold in solrconfig.xml.
+ * memThreshold in <code>solrconfig.xml</code>.
  */
 public class MemoryCircuitBreaker extends CircuitBreaker {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
   private static final MemoryMXBean MEMORY_MX_BEAN = ManagementFactory.getMemoryMXBean();
+  // One shared provider / executor for all instances of this class
+  private static RefCounted<AveragingMetricProvider> averagingMetricProvider;
 
   private long heapMemoryThreshold;
 
   private static final ThreadLocal<Long> seenMemory = ThreadLocal.withInitial(() -> 0L);
   private static final ThreadLocal<Long> allowedMemory = ThreadLocal.withInitial(() -> 0L);
 
+  /** Creates an instance which averages over 6 samples during last 30 seconds. */
   public MemoryCircuitBreaker() {
+    this(6, 5);
+  }
+
+  /**
+   * Constructor that allows override of sample interval for which the memory usage is fetched. This
+   * is provided for testing, not intended for general use because the average metric provider
+   * implementation is the same for all instances of the class.
+   *
+   * @param numSamples number of samples to calculate average for
+   * @param sampleInterval interval between each sample
+   */
+  protected MemoryCircuitBreaker(int numSamples, int sampleInterval) {
     super();
+    synchronized (MemoryCircuitBreaker.class) {
+      if (averagingMetricProvider == null || averagingMetricProvider.getRefcount() == 0) {
+        averagingMetricProvider =
+            new RefCounted<>(
+                new AveragingMetricProvider(
+                    () -> MEMORY_MX_BEAN.getHeapMemoryUsage().getUsed(),
+                    numSamples,
+                    sampleInterval)) {
+              @Override
+              protected void close() {
+                get().close();
+              }
+            };
+      }
+      averagingMetricProvider.incref();
+    }
   }
 
   public void setThreshold(double thresholdValueInPercentage) {
@@ -60,14 +92,11 @@ public class MemoryCircuitBreaker extends CircuitBreaker {
     }
   }
 
-  // TODO: An optimization can be to trip the circuit breaker for a duration of time
-  // after the circuit breaker condition is matched. This will optimize for per call
-  // overhead of calculating the condition parameters but can result in false positives.
   @Override
   public boolean isTripped() {
 
     long localAllowedMemory = getCurrentMemoryThreshold();
-    long localSeenMemory = calculateLiveMemoryUsage();
+    long localSeenMemory = getAvgMemoryUsage();
 
     allowedMemory.set(localAllowedMemory);
 
@@ -76,6 +105,10 @@ public class MemoryCircuitBreaker extends CircuitBreaker {
     return (localSeenMemory >= localAllowedMemory);
   }
 
+  protected long getAvgMemoryUsage() {
+    return (long) averagingMetricProvider.get().getMetricValue();
+  }
+
   @Override
   public String getErrorMessage() {
     return "Memory Circuit Breaker triggered as JVM heap usage values are greater than allocated threshold. "
@@ -89,17 +122,12 @@ public class MemoryCircuitBreaker extends CircuitBreaker {
     return heapMemoryThreshold;
   }
 
-  /**
-   * Calculate the live memory usage for the system. This method has package visibility to allow
-   * using for testing.
-   *
-   * @return Memory usage in bytes.
-   */
-  protected long calculateLiveMemoryUsage() {
-    // NOTE: MemoryUsageGaugeSet provides memory usage statistics but we do not use them
-    // here since it will require extra allocations and incur cost, hence it is cheaper to use
-    // MemoryMXBean directly. Ideally, this call should not add noticeable
-    // latency to a query -- but if it does, please signify on SOLR-14588
-    return MEMORY_MX_BEAN.getHeapMemoryUsage().getUsed();
+  @Override
+  public void close() throws IOException {
+    synchronized (MemoryCircuitBreaker.class) {
+      if (averagingMetricProvider != null && averagingMetricProvider.getRefcount() > 0) {
+        averagingMetricProvider.decref();
+      }
+    }
   }
 }
diff --git a/solr/core/src/test/org/apache/solr/util/BaseTestCircuitBreaker.java b/solr/core/src/test/org/apache/solr/util/BaseTestCircuitBreaker.java
index 71c6fe67f8d..14c83df771a 100644
--- a/solr/core/src/test/org/apache/solr/util/BaseTestCircuitBreaker.java
+++ b/solr/core/src/test/org/apache/solr/util/BaseTestCircuitBreaker.java
@@ -19,6 +19,7 @@ package org.apache.solr.util;
 
 import static org.hamcrest.CoreMatchers.containsString;
 
+import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.List;
@@ -43,6 +44,8 @@ import org.slf4j.LoggerFactory;
 
 public abstract class BaseTestCircuitBreaker extends SolrTestCaseJ4 {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+  private static final CircuitBreaker dummyMemBreaker = new MemoryCircuitBreaker();
+  private static final CircuitBreaker dummyCBManager = new CircuitBreakerManager();
 
   protected static void indexDocs() {
     removeAllExistingCircuitBreakers();
@@ -62,11 +65,13 @@ public abstract class BaseTestCircuitBreaker extends SolrTestCaseJ4 {
   @Override
   public void tearDown() throws Exception {
     super.tearDown();
+    dummyMemBreaker.close();
+    dummyCBManager.close();
   }
 
   @After
   public void after() {
-    h.getCore().getCircuitBreakerRegistry().deregisterAll();
+    removeAllExistingCircuitBreakers();
   }
 
   public void testCBAlwaysTrips() {
@@ -116,9 +121,10 @@ public abstract class BaseTestCircuitBreaker extends SolrTestCaseJ4 {
   }
 
   public void testBadRequestType() {
+
     expectThrows(
         IllegalArgumentException.class,
-        () -> new MemoryCircuitBreaker().setRequestTypes(List.of("badRequestType")));
+        () -> dummyMemBreaker.setRequestTypes(List.of("badRequestType")));
   }
 
   public void testBuildingMemoryPressure() {
@@ -236,17 +242,21 @@ public abstract class BaseTestCircuitBreaker extends SolrTestCaseJ4 {
         "//lst[@name='process']/double[@name='time']");
   }
 
-  public void testErrorCode() {
+  public void testErrorCode() throws Exception {
     assertEquals(
         SolrException.ErrorCode.SERVICE_UNAVAILABLE,
-        CircuitBreaker.getErrorCode(List.of(new CircuitBreakerManager())));
+        CircuitBreaker.getErrorCode(List.of(dummyCBManager)));
     assertEquals(
         SolrException.ErrorCode.TOO_MANY_REQUESTS,
-        CircuitBreaker.getErrorCode(List.of(new MemoryCircuitBreaker())));
+        CircuitBreaker.getErrorCode(List.of(dummyMemBreaker)));
   }
 
   private static void removeAllExistingCircuitBreakers() {
-    h.getCore().getCircuitBreakerRegistry().deregisterAll();
+    try {
+      h.getCore().getCircuitBreakerRegistry().deregisterAll();
+    } catch (IOException e) {
+      fail("Failed to unload circuit breakers");
+    }
   }
 
   private static class MockCircuitBreaker extends MemoryCircuitBreaker {
@@ -264,10 +274,12 @@ public abstract class BaseTestCircuitBreaker extends SolrTestCaseJ4 {
   }
 
   private static class FakeMemoryPressureCircuitBreaker extends MemoryCircuitBreaker {
+    public FakeMemoryPressureCircuitBreaker() {
+      super(1, 1);
+    }
 
     @Override
-    protected long calculateLiveMemoryUsage() {
-      // Return a number large enough to trigger a pushback from the circuit breaker
+    protected long getAvgMemoryUsage() {
       return Long.MAX_VALUE;
     }
   }
@@ -276,11 +288,12 @@ public abstract class BaseTestCircuitBreaker extends SolrTestCaseJ4 {
     private AtomicInteger count;
 
     public BuildingUpMemoryPressureCircuitBreaker() {
+      super(1, 1);
       this.count = new AtomicInteger(0);
     }
 
     @Override
-    protected long calculateLiveMemoryUsage() {
+    protected long getAvgMemoryUsage() {
       int localCount = count.getAndIncrement();
 
       if (localCount >= 4) {