You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2020/12/04 23:00:11 UTC
[lucene-solr] branch branch_8x updated: SOLR-14965: add overseer
queue size metrics (#2040)
This is an automated email from the ASF dual-hosted git repository.
dsmiley pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 7731cd6 SOLR-14965: add overseer queue size metrics (#2040)
7731cd6 is described below
commit 7731cd6c1213964f49da9bd0b7dbeb5567042e80
Author: saatchibhalla <sa...@gmail.com>
AuthorDate: Fri Dec 4 16:47:35 2020 -0500
SOLR-14965: add overseer queue size metrics (#2040)
Adds two metrics to the SolrCloud Overseer: solr_metrics_overseer_stateUpdateQueueSize and solr_metrics_overseer_collectionWorkQueueSize with corresponding entries in the Prometheus exporter's default/stock configuration.
Co-authored-by: Saatchi Bhalla <s....@salesforce.com>
(cherry picked from commit 19ed90337767412de66ebb1757e63607372ae9b3)
---
solr/CHANGES.txt | 4 +++
.../conf/solr-exporter-config.xml | 34 ++++++++++++++++++++++
.../src/java/org/apache/solr/cloud/Overseer.java | 16 +++++++++-
.../OverseerCollectionConfigSetProcessor.java | 11 ++++---
.../apache/solr/cloud/OverseerTaskProcessor.java | 21 ++++++++-----
.../OverseerCollectionConfigSetProcessorTest.java | 22 +++++++++-----
.../test/org/apache/solr/cloud/OverseerTest.java | 3 ++
.../org/apache/solr/cloud/ZkControllerTest.java | 11 ++++++-
solr/solr-ref-guide/src/metrics-reporting.adoc | 6 ++++
9 files changed, 108 insertions(+), 20 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 15adfee..9a4de19 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -33,6 +33,10 @@ Improvements
* SOLR-15015 : Add interleaving algorithm parameter support in Learning To Rank (Alessandro Benedetti)
+* SOLR-14965: metrics: Adds two metrics to the SolrCloud Overseer: solr_metrics_overseer_stateUpdateQueueSize
+ and solr_metrics_overseer_collectionWorkQueueSize with corresponding entries in the Prometheus exporter's
+ default/stock configuration. (Saatchi Bhalla, Megan Carey, Andrzej Białecki, David Smiley)
+
Optimizations
---------------------
* SOLR-14975: Optimize CoreContainer.getAllCoreNames, getLoadedCoreNames and getCoreDescriptors. (Bruno Roustant)
diff --git a/solr/contrib/prometheus-exporter/conf/solr-exporter-config.xml b/solr/contrib/prometheus-exporter/conf/solr-exporter-config.xml
index b043835..9b58352 100644
--- a/solr/contrib/prometheus-exporter/conf/solr-exporter-config.xml
+++ b/solr/contrib/prometheus-exporter/conf/solr-exporter-config.xml
@@ -280,6 +280,40 @@
value : $value
}
</str>
+ <!--
+ overseer metrics
+ -->
+ <str>
+ .metrics | to_entries | .[] | select(.key | startswith("solr.overseer")) as $object |
+ $object.value as $value | $value | to_entries | .[] |
+ select(.key | startswith("queue.") and endswith("collectionWorkQueueSize")) as $object |
+ $object.value as $value |
+ {
+ name : "solr_metrics_overseer_collectionWorkQueueSize",
+ type : "GAUGE",
+ help : "See following URL: https://lucene.apache.org/solr/guide/metrics-reporting.html",
+ label_names : [],
+ label_values : [],
+ value : $value
+ }
+ </str>
+ <str>
+ .metrics | to_entries | .[] | select(.key | startswith("solr.overseer")) as $object |
+ $object.value as $value | $value | to_entries | .[] |
+ select(.key | startswith("queue.") and endswith("stateUpdateQueueSize")) as $object |
+ $object.value as $value |
+ {
+ name : "solr_metrics_overseer_stateUpdateQueueSize",
+ type : "GAUGE",
+ help : "See following URL: https://lucene.apache.org/solr/guide/metrics-reporting.html",
+ label_names : [],
+ label_values : [],
+ value : $value
+ }
+ </str>
+ <!--
+ node metrics
+ -->
<str>
.metrics["solr.node"] | to_entries | .[] | select(.key | endswith(".clientErrors")) as $object |
$object.key | split(".")[0] as $category |
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index 90c4e5d..3f52f74 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -67,9 +67,12 @@ import org.apache.solr.common.util.Pair;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CloudConfig;
import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.SolrInfoBean;
import org.apache.solr.handler.admin.CollectionsHandler;
import org.apache.solr.handler.component.HttpShardHandler;
import org.apache.solr.logging.MDCLoggingContext;
+import org.apache.solr.metrics.SolrMetricProducer;
+import org.apache.solr.metrics.SolrMetricsContext;
import org.apache.solr.update.UpdateShardHandler;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
@@ -93,6 +96,9 @@ public class Overseer implements SolrCloseable {
public static final int NUM_RESPONSES_TO_STORE = 10000;
public static final String OVERSEER_ELECT = "/overseer_elect";
+ private SolrMetricsContext solrMetricsContext;
+ private volatile String metricTag = SolrMetricProducer.getUniqueMetricTag(this, null);
+
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
enum LeaderStatus {DONT_KNOW, NO, YES}
@@ -117,6 +123,8 @@ public class Overseer implements SolrCloseable {
private final Stats zkStats;
+ private SolrMetricsContext clusterStateUpdaterMetricContext;
+
private boolean isClosed = false;
public ClusterStateUpdater(final ZkStateReader reader, final String myId, Stats zkStats) {
@@ -129,6 +137,9 @@ public class Overseer implements SolrCloseable {
this.completedMap = getCompletedMap(zkClient);
this.myId = myId;
this.reader = reader;
+
+ clusterStateUpdaterMetricContext = solrMetricsContext.getChildContext(this);
+ clusterStateUpdaterMetricContext.gauge(null, () -> stateUpdateQueue.getZkStats().getQueueLength(), true, "stateUpdateQueueSize", "queue" );
}
public Stats getStateUpdateQueueStats() {
@@ -491,6 +502,7 @@ public class Overseer implements SolrCloseable {
@Override
public void close() {
this.isClosed = true;
+ clusterStateUpdaterMetricContext.unregister();
}
}
@@ -563,6 +575,8 @@ public class Overseer implements SolrCloseable {
this.zkController = zkController;
this.stats = new Stats();
this.config = config;
+
+ this.solrMetricsContext = new SolrMetricsContext(zkController.getCoreContainer().getMetricManager(), SolrInfoBean.Group.overseer.toString(), metricTag);
}
public synchronized void start(String id) {
@@ -583,7 +597,7 @@ public class Overseer implements SolrCloseable {
ThreadGroup ccTg = new ThreadGroup("Overseer collection creation process.");
OverseerNodePrioritizer overseerPrioritizer = new OverseerNodePrioritizer(reader, getStateUpdateQueue(), adminPath, shardHandler.getShardHandlerFactory(), updateShardHandler.getDefaultHttpClient());
- overseerCollectionConfigSetProcessor = new OverseerCollectionConfigSetProcessor(reader, id, shardHandler, adminPath, stats, Overseer.this, overseerPrioritizer);
+ overseerCollectionConfigSetProcessor = new OverseerCollectionConfigSetProcessor(reader, id, shardHandler, adminPath, stats, Overseer.this, overseerPrioritizer, solrMetricsContext);
ccThread = new OverseerThread(ccTg, overseerCollectionConfigSetProcessor, "OverseerCollectionConfigSetProcessor-" + id);
ccThread.setDaemon(true);
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionConfigSetProcessor.java b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionConfigSetProcessor.java
index 78ddc82..d3819d0 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionConfigSetProcessor.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionConfigSetProcessor.java
@@ -26,6 +26,7 @@ import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.handler.component.HttpShardHandler;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
+import org.apache.solr.metrics.SolrMetricsContext;
/**
* An {@link OverseerTaskProcessor} that handles:
@@ -37,7 +38,7 @@ public class OverseerCollectionConfigSetProcessor extends OverseerTaskProcessor
public OverseerCollectionConfigSetProcessor(ZkStateReader zkStateReader, String myId,
final HttpShardHandler shardHandler,
String adminPath, Stats stats, Overseer overseer,
- OverseerNodePrioritizer overseerNodePrioritizer) {
+ OverseerNodePrioritizer overseerNodePrioritizer, SolrMetricsContext solrMetricsContext) {
this(
zkStateReader,
myId,
@@ -49,7 +50,8 @@ public class OverseerCollectionConfigSetProcessor extends OverseerTaskProcessor
overseer.getCollectionQueue(zkStateReader.getZkClient(), stats),
Overseer.getRunningMap(zkStateReader.getZkClient()),
Overseer.getCompletedMap(zkStateReader.getZkClient()),
- Overseer.getFailureMap(zkStateReader.getZkClient())
+ Overseer.getFailureMap(zkStateReader.getZkClient()),
+ solrMetricsContext
);
}
@@ -62,7 +64,7 @@ public class OverseerCollectionConfigSetProcessor extends OverseerTaskProcessor
OverseerTaskQueue workQueue,
DistributedMap runningMap,
DistributedMap completedMap,
- DistributedMap failureMap) {
+ DistributedMap failureMap, SolrMetricsContext solrMetricsContext) {
super(
zkStateReader,
myId,
@@ -73,7 +75,8 @@ public class OverseerCollectionConfigSetProcessor extends OverseerTaskProcessor
workQueue,
runningMap,
completedMap,
- failureMap);
+ failureMap,
+ solrMetricsContext);
}
private static OverseerMessageHandlerSelector getOverseerMessageHandlerSelector(
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
index cf86033..f7fec75 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
@@ -46,6 +46,7 @@ import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.util.Utils;
import org.apache.solr.logging.MDCLoggingContext;
import org.apache.solr.common.util.SolrNamedThreadFactory;
+import org.apache.solr.metrics.SolrMetricsContext;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
@@ -94,6 +95,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
private boolean isClosed;
private volatile Stats stats;
+ private SolrMetricsContext overseerTaskProcessorMetricsContext;
// Set of tasks that have been picked up for processing but not cleaned up from zk work-queue.
// It may contain tasks that have completed execution, have been entered into the completed/failed map in zk but not
@@ -126,13 +128,14 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
private String thisNode;
public OverseerTaskProcessor(ZkStateReader zkStateReader, String myId,
- Stats stats,
- OverseerMessageHandlerSelector selector,
- OverseerNodePrioritizer prioritizer,
- OverseerTaskQueue workQueue,
- DistributedMap runningMap,
- DistributedMap completedMap,
- DistributedMap failureMap) {
+ Stats stats,
+ OverseerMessageHandlerSelector selector,
+ OverseerNodePrioritizer prioritizer,
+ OverseerTaskQueue workQueue,
+ DistributedMap runningMap,
+ DistributedMap completedMap,
+ DistributedMap failureMap,
+ SolrMetricsContext solrMetricsContext) {
this.zkStateReader = zkStateReader;
this.myId = myId;
this.stats = stats;
@@ -146,6 +149,9 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
this.runningTasks = new HashSet<>();
this.completedTasks = new HashMap<>();
thisNode = Utils.getMDCNode();
+
+ overseerTaskProcessorMetricsContext = solrMetricsContext.getChildContext(this);
+ overseerTaskProcessorMetricsContext.gauge(null, () -> workQueue.getZkStats().getQueueLength(), true, "collectionWorkQueueSize", "queue");
}
@Override
@@ -373,6 +379,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
public void close() {
isClosed = true;
+ overseerTaskProcessorMetricsContext.unregister();
if (tpe != null) {
if (!tpe.isShutdown()) {
ExecutorUtil.shutdownAndAwaitTermination(tpe);
diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java
index fa2ed4e..990f31e 100644
--- a/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/OverseerCollectionConfigSetProcessorTest.java
@@ -65,6 +65,7 @@ import org.apache.solr.core.CoreContainer;
import org.apache.solr.handler.component.HttpShardHandler;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
import org.apache.solr.handler.component.ShardRequest;
+import org.apache.solr.metrics.SolrMetricsContext;
import org.apache.solr.update.UpdateShardHandler;
import org.apache.solr.util.TimeOut;
import org.apache.zookeeper.CreateMode;
@@ -126,7 +127,8 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 {
private static CoreContainer coreContainerMock;
private static UpdateShardHandler updateShardHandlerMock;
private static HttpClient httpClientMock;
-
+ private static SolrMetricsContext solrMetricsContextMock;
+
private static ObjectCache objectCache;
private static AutoScalingConfig autoScalingConfig = new AutoScalingConfig(Collections.emptyMap());
private Map<String, byte[]> zkClientData = new HashMap<>();
@@ -150,8 +152,9 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 {
OverseerTaskQueue workQueue, DistributedMap runningMap,
Overseer overseer,
DistributedMap completedMap,
- DistributedMap failureMap) {
- super(zkStateReader, myId, shardHandlerFactory, adminPath, new Stats(), overseer, new OverseerNodePrioritizer(zkStateReader, overseer.getStateUpdateQueue(), adminPath, shardHandlerFactory, null), workQueue, runningMap, completedMap, failureMap);
+ DistributedMap failureMap,
+ SolrMetricsContext solrMetricsContext) {
+ super(zkStateReader, myId, shardHandlerFactory, adminPath, new Stats(), overseer, new OverseerNodePrioritizer(zkStateReader, overseer.getStateUpdateQueue(), adminPath, shardHandlerFactory, null), workQueue, runningMap, completedMap, failureMap, solrMetricsContext);
}
@Override
@@ -186,6 +189,7 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 {
coreContainerMock = mock(CoreContainer.class);
updateShardHandlerMock = mock(UpdateShardHandler.class);
httpClientMock = mock(HttpClient.class);
+ solrMetricsContextMock = mock(SolrMetricsContext.class);
}
@AfterClass
@@ -210,6 +214,7 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 {
coreContainerMock = null;
updateShardHandlerMock = null;
httpClientMock = null;
+ solrMetricsContextMock = null;
}
@Before
@@ -239,6 +244,7 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 {
reset(coreContainerMock);
reset(updateShardHandlerMock);
reset(httpClientMock);
+ reset(solrMetricsContextMock);
zkClientData.clear();
collectionsSet.clear();
@@ -507,7 +513,9 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 {
}}).when(distribStateManagerMock).makePath(anyString());
zkClientData.put("/configs/myconfig", new byte[1]);
-
+
+ when(solrMetricsContextMock.getChildContext(any(Object.class))).thenReturn(solrMetricsContextMock);
+
return liveNodes;
}
@@ -743,7 +751,7 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 {
underTest = new OverseerCollectionConfigSetProcessorToBeTested(zkStateReaderMock,
"1234", shardHandlerFactoryMock, ADMIN_PATH, workQueueMock, runningMapMock,
- overseerMock, completedMapMock, failureMapMock);
+ overseerMock, completedMapMock, failureMapMock, solrMetricsContextMock);
if (log.isInfoEnabled()) {
@@ -862,7 +870,7 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 {
testTemplate(numberOfNodes, numberOfNodesToCreateOn, createNodeListOptions, replicationFactor, numberOfSlices,
maxShardsPerNode, true);
}
-
+
@Test
public void testNoReplicationCollectionNotCreatedDueToMaxShardsPerNodeLimit()
throws Exception {
@@ -875,7 +883,7 @@ public class OverseerCollectionConfigSetProcessorTest extends SolrTestCaseJ4 {
testTemplate(numberOfNodes, numberOfNodesToCreateOn, createNodeListOptions, replicationFactor, numberOfSlices,
maxShardsPerNode, false);
}
-
+
@Test
public void testReplicationCollectionNotCreatedDueToMaxShardsPerNodeLimit()
throws Exception {
diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
index 339b657..49e7c813 100644
--- a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
@@ -78,6 +78,7 @@ import org.apache.solr.core.PluginInfo;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.component.HttpShardHandler;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
+import org.apache.solr.metrics.SolrMetricManager;
import org.apache.solr.update.UpdateShardHandler;
import org.apache.solr.update.UpdateShardHandlerConfig;
import org.apache.solr.util.TimeOut;
@@ -1427,6 +1428,8 @@ public class OverseerTest extends SolrTestCaseJ4 {
CoreContainer mockAlwaysUpCoreContainer = mock(CoreContainer.class,
Mockito.withSettings().defaultAnswer(Mockito.CALLS_REAL_METHODS));
+ SolrMetricManager mockMetricManager = mock(SolrMetricManager.class);
+ when(mockAlwaysUpCoreContainer.getMetricManager()).thenReturn(mockMetricManager);
when(mockAlwaysUpCoreContainer.isShutDown()).thenReturn(testDone); // Allow retry on session expiry
when(mockAlwaysUpCoreContainer.getResourceLoader()).thenReturn(new SolrResourceLoader(createTempDir()));
FieldSetter.setField(zkController, ZkController.class.getDeclaredField("zkClient"), zkClient);
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
index 06b3829..036f140 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkControllerTest.java
@@ -38,6 +38,7 @@ import org.apache.solr.common.util.Utils;
import org.apache.solr.core.*;
import org.apache.solr.handler.admin.CoreAdminHandler;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
+import org.apache.solr.metrics.SolrMetricManager;
import org.apache.solr.update.UpdateShardHandler;
import org.apache.solr.update.UpdateShardHandlerConfig;
import org.apache.solr.util.LogLevel;
@@ -49,6 +50,7 @@ import org.junit.Test;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA;
+import static org.mockito.Mockito.mock;
@Slow
@SolrTestCaseJ4.SuppressSSL
@@ -356,6 +358,7 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
private static class MockCoreContainer extends CoreContainer {
UpdateShardHandler updateShardHandler = new UpdateShardHandler(UpdateShardHandlerConfig.DEFAULT);
+ SolrMetricManager metricManager;
public MockCoreContainer() {
super(SolrXmlConfig.fromString(TEST_PATH(), "<solr/>"));
@@ -363,6 +366,7 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
httpShardHandlerFactory.init(new PluginInfo("shardHandlerFactory", Collections.emptyMap()));
this.shardHandlerFactory = httpShardHandlerFactory;
this.coreAdminHandler = new CoreAdminHandler();
+ this.metricManager = mock(SolrMetricManager.class);
}
@Override
@@ -379,5 +383,10 @@ public class ZkControllerTest extends SolrTestCaseJ4 {
updateShardHandler.close();
super.shutdown();
}
- }
+
+ @Override
+ public SolrMetricManager getMetricManager() {
+ return metricManager;
+ }
+ }
}
diff --git a/solr/solr-ref-guide/src/metrics-reporting.adoc b/solr/solr-ref-guide/src/metrics-reporting.adoc
index e27b7a9..5782aac 100644
--- a/solr/solr-ref-guide/src/metrics-reporting.adoc
+++ b/solr/solr-ref-guide/src/metrics-reporting.adoc
@@ -57,6 +57,12 @@ This registry is returned at `solr.jvm` and includes the following information.
* System properties such as Java information, various installation directory paths, ports, and similar information. You can control what appears here by modifying `solr.xml`.
// TODO for 7.0 fix this
+=== Overseer Registry
+
+This registry is returned at `solr.overseer` when run in SolrCloud mode and includes the following information. When making requests with the <<Metrics API>>, you can specify `&group=overseer` to limit to only these metrics.
+
+* size of the Overseer queues (collection work queue and cluster state update queue)
+
=== Node / CoreContainer Registry
This registry is returned at `solr.node` and includes the following information. When making requests with the <<Metrics API>>, you can specify `&group=node` to limit to only these metrics.