You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2018/05/22 19:57:57 UTC

lucene-solr:jira/solr-11779: SOLR-11779: Collect and track aggregated metrics. Support configuration via clusterprops.json and solr.xml.

Repository: lucene-solr
Updated Branches:
  refs/heads/jira/solr-11779 fc486f208 -> fb1067e44


SOLR-11779: Collect and track aggregated metrics. Support configuration via clusterprops.json
and solr.xml.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/fb1067e4
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/fb1067e4
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/fb1067e4

Branch: refs/heads/jira/solr-11779
Commit: fb1067e447712b4b37c7388df03a96c75e94f5b0
Parents: fc486f2
Author: Andrzej Bialecki <ab...@apache.org>
Authored: Tue May 22 21:55:56 2018 +0200
Committer: Andrzej Bialecki <ab...@apache.org>
Committed: Tue May 22 21:55:56 2018 +0200

----------------------------------------------------------------------
 .../org/apache/solr/core/CoreContainer.java     |  12 +-
 .../org/apache/solr/core/MetricsConfig.java     |  17 +-
 .../org/apache/solr/core/SolrXmlConfig.java     |   4 +
 .../handler/admin/MetricsHistoryHandler.java    | 294 +++++++++++++++++--
 .../solr/metrics/rrd/SolrRrdBackendFactory.java |  30 +-
 .../cloud/MetricsHistoryIntegrationTest.java    |   6 +-
 .../cloud/autoscaling/sim/LiveNodesSet.java     |   8 +-
 .../cloud/autoscaling/sim/SimCloudManager.java  |  37 +--
 .../sim/SimClusterStateProvider.java            |  60 +++-
 .../admin/MetricsHistoryHandlerTest.java        |  11 +-
 .../metrics/rrd/SolrRrdBackendFactoryTest.java  |   7 +-
 .../solr/util/MockSearchableSolrClient.java     |   2 +-
 12 files changed, 411 insertions(+), 77 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/java/org/apache/solr/core/CoreContainer.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index ba36546..aecdfd5 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -100,7 +100,6 @@ import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.metrics.SolrCoreMetricManager;
 import org.apache.solr.metrics.SolrMetricManager;
 import org.apache.solr.metrics.SolrMetricProducer;
-import org.apache.solr.metrics.rrd.SolrRrdBackendFactory;
 import org.apache.solr.request.SolrRequestHandler;
 import org.apache.solr.search.SolrFieldCacheBean;
 import org.apache.solr.security.AuthenticationPlugin;
@@ -553,10 +552,17 @@ public class CoreContainer {
     metricsHandler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), metricTag, METRICS_PATH);
 
     if (isZooKeeperAware()) {
+      PluginInfo plugin = cfg.getMetricsConfig().getHistoryHandler();
+      Map<String, Object> initArgs;
+      if (plugin != null && plugin.initArgs != null) {
+        initArgs = plugin.initArgs.asMap(5);
+        initArgs.put(MetricsHistoryHandler.ENABLE_PROP, plugin.isEnabled());
+      } else {
+        initArgs = Collections.emptyMap();
+      }
       metricsHistoryHandler = new MetricsHistoryHandler(getZkController().getNodeName(), metricsHandler,
           new CloudSolrClient.Builder(Collections.singletonList(getZkController().getZkServerAddress()), Optional.empty())
-      .withHttpClient(updateShardHandler.getDefaultHttpClient()).build(), getZkController().getSolrCloudManager(),
-          MetricsHistoryHandler.DEFAULT_COLLECT_PERIOD, SolrRrdBackendFactory.DEFAULT_SYNC_PERIOD);
+      .withHttpClient(updateShardHandler.getDefaultHttpClient()).build(), getZkController().getSolrCloudManager(), initArgs);
       containerHandlers.put(METRICS_HISTORY_PATH, metricsHistoryHandler);
       metricsHistoryHandler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), metricTag, METRICS_HISTORY_PATH);
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/java/org/apache/solr/core/MetricsConfig.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/MetricsConfig.java b/solr/core/src/java/org/apache/solr/core/MetricsConfig.java
index 796483b..fab2553 100644
--- a/solr/core/src/java/org/apache/solr/core/MetricsConfig.java
+++ b/solr/core/src/java/org/apache/solr/core/MetricsConfig.java
@@ -30,16 +30,19 @@ public class MetricsConfig {
   private final PluginInfo meterSupplier;
   private final PluginInfo timerSupplier;
   private final PluginInfo histogramSupplier;
+  private final PluginInfo historyHandler;
 
   private MetricsConfig(PluginInfo[] metricReporters, Set<String> hiddenSysProps,
                         PluginInfo counterSupplier, PluginInfo meterSupplier,
-                        PluginInfo timerSupplier, PluginInfo histogramSupplier) {
+                        PluginInfo timerSupplier, PluginInfo histogramSupplier,
+                        PluginInfo historyHandler) {
     this.metricReporters = metricReporters;
     this.hiddenSysProps = hiddenSysProps;
     this.counterSupplier = counterSupplier;
     this.meterSupplier = meterSupplier;
     this.timerSupplier = timerSupplier;
     this.histogramSupplier = histogramSupplier;
+    this.historyHandler = historyHandler;
   }
 
   public PluginInfo[] getMetricReporters() {
@@ -66,6 +69,10 @@ public class MetricsConfig {
     return histogramSupplier;
   }
 
+  public PluginInfo getHistoryHandler() {
+    return historyHandler;
+  }
+
   public static class MetricsConfigBuilder {
     private PluginInfo[] metricReporterPlugins = new PluginInfo[0];
     private Set<String> hiddenSysProps = new HashSet<>();
@@ -73,6 +80,7 @@ public class MetricsConfig {
     private PluginInfo meterSupplier;
     private PluginInfo timerSupplier;
     private PluginInfo histogramSupplier;
+    private PluginInfo historyHandler;
 
     public MetricsConfigBuilder() {
 
@@ -111,9 +119,14 @@ public class MetricsConfig {
       return this;
     }
 
+    public MetricsConfigBuilder setHistoryHandler(PluginInfo info) {
+      this.historyHandler = info;
+      return this;
+    }
+
     public MetricsConfig build() {
       return new MetricsConfig(metricReporterPlugins, hiddenSysProps, counterSupplier, meterSupplier,
-          timerSupplier, histogramSupplier);
+          timerSupplier, histogramSupplier, historyHandler);
     }
 
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/java/org/apache/solr/core/SolrXmlConfig.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/core/SolrXmlConfig.java b/solr/core/src/java/org/apache/solr/core/SolrXmlConfig.java
index 9737c09..64fe731 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrXmlConfig.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrXmlConfig.java
@@ -483,6 +483,10 @@ public class SolrXmlConfig {
     if (node != null) {
       builder = builder.setHistogramSupplier(new PluginInfo(node, "histogramSupplier", false, false));
     }
+    node = config.getNode("solr/metrics/history", false);
+    if (node != null) {
+      builder = builder.setHistoryHandler(new PluginInfo(node, "history", false, false));
+    }
     PluginInfo[] reporterPlugins = getMetricReporterPluginInfos(config);
     Set<String> hiddenSysProps = getHiddenSysProps(config);
     return builder

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/java/org/apache/solr/handler/admin/MetricsHistoryHandler.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/MetricsHistoryHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/MetricsHistoryHandler.java
index 432ad7e..5a48523 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/MetricsHistoryHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/MetricsHistoryHandler.java
@@ -28,17 +28,21 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Set;
 import java.util.TimeZone;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
+import java.util.concurrent.atomic.DoubleAdder;
 import java.util.function.Function;
 import java.util.stream.Stream;
 
@@ -46,11 +50,17 @@ import com.google.common.annotations.VisibleForTesting;
 import org.apache.solr.api.Api;
 import org.apache.solr.api.ApiBag;
 import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.cloud.NodeStateProvider;
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
+import org.apache.solr.client.solrj.cloud.autoscaling.ReplicaInfo;
+import org.apache.solr.client.solrj.cloud.autoscaling.VersionedData;
+import org.apache.solr.cloud.Overseer;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.params.CollectionAdminParams;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
@@ -67,6 +77,7 @@ import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.security.AuthorizationContext;
 import org.apache.solr.security.PermissionNameProvider;
 import org.apache.solr.util.DefaultSolrThreadFactory;
+import org.apache.zookeeper.KeeperException;
 import org.rrd4j.ConsolFun;
 import org.rrd4j.DsType;
 import org.rrd4j.core.ArcDef;
@@ -84,6 +95,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import static java.util.stream.Collectors.toMap;
+import static org.apache.solr.common.params.CommonParams.ID;
 
 /**
  *
@@ -107,38 +119,77 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss
     add("os.systemLoadAverage");
   }};
 
+  public static final String NUM_SHARDS_KEY = "numShards";
+  public static final String NUM_REPLICAS_KEY = "numReplicas";
+
+  public static final List<String> DEFAULT_COLLECTION_GAUGES = new ArrayList<String>() {{
+    add(NUM_SHARDS_KEY);
+    add(NUM_REPLICAS_KEY);
+  }};
+
+  public static final String COLLECT_PERIOD_PROP = "collectPeriod";
+  public static final String SYNC_PERIOD_PROP = "syncPeriod";
+  public static final String ENABLE_PROP = "enable";
+  public static final String ENABLE_REPLICAS_PROP = "enableReplicas";
+  public static final String ENABLE_NODES_PROP = "enableNodes";
+
   public static final int DEFAULT_COLLECT_PERIOD = 60;
   public static final String URI_PREFIX = "solr:";
 
   private final SolrRrdBackendFactory factory;
+  private final String nodeName;
   private final SolrClient solrClient;
   private final MetricsHandler metricsHandler;
   private final SolrCloudManager cloudManager;
-  private final ScheduledThreadPoolExecutor collectService;
   private final TimeSource timeSource;
   private final int collectPeriod;
   private final Map<String, List<String>> counters = new HashMap<>();
   private final Map<String, List<String>> gauges = new HashMap<>();
 
+  private ScheduledThreadPoolExecutor collectService;
   private boolean logMissingCollection = true;
+  private boolean enable;
+  private boolean enableReplicas;
+  private boolean enableNodes;
   private String versionString;
 
   public MetricsHistoryHandler(String nodeName, MetricsHandler metricsHandler,
-                               SolrClient solrClient, SolrCloudManager cloudManager, int collectPeriod, int syncPeriod) {
-    factory = new SolrRrdBackendFactory(nodeName, solrClient, CollectionAdminParams.SYSTEM_COLL,
+        SolrClient solrClient, SolrCloudManager cloudManager, Map<String, Object> pluginArgs) {
+
+    Map<String, Object> args = new HashMap<>();
+    // init from optional solr.xml config
+    if (pluginArgs != null) {
+      args.putAll(pluginArgs);
+    }
+    // override from ZK
+    Map<String, Object> props = (Map<String, Object>)cloudManager.getClusterStateProvider()
+        .getClusterProperty("metrics", Collections.emptyMap())
+        .getOrDefault("history", Collections.emptyMap());
+    args.putAll(props);
+
+    this.nodeName = nodeName;
+    this.enable = Boolean.parseBoolean(String.valueOf(args.getOrDefault(ENABLE_PROP, "true")));
+    // default to false - don't collect local per-replica metrics
+    this.enableReplicas = Boolean.parseBoolean(String.valueOf(args.getOrDefault(ENABLE_REPLICAS_PROP, "false")));
+    this.enableNodes = Boolean.parseBoolean(String.valueOf(args.getOrDefault(ENABLE_NODES_PROP, "false")));
+    this.collectPeriod = Integer.parseInt(String.valueOf(args.getOrDefault(COLLECT_PERIOD_PROP, DEFAULT_COLLECT_PERIOD)));
+    int syncPeriod = Integer.parseInt(String.valueOf(args.getOrDefault(SYNC_PERIOD_PROP, SolrRrdBackendFactory.DEFAULT_SYNC_PERIOD)));
+
+    factory = new SolrRrdBackendFactory(solrClient, CollectionAdminParams.SYSTEM_COLL,
             syncPeriod, cloudManager.getTimeSource());
     this.solrClient = solrClient;
     this.metricsHandler = metricsHandler;
     this.cloudManager = cloudManager;
-    this.collectPeriod = collectPeriod;
     this.timeSource = cloudManager.getTimeSource();
 
     counters.put(Group.core.toString(), DEFAULT_CORE_COUNTERS);
     counters.put(Group.node.toString(), Collections.emptyList());
     counters.put(Group.jvm.toString(), Collections.emptyList());
+    counters.put(Group.collection.toString(), Collections.emptyList());
     gauges.put(Group.core.toString(), DEFAULT_CORE_GAUGES);
     gauges.put(Group.node.toString(), DEFAULT_NODE_GAUGES);
     gauges.put(Group.jvm.toString(), DEFAULT_JVM_GAUGES);
+    gauges.put(Group.collection.toString(), DEFAULT_COLLECTION_GAUGES);
 
     versionString = this.getClass().getPackage().getImplementationVersion();
     if (versionString == null) {
@@ -148,14 +199,16 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss
       versionString = versionString.substring(0, 24) + "...";
     }
 
-    collectService = (ScheduledThreadPoolExecutor) Executors.newScheduledThreadPool(1,
-        new DefaultSolrThreadFactory("MetricsHistoryHandler"));
-    collectService.setRemoveOnCancelPolicy(true);
-    collectService.setExecuteExistingDelayedTasksAfterShutdownPolicy(false);
-    collectService.scheduleWithFixedDelay(() -> collectMetrics(),
-        timeSource.convertDelay(TimeUnit.SECONDS, collectPeriod, TimeUnit.MILLISECONDS),
-        timeSource.convertDelay(TimeUnit.SECONDS, collectPeriod, TimeUnit.MILLISECONDS),
-        TimeUnit.MILLISECONDS);
+    if (enable) {
+      collectService = (ScheduledThreadPoolExecutor) Executors.newScheduledThreadPool(1,
+          new DefaultSolrThreadFactory("MetricsHistoryHandler"));
+      collectService.setRemoveOnCancelPolicy(true);
+      collectService.setExecuteExistingDelayedTasksAfterShutdownPolicy(false);
+      collectService.scheduleWithFixedDelay(() -> collectMetrics(),
+          timeSource.convertDelay(TimeUnit.SECONDS, collectPeriod, TimeUnit.MILLISECONDS),
+          timeSource.convertDelay(TimeUnit.SECONDS, collectPeriod, TimeUnit.MILLISECONDS),
+          TimeUnit.MILLISECONDS);
+    }
   }
 
   public SolrClient getSolrClient() {
@@ -167,6 +220,36 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss
     return factory;
   }
 
+  private boolean isOverseerLeader() {
+    ZkNodeProps props = null;
+    try {
+      VersionedData data = cloudManager.getDistribStateManager().getData(
+          Overseer.OVERSEER_ELECT + "/leader");
+      if (data != null && data.getData() != null) {
+        props = ZkNodeProps.load(data.getData());
+      }
+    } catch (KeeperException | IOException | NoSuchElementException e) {
+      log.warn("Could not obtain overseer's address, skipping.", e);
+      return false;
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+      return false;
+    }
+    if (props == null) {
+      return false;
+    }
+    String oid = props.getStr(ID);
+    if (oid == null) {
+      return false;
+    }
+    String[] ids = oid.split("-");
+    if (ids.length != 3) { // unknown format
+      log.warn("Unknown format of leader id, skipping: " + oid);
+      return false;
+    }
+    return nodeName.equals(ids[1]);
+  }
+
   private void collectMetrics() {
     if (metricManager == null) {
       // not inited yet
@@ -205,11 +288,24 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss
     }
     logMissingCollection = true;
     // get metrics
-    for (Group group : Arrays.asList(Group.jvm, Group.core, Group.node)) {
+    collectLocalReplicaMetrics();
+    collectGlobalMetrics();
+  }
+
+  private void collectLocalReplicaMetrics() {
+    List<Group> groups = new ArrayList<>();
+    if (enableNodes) {
+      groups.add(Group.jvm);
+      groups.add(Group.node);
+    }
+    if (enableReplicas) {
+      groups.add(Group.core);
+    }
+    for (Group group : groups) {
       if (Thread.interrupted()) {
         return;
       }
-      log.debug("--  collecting " + group + "...");
+      log.debug("--  collecting local " + group + "...");
       ModifiableSolrParams params = new ModifiableSolrParams();
       params.add(MetricsHandler.GROUP_PARAM, group.toString());
       params.add(MetricsHandler.COMPACT_PARAM, "true");
@@ -226,7 +322,11 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss
         if (nl != null) {
           for (Iterator<Map.Entry<String, Object>> it = nl.iterator(); it.hasNext(); ) {
             Map.Entry<String, Object> entry = it.next();
-            final String registry = entry.getKey();
+            String reg = entry.getKey();
+            if (group != Group.core) { // add nodeName prefix
+              reg = reg + "." + nodeName;
+            }
+            final String registry = reg;
             RrdDb db = metricManager.getOrCreateMetricHistory(registry, () -> {
               RrdDef def = createDef(registry, group);
               try {
@@ -279,12 +379,18 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss
     def.setStartTime(TimeUnit.SECONDS.convert(timeSource.getEpochTimeNs(), TimeUnit.NANOSECONDS) - def.getStep());
 
     // add datasources
-
-    // use NaN when more than 1 sample is missing
-    counters.get(group.toString()).forEach(c ->
-        def.addDatasource(c, DsType.COUNTER, collectPeriod * 2, Double.NaN, Double.NaN));
-    gauges.get(group.toString()).forEach(g ->
-        def.addDatasource(g, DsType.GAUGE, collectPeriod * 2, Double.NaN, Double.NaN));
+    List<Group> groups = new ArrayList<>();
+    groups.add(group);
+    if (group == Group.collection) {
+      groups.add(Group.core);
+    }
+    for (Group g : groups) {
+      // use NaN when more than 1 sample is missing
+      counters.get(g.toString()).forEach(name ->
+          def.addDatasource(name, DsType.COUNTER, collectPeriod * 2, Double.NaN, Double.NaN));
+      gauges.get(g.toString()).forEach(name ->
+          def.addDatasource(name, DsType.GAUGE, collectPeriod * 2, Double.NaN, Double.NaN));
+    }
 
     // add archives
 
@@ -298,6 +404,152 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss
     return def;
   }
 
+  private void collectGlobalMetrics() {
+    if (!isOverseerLeader()) {
+      return;
+    }
+    Set<String> nodes = new HashSet<>(cloudManager.getClusterStateProvider().getLiveNodes());
+    NodeStateProvider nodeStateProvider = cloudManager.getNodeStateProvider();
+    Set<String> collTags = new HashSet<>();
+    collTags.addAll(counters.get(Group.core.toString()));
+    collTags.addAll(gauges.get(Group.core.toString()));
+
+    Set<String> nodeTags = new HashSet<>();
+    String nodePrefix = "metrics:" + SolrMetricManager.getRegistryName(Group.node) + ":";
+    counters.get(Group.node.toString()).forEach(name -> {
+      nodeTags.add(nodePrefix + name);
+    });
+    gauges.get(Group.node.toString()).forEach(name -> {
+      nodeTags.add(nodePrefix + name);
+    });
+    String jvmPrefix = "metrics:" + SolrMetricManager.getRegistryName(Group.jvm) + ":";
+    counters.get(Group.jvm.toString()).forEach(name -> {
+      nodeTags.add(jvmPrefix + name);
+    });
+    gauges.get(Group.jvm.toString()).forEach(name -> {
+      nodeTags.add(jvmPrefix + name);
+    });
+
+    // per-registry totals
+    // XXX at the moment the type of metrics that we collect allows
+    // adding all partial values. At some point it may be necessary to implement
+    // other aggregation functions.
+    // group : registry : name : value
+    Map<Group, Map<String, Map<String, Number>>> totals = new HashMap<>();
+
+    // collect and aggregate per-collection totals
+    for (String node : nodes) {
+      if (cloudManager.isClosed() || Thread.interrupted()) {
+        return;
+      }
+      // add core-level stats
+      Map<String, Map<String, List<ReplicaInfo>>> infos = nodeStateProvider.getReplicaInfo(node, collTags);
+      infos.forEach((coll, shards) -> {
+        shards.forEach((sh, replicas) -> {
+          String registry = SolrMetricManager.getRegistryName(Group.collection, coll);
+          Map<String, Number> perReg = totals
+              .computeIfAbsent(Group.collection, g -> new HashMap<>())
+              .computeIfAbsent(registry, r -> new HashMap<>());
+          replicas.forEach(ri -> {
+            collTags.forEach(tag -> {
+              double value = ((Number)ri.getVariable(tag, 0.0)).doubleValue();
+              DoubleAdder adder = (DoubleAdder)perReg.computeIfAbsent(tag, t -> new DoubleAdder());
+              adder.add(value);
+            });
+          });
+        });
+      });
+      // add node-level stats
+      Map<String, Object> nodeValues = nodeStateProvider.getNodeValues(node, nodeTags);
+      for (Group g : Arrays.asList(Group.node, Group.jvm)) {
+        String registry = SolrMetricManager.getRegistryName(g);
+        Map<String, Number> perReg = totals
+            .computeIfAbsent(g, gr -> new HashMap<>())
+            .computeIfAbsent(registry, r -> new HashMap<>());
+        Set<String> names = new HashSet<>();
+        names.addAll(counters.get(g.toString()));
+        names.addAll(gauges.get(g.toString()));
+        names.forEach(name -> {
+          String tag = "metrics:" + registry + ":" + name;
+          double value = ((Number)nodeValues.getOrDefault(tag, 0.0)).doubleValue();
+          DoubleAdder adder = (DoubleAdder)perReg.computeIfAbsent(name, t -> new DoubleAdder());
+          adder.add(value);
+        });
+      }
+    }
+    // add some global collection-level stats
+    try {
+      ClusterState state = cloudManager.getClusterStateProvider().getClusterState();
+      state.forEachCollection(coll -> {
+        String registry = SolrMetricManager.getRegistryName(Group.collection, coll.getName());
+        Map<String, Number> perReg = totals
+            .computeIfAbsent(Group.collection, g -> new HashMap<>())
+            .computeIfAbsent(registry, r -> new HashMap<>());
+        Collection<Slice> slices = coll.getActiveSlices();
+        perReg.put(NUM_SHARDS_KEY, slices.size());
+        DoubleAdder numActiveReplicas = new DoubleAdder();
+        slices.forEach(s -> {
+          s.forEach(r -> {
+            if (r.isActive(state.getLiveNodes())) {
+              numActiveReplicas.add(1.0);
+            }
+          });
+        });
+        perReg.put(NUM_REPLICAS_KEY, numActiveReplicas);
+      });
+    } catch (IOException e) {
+      log.warn("Exception getting cluster state", e);
+    }
+
+    // now update the db-s
+    totals.forEach((group, perGroup) -> {
+      perGroup.forEach((reg, perReg) -> {
+        RrdDb db = metricManager.getOrCreateMetricHistory(reg, () -> {
+          RrdDef def = createDef(reg, group);
+          try {
+            RrdDb newDb = new RrdDb(def, factory);
+            return newDb;
+          } catch (IOException e) {
+            return null;
+          }
+        });
+        if (db == null) {
+          return;
+        }
+        try {
+          // set the timestamp
+          Sample s = db.createSample(TimeUnit.SECONDS.convert(timeSource.getEpochTimeNs(), TimeUnit.NANOSECONDS));
+          AtomicBoolean dirty = new AtomicBoolean(false);
+          List<Group> groups = new ArrayList<>();
+          groups.add(group);
+          if (group == Group.collection) {
+            groups.add(Group.core);
+          }
+          for (Group g : groups) {
+            counters.get(g.toString()).forEach(c -> {
+              Number val = perReg.get(c);
+              if (val != null) {
+                dirty.set(true);
+                s.setValue(c, val.doubleValue());
+              }
+            });
+            gauges.get(g.toString()).forEach(c -> {
+              Number val = perReg.get(c);
+              if (val != null) {
+                dirty.set(true);
+                s.setValue(c, val.doubleValue());
+              }
+            });
+          }
+          if (dirty.get()) {
+            s.update();
+          }
+        } catch (Exception e) {
+        }
+      });
+    });
+  }
+
   @Override
   public void close() {
     log.debug("Closing " + hashCode());

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackendFactory.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackendFactory.java b/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackendFactory.java
index 4120d97..f2cd7a0 100644
--- a/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackendFactory.java
+++ b/solr/core/src/java/org/apache/solr/metrics/rrd/SolrRrdBackendFactory.java
@@ -75,9 +75,7 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos
   public static final String DOC_TYPE = "metrics_rrd";
 
   public static final String DATA_FIELD = "data_bin";
-  public static final String NODE_FIELD = "node_s";
 
-  private final String nodeName;
   private final SolrClient solrClient;
   private final TimeSource timeSource;
   private final String collection;
@@ -90,23 +88,20 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos
 
   /**
    * Create a factory.
-   * @param nodeName node name. Documents are stored in a distributed collection and
-   *                 this parameter is needed to avoid namespace conflicts.
    * @param solrClient SolrClient to use
-   * @param collection collection name where documents are stored (typicall this is
+   * @param collection collection name where documents are stored (typically this is
    *                   {@link CollectionAdminParams#SYSTEM_COLL})
    * @param syncPeriod synchronization period in seconds - how often modified
    *                   databases are stored as updated Solr documents
    * @param timeSource time source
    */
-  public SolrRrdBackendFactory(String nodeName, SolrClient solrClient, String collection, int syncPeriod, TimeSource timeSource) {
-    this.nodeName = nodeName;
+  public SolrRrdBackendFactory(SolrClient solrClient, String collection, int syncPeriod, TimeSource timeSource) {
     this.solrClient = solrClient;
     this.timeSource = timeSource;
     this.collection = collection;
     this.syncPeriod = syncPeriod;
     log.debug("Created " + hashCode());
-    this.idPrefixLength = ID_PREFIX.length() + ID_SEP.length() + nodeName.length() + ID_SEP.length();
+    this.idPrefixLength = ID_PREFIX.length() + ID_SEP.length();
     syncService = (ScheduledThreadPoolExecutor) Executors.newScheduledThreadPool(2,
         new DefaultSolrThreadFactory("SolrRrdBackendFactory"));
     syncService.setRemoveOnCancelPolicy(true);
@@ -117,10 +112,6 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos
         TimeUnit.MILLISECONDS);
   }
 
-  public String getNodeName() {
-    return nodeName;
-  }
-
   private void ensureOpen() throws IOException {
     if (closed) {
       throw new IOException("Factory already closed");
@@ -191,7 +182,7 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos
   byte[] getData(String path) throws IOException {
     try {
       ModifiableSolrParams params = new ModifiableSolrParams();
-      params.add(CommonParams.Q, "{!term f=id}" + ID_PREFIX + ID_SEP + nodeName + ID_SEP + path);
+      params.add(CommonParams.Q, "{!term f=id}" + ID_PREFIX + ID_SEP + path);
       params.add(CommonParams.FQ, CommonParams.TYPE + ":" + DOC_TYPE);
       QueryResponse rsp = solrClient.query(collection, params);
       SolrDocumentList docs = rsp.getResults();
@@ -232,7 +223,6 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos
       ModifiableSolrParams params = new ModifiableSolrParams();
       params.add(CommonParams.Q, "*:*");
       params.add(CommonParams.FQ, CommonParams.TYPE + ":" + DOC_TYPE);
-      params.add(CommonParams.FQ, "{!term f=" + NODE_FIELD + "}:" + nodeName);
       params.add(CommonParams.FL, "id");
       params.add(CommonParams.ROWS, String.valueOf(maxLength));
       QueryResponse rsp = solrClient.query(collection, params);
@@ -263,9 +253,7 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos
     // remove all Solr docs
     try {
       solrClient.deleteByQuery(collection,
-          "{!term f=" + CommonParams.TYPE + "}:" + DOC_TYPE +
-          " AND {!term f=" + NODE_FIELD + "}:" + nodeName,
-          syncPeriod * 1000);
+          "{!term f=" + CommonParams.TYPE + "}:" + DOC_TYPE, syncPeriod * 1000);
     } catch (SolrServerException e) {
       log.warn("Error deleting RRDs", e);
     }
@@ -283,7 +271,7 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos
     }
     // remove Solr doc
     try {
-      solrClient.deleteByQuery(collection, "{!term f=id}" + ID_PREFIX + ID_SEP + nodeName + ID_SEP + path);
+      solrClient.deleteByQuery(collection, "{!term f=id}" + ID_PREFIX + ID_SEP + path);
     } catch (SolrServerException e) {
       log.warn("Error deleting RRD for path " + path, e);
     }
@@ -312,9 +300,8 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos
     try {
       syncData.forEach((path, data) -> {
         SolrInputDocument doc = new SolrInputDocument();
-        doc.setField("id", ID_PREFIX + ID_SEP + nodeName + ID_SEP + path);
+        doc.setField("id", ID_PREFIX + ID_SEP + path);
         doc.addField(CommonParams.TYPE, DOC_TYPE);
-        doc.addField(NODE_FIELD, nodeName);
         doc.addField(DATA_FIELD, data);
         doc.setField("timestamp", new Date(TimeUnit.MILLISECONDS.convert(timeSource.getEpochTimeNs(), TimeUnit.NANOSECONDS)));
         try {
@@ -357,9 +344,8 @@ public class SolrRrdBackendFactory extends RrdBackendFactory implements SolrClos
     }
     try {
       ModifiableSolrParams params = new ModifiableSolrParams();
-      params.add(CommonParams.Q, "{!term f=id}" + ID_PREFIX + ID_SEP + nodeName + ID_SEP + path);
+      params.add(CommonParams.Q, "{!term f=id}" + ID_PREFIX + ID_SEP + path);
       params.add(CommonParams.FQ, CommonParams.TYPE + ":" + DOC_TYPE);
-      params.add(CommonParams.FQ, "{!term f=" + NODE_FIELD + "}:" + nodeName);
       params.add(CommonParams.FL, "id");
       QueryResponse rsp = solrClient.query(collection, params);
       SolrDocumentList docs = rsp.getResults();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/test/org/apache/solr/cloud/MetricsHistoryIntegrationTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/MetricsHistoryIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/MetricsHistoryIntegrationTest.java
index e349628..001b294 100644
--- a/solr/core/src/test/org/apache/solr/cloud/MetricsHistoryIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/MetricsHistoryIntegrationTest.java
@@ -57,7 +57,7 @@ public class MetricsHistoryIntegrationTest extends SolrCloudTestCase {
 
   @BeforeClass
   public static void setupCluster() throws Exception {
-    boolean simulated = random().nextBoolean();
+    boolean simulated = random().nextBoolean() || true;
     if (simulated) {
       cloudManager = SimCloudManager.createCluster(1, TimeSource.get("simTime:50"));
       solrClient = ((SimCloudManager)cloudManager).simGetSolrClient();
@@ -93,13 +93,13 @@ public class MetricsHistoryIntegrationTest extends SolrCloudTestCase {
   public void testList() throws Exception {
     NamedList<Object> rsp = solrClient.request(createHistoryRequest(params(CommonParams.ACTION, "list")));
     assertNotNull(rsp);
-    // expected solr.jvm, solr.node and solr.core..system replica 1
+    // expected solr.jvm, solr.node and solr.collection..system
     List<String> lst = (List<String>)rsp.get("metrics");
     assertNotNull(lst);
     assertEquals(lst.toString(), 3, lst.size());
     assertTrue(lst.toString(), lst.contains("solr.jvm"));
     assertTrue(lst.toString(), lst.contains("solr.node"));
-    assertTrue(lst.toString(), lst.contains("solr.core..system.shard1.replica_n1"));
+    assertTrue(lst.toString(), lst.contains("solr.collection..system"));
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/LiveNodesSet.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/LiveNodesSet.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/LiveNodesSet.java
index ca4ed71..3f5d5f4 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/LiveNodesSet.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/LiveNodesSet.java
@@ -18,6 +18,7 @@ package org.apache.solr.cloud.autoscaling.sim;
 
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Iterator;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.TreeSet;
@@ -28,7 +29,7 @@ import org.apache.solr.common.cloud.LiveNodesListener;
 /**
  * This class represents a set of live nodes and allows adding listeners to track their state.
  */
-public class LiveNodesSet {
+public class LiveNodesSet implements Iterable<String> {
 
   private final Set<String> set = ConcurrentHashMap.newKeySet();
   private final Set<LiveNodesListener> listeners = ConcurrentHashMap.newKeySet();
@@ -100,4 +101,9 @@ public class LiveNodesSet {
     set.clear();
     fireListeners(oldNodes, Collections.emptySortedSet());
   }
+
+  @Override
+  public Iterator<String> iterator() {
+    return set.iterator(); // iterate the backing key set directly; no snapshot copy is made
+  }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java
index d7bb5b1..06f1a54 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimCloudManager.java
@@ -92,7 +92,6 @@ import org.apache.solr.metrics.AltBufferPoolMetricSet;
 import org.apache.solr.metrics.MetricsMap;
 import org.apache.solr.metrics.OperatingSystemMetricSet;
 import org.apache.solr.metrics.SolrMetricManager;
-import org.apache.solr.metrics.rrd.SolrRrdBackendFactory;
 import org.apache.solr.request.LocalSolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.util.DefaultSolrThreadFactory;
@@ -116,8 +115,7 @@ public class SimCloudManager implements SolrCloudManager {
   private final DistributedQueueFactory queueFactory;
   private final ObjectCache objectCache = new ObjectCache();
   private final SolrMetricManager metricManager = new SolrMetricManager();
-  private final MetricsHistoryHandler historyHandler;
-  private TimeSource timeSource;
+  private final String metricTag;
 
   private final List<SolrInputDocument> systemColl = Collections.synchronizedList(new ArrayList<>());
   private final MockSearchableSolrClient solrClient;
@@ -128,6 +126,8 @@ public class SimCloudManager implements SolrCloudManager {
   private Overseer.OverseerThread triggerThread;
   private ThreadGroup triggerThreadGroup;
   private SolrResourceLoader loader;
+  private MetricsHistoryHandler historyHandler;
+  private TimeSource timeSource;
 
   private static int nodeIdPort = 10000;
   public static int DEFAULT_DISK = 1000; // 1000 GB
@@ -159,9 +159,10 @@ public class SimCloudManager implements SolrCloudManager {
     stateManager.makePath(ZkStateReader.SOLR_AUTOSCALING_TRIGGER_STATE_PATH);
     stateManager.makePath(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH);
     stateManager.makePath(ZkStateReader.SOLR_AUTOSCALING_NODE_ADDED_PATH);
+    stateManager.makePath(Overseer.OVERSEER_ELECT);
 
     // register common metrics
-    String metricTag = Integer.toHexString(hashCode());
+    metricTag = Integer.toHexString(hashCode());
     String registryName = SolrMetricManager.getRegistryName(SolrInfoBean.Group.jvm);
     metricManager.registerAll(registryName, new AltBufferPoolMetricSet(), true, "buffers");
     metricManager.registerAll(registryName, new ClassLoadingGaugeSet(), true, "classes");
@@ -223,10 +224,6 @@ public class SimCloudManager implements SolrCloudManager {
     this.simCloudManagerPool = ExecutorUtil.newMDCAwareFixedThreadPool(200, new DefaultSolrThreadFactory("simCloudManagerPool"));
 
     this.autoScalingHandler = new AutoScalingHandler(this, loader);
-    MetricsHandler metricsHandler = new MetricsHandler(metricManager);
-    this.historyHandler = new MetricsHistoryHandler("1.0.0.1:1111_solr", metricsHandler, solrClient, this,
-        MetricsHistoryHandler.DEFAULT_COLLECT_PERIOD, SolrRrdBackendFactory.DEFAULT_SYNC_PERIOD);
-    this.historyHandler.initializeMetrics(metricManager, SolrMetricManager.getRegistryName(SolrInfoBean.Group.node), metricTag, CommonParams.METRICS_HISTORY_PATH);
 
 
     triggerThreadGroup = new ThreadGroup("Simulated Overseer autoscaling triggers");
@@ -247,13 +244,7 @@ public class SimCloudManager implements SolrCloudManager {
   public static SimCloudManager createCluster(int numNodes, TimeSource timeSource) throws Exception {
     SimCloudManager cloudManager = new SimCloudManager(timeSource);
     for (int i = 1; i <= numNodes; i++) {
-      Map<String, Object> values = createNodeValues(null);
-//      if (i == 1) { // designated Overseer ?
-        //values.put(ImplicitSnitch.NODEROLE, "overseer");
-//      }
-      String nodeId = (String)values.get(ImplicitSnitch.NODE);
-      cloudManager.getSimClusterStateProvider().simAddNode(nodeId);
-      cloudManager.getSimNodeStateProvider().simSetNodeValues(nodeId, values);
+      cloudManager.simAddNode();
     }
     return cloudManager;
   }
@@ -405,6 +396,12 @@ public class SimCloudManager implements SolrCloudManager {
     clusterStateProvider.simAddNode(nodeId);
     nodeStateProvider.simSetNodeValues(nodeId, values);
     LOG.trace("-- added node " + nodeId);
+    // initialize history handler if this is the first node
+    if (historyHandler == null && liveNodesSet.size() == 1) {
+      MetricsHandler metricsHandler = new MetricsHandler(metricManager);
+      historyHandler = new MetricsHistoryHandler(nodeId, metricsHandler, solrClient, this, Collections.emptyMap());
+      historyHandler.initializeMetrics(metricManager, SolrMetricManager.getRegistryName(SolrInfoBean.Group.node), metricTag, CommonParams.METRICS_HISTORY_PATH);
+    }
     return nodeId;
   }
 
@@ -672,7 +669,11 @@ public class SimCloudManager implements SolrCloudManager {
         if (autoscaling) {
           autoScalingHandler.handleRequest(queryRequest, queryResponse);
         } else {
-          historyHandler.handleRequest(queryRequest, queryResponse);
+          if (historyHandler != null) {
+            historyHandler.handleRequest(queryRequest, queryResponse);
+          } else {
+            throw new UnsupportedOperationException("must add at least 1 node first");
+          }
         }
         if (queryResponse.getException() != null) {
           LOG.debug("-- exception handling request", queryResponse.getException());
@@ -823,7 +824,9 @@ public class SimCloudManager implements SolrCloudManager {
 
   @Override
   public void close() throws IOException {
-    IOUtils.closeQuietly(historyHandler);
+    if (historyHandler != null) {
+      IOUtils.closeQuietly(historyHandler);
+    }
     IOUtils.closeQuietly(clusterStateProvider);
     IOUtils.closeQuietly(nodeStateProvider);
     IOUtils.closeQuietly(stateManager);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
index 4c23cc4..ca2dd48 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/sim/SimClusterStateProvider.java
@@ -29,6 +29,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.NoSuchElementException;
 import java.util.Random;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
@@ -50,10 +51,12 @@ import org.apache.solr.client.solrj.impl.ClusterStateProvider;
 import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.apache.solr.cloud.ActionThrottle;
+import org.apache.solr.cloud.Overseer;
 import org.apache.solr.cloud.api.collections.AddReplicaCmd;
 import org.apache.solr.cloud.api.collections.Assign;
 import org.apache.solr.cloud.api.collections.CreateCollectionCmd;
 import org.apache.solr.cloud.api.collections.CreateShardCmd;
+import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler;
 import org.apache.solr.cloud.api.collections.SplitShardCmd;
 import org.apache.solr.cloud.overseer.ClusterStateMutator;
 import org.apache.solr.cloud.overseer.CollectionMutator;
@@ -69,6 +72,7 @@ import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.ZkNodeProps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.cloud.rule.ImplicitSnitch;
+import org.apache.solr.common.params.CollectionAdminParams;
 import org.apache.solr.common.params.CollectionParams;
 import org.apache.solr.common.params.CommonAdminParams;
 import org.apache.solr.common.params.CoreAdminParams;
@@ -127,6 +131,8 @@ public class SimClusterStateProvider implements ClusterStateProvider {
 
 
   private volatile int clusterStateVersion = 0;
+  private volatile String overseerLeader = null;
+
   private Map<String, Object> lastSavedProperties = null;
 
   private AtomicReference<Map<String, DocCollection>> collectionsStatesRef = new AtomicReference<>();
@@ -236,6 +242,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
     }
     liveNodes.add(nodeId);
     createEphemeralLiveNode(nodeId);
+    updateOverseerLeader();
     nodeReplicaMap.putIfAbsent(nodeId, new ArrayList<>());
   }
 
@@ -257,6 +264,7 @@ public class SimClusterStateProvider implements ClusterStateProvider {
       }
       // remove ephemeral nodes
       stateManager.getRoot().removeEphemeralChildren(nodeId);
+      updateOverseerLeader();
       // create a nodeLost marker if needed
       AutoScalingConfig cfg = stateManager.getAutoScalingConfig(null);
       if (cfg.hasTriggerForEvents(TriggerEventType.NODELOST)) {
@@ -271,6 +279,35 @@ public class SimClusterStateProvider implements ClusterStateProvider {
     }
   }
 
+  private synchronized void updateOverseerLeader() throws Exception {
+    if (overseerLeader != null && liveNodes.contains(overseerLeader)) {
+      return; // current leader is still live, keep it
+    }
+    String path = Overseer.OVERSEER_ELECT + "/leader";
+    if (liveNodes.isEmpty()) {
+      overseerLeader = null;
+      // no live nodes remain: drop the leader record from the state manager
+      try {
+        cloudManager.getDistribStateManager().removeData(path, -1);
+      } catch (NoSuchElementException e) {
+        // presumably thrown when no leader record exists yet; nothing to remove then
+      }
+      return;
+    }
+    // pick whichever live node the set's iterator returns first
+    overseerLeader = liveNodes.iterator().next();
+    LOG.debug("--- new Overseer leader: " + overseerLeader);
+    // record it under the election path as an ephemeral node
+    Map<String, Object> id = new HashMap<>();
+    id.put("id", cloudManager.getTimeSource().getTimeNs() +
+        "-" + overseerLeader + "-n_0000000000");
+    try {
+      cloudManager.getDistribStateManager().makePath(path, Utils.toJSON(id), CreateMode.EPHEMERAL, false);
+    } catch (Exception e) {
+      LOG.warn("Exception saving overseer leader id", e); // best effort: the in-memory leader stays set
+    }
+  }
+
   // this method needs to be called under a lock
   private void setReplicaStates(String nodeId, Replica.State state, Set<String> changedCollections) {
     List<ReplicaInfo> replicas = nodeReplicaMap.get(nodeId);
@@ -1020,6 +1057,22 @@ public class SimClusterStateProvider implements ClusterStateProvider {
     }
   }
 
+  public synchronized void createSystemCollection() throws IOException {
+    try {
+      if (simListCollections().contains(CollectionAdminParams.SYSTEM_COLL)) {
+        return; // already present, nothing to create
+      }
+      ZkNodeProps props = new ZkNodeProps(
+          NAME, CollectionAdminParams.SYSTEM_COLL,
+          REPLICATION_FACTOR, "1",
+          OverseerCollectionMessageHandler.NUM_SLICES, "1"
+      );
+      simCreateCollection(props, new NamedList()); // the results list is not inspected
+    } catch (Exception e) {
+      throw new IOException(e); // any Exception is rewrapped so callers only see IOException
+    }
+  }
+
   /**
    * Simulate an update by modifying replica metrics.
    * The following core metrics are updated:
@@ -1044,7 +1097,12 @@ public class SimClusterStateProvider implements ClusterStateProvider {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection not set");
     }
     if (!simListCollections().contains(collection)) {
-      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection '" + collection + "' doesn't exist");
+      if (CollectionAdminParams.SYSTEM_COLL.equals(collection)) {
+        // auto-create
+        createSystemCollection();
+      } else {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Collection '" + collection + "' doesn't exist");
+      }
     }
     // always reset first to get the current metrics - it's easier than to keep matching
     // Replica with ReplicaInfo where the current real counts are stored

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/test/org/apache/solr/handler/admin/MetricsHistoryHandlerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/admin/MetricsHistoryHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/admin/MetricsHistoryHandlerTest.java
index e8e50ed..e1e230f 100644
--- a/solr/core/src/test/org/apache/solr/handler/admin/MetricsHistoryHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/admin/MetricsHistoryHandlerTest.java
@@ -17,7 +17,9 @@
 
 package org.apache.solr.handler.admin;
 
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.solr.client.solrj.SolrClient;
 import org.apache.solr.client.solrj.cloud.SolrCloudManager;
@@ -56,6 +58,9 @@ public class MetricsHistoryHandlerTest extends SolrCloudTestCase {
   @BeforeClass
   public static void beforeClass() throws Exception {
     simulated = random().nextBoolean();
+    Map<String, Object> args = new HashMap<>();
+    args.put(MetricsHistoryHandler.SYNC_PERIOD_PROP, 1);
+    args.put(MetricsHistoryHandler.COLLECT_PERIOD_PROP, 1);
     if (simulated) {
       SPEED = 50;
       cloudManager = SimCloudManager.createCluster(1, TimeSource.get("simTime:" + SPEED));
@@ -67,7 +72,8 @@ public class MetricsHistoryHandlerTest extends SolrCloudTestCase {
       solrClient = ((SimCloudManager)cloudManager).simGetSolrClient();
       // need to register the factory here, before we start the real cluster
       metricsHandler = new MetricsHandler(metricManager);
-      handler = new MetricsHistoryHandler("localhost:1234_solr", metricsHandler, solrClient, cloudManager, 1, 1);
+      handler = new MetricsHistoryHandler(cloudManager.getClusterStateProvider().getLiveNodes().iterator().next(),
+          metricsHandler, solrClient, cloudManager, args);
       handler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), "", CommonParams.METRICS_HISTORY_PATH);
     }
     configureCluster(1)
@@ -78,7 +84,7 @@ public class MetricsHistoryHandlerTest extends SolrCloudTestCase {
       metricManager = cluster.getJettySolrRunner(0).getCoreContainer().getMetricManager();
       solrClient = cluster.getSolrClient();
       metricsHandler = new MetricsHandler(metricManager);
-      handler = new MetricsHistoryHandler(cluster.getJettySolrRunner(0).getNodeName(), metricsHandler, solrClient, cloudManager, 1, 1);
+      handler = new MetricsHistoryHandler(cluster.getJettySolrRunner(0).getNodeName(), metricsHandler, solrClient, cloudManager, args);
       handler.initializeMetrics(metricManager, SolrInfoBean.Group.node.toString(), "", CommonParams.METRICS_HISTORY_PATH);
       SPEED = 1;
     }
@@ -106,6 +112,7 @@ public class MetricsHistoryHandlerTest extends SolrCloudTestCase {
   public void testBasic() throws Exception {
     timeSource.sleep(10000);
     List<String> list = handler.getFactory().list(100);
+    // solr.jvm, solr.node, solr.collection..system
     assertEquals(list.toString(), 3, list.size());
     for (String path : list) {
       RrdDb db = new RrdDb(MetricsHistoryHandler.URI_PREFIX + path, true, handler.getFactory());

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/test/org/apache/solr/metrics/rrd/SolrRrdBackendFactoryTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/metrics/rrd/SolrRrdBackendFactoryTest.java b/solr/core/src/test/org/apache/solr/metrics/rrd/SolrRrdBackendFactoryTest.java
index a4fd67f..0855508 100644
--- a/solr/core/src/test/org/apache/solr/metrics/rrd/SolrRrdBackendFactoryTest.java
+++ b/solr/core/src/test/org/apache/solr/metrics/rrd/SolrRrdBackendFactoryTest.java
@@ -42,20 +42,19 @@ import org.rrd4j.core.Sample;
  */
 public class SolrRrdBackendFactoryTest extends SolrTestCaseJ4 {
 
-  private static final String nodeName = "localhost:1234_solr";
   private SolrRrdBackendFactory factory;
   private MockSearchableSolrClient solrClient;
   private TimeSource timeSource;
 
   @Before
-  public void setup() throws Exception {
+  public void setup() {
     solrClient = new MockSearchableSolrClient();
     if (random().nextBoolean()) {
       timeSource = TimeSource.NANO_TIME;
     } else {
       timeSource = TimeSource.get("simTime:50");
     }
-    factory = new SolrRrdBackendFactory(nodeName, solrClient, CollectionAdminParams.SYSTEM_COLL, 1, timeSource);
+    factory = new SolrRrdBackendFactory(solrClient, CollectionAdminParams.SYSTEM_COLL, 1, timeSource);
   }
 
   @After
@@ -85,7 +84,7 @@ public class SolrRrdBackendFactoryTest extends SolrTestCaseJ4 {
     timeSource.sleep(2000);
     // there should be one sync data
     assertEquals(solrClient.docs.toString(), 1, solrClient.docs.size());
-    String id = SolrRrdBackendFactory.ID_PREFIX + SolrRrdBackendFactory.ID_SEP + nodeName + SolrRrdBackendFactory.ID_SEP + "foo";
+    String id = SolrRrdBackendFactory.ID_PREFIX + SolrRrdBackendFactory.ID_SEP + "foo";
     SolrInputDocument doc = solrClient.docs.get(CollectionAdminParams.SYSTEM_COLL).get(id);
     long timestamp = ((Date)doc.getFieldValue("timestamp")).getTime();
     timeSource.sleep(2000);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fb1067e4/solr/core/src/test/org/apache/solr/util/MockSearchableSolrClient.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/util/MockSearchableSolrClient.java b/solr/core/src/test/org/apache/solr/util/MockSearchableSolrClient.java
index 11e58c3..4711b80 100644
--- a/solr/core/src/test/org/apache/solr/util/MockSearchableSolrClient.java
+++ b/solr/core/src/test/org/apache/solr/util/MockSearchableSolrClient.java
@@ -46,7 +46,7 @@ public class MockSearchableSolrClient extends SolrClient {
   }
 
   @Override
-  public NamedList<Object> request(SolrRequest request, String coll) throws SolrServerException, IOException {
+  public synchronized NamedList<Object> request(SolrRequest request, String coll) throws SolrServerException, IOException {
     if (coll == null) {
       coll = request.getParams().get("collection");
     }