You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2020/10/20 15:22:09 UTC

[lucene-solr] branch branch_8_6 updated: SOLR-14948: Autoscaling maxComputeOperations override causes exceptions.

This is an automated email from the ASF dual-hosted git repository.

ab pushed a commit to branch branch_8_6
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8_6 by this push:
     new d1c30a2  SOLR-14948: Autoscaling maxComputeOperations override causes exceptions.
d1c30a2 is described below

commit d1c30a271ebd2916f1788d1e32569b4580ff8e2b
Author: Andrzej Bialecki <ab...@apache.org>
AuthorDate: Tue Oct 20 15:23:03 2020 +0200

    SOLR-14948: Autoscaling maxComputeOperations override causes exceptions.
---
 solr/CHANGES.txt                                   |  3 +-
 .../solr/cloud/autoscaling/ComputePlanAction.java  | 61 ++++++++++++++++---
 .../cloud/autoscaling/ComputePlanActionTest.java   | 71 +++++++++++++++++-----
 3 files changed, 110 insertions(+), 25 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index be5fb9f..2f3f970 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -10,7 +10,8 @@ Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this r
 
 Bug Fixes
 ---------------------
-(No changes)
+
+* SOLR-14948: Autoscaling maxComputeOperations override causes exceptions. (ab)
 
 ==================  8.6.3 ==================
 
diff --git a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
index 33bf6b0..834c6e3 100644
--- a/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
+++ b/solr/core/src/java/org/apache/solr/cloud/autoscaling/ComputePlanAction.java
@@ -51,6 +51,8 @@ import static org.apache.solr.cloud.autoscaling.TriggerEvent.NODE_NAMES;
 public class ComputePlanAction extends TriggerActionBase {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
+  public static final String DIAGNOSTICS = "__compute_diag__";
+
   // accept all collections by default
   Predicate<String> collectionsPredicate = s -> true;
 
@@ -127,8 +129,12 @@ public class ComputePlanAction extends TriggerActionBase {
         int opCount = 0;
         int opLimit = maxOperations;
         if (requestedOperations > 0) {
+          log.debug("-- adjusting limit due to explicitly requested number of ops={}", requestedOperations);
           opLimit = requestedOperations;
         }
+        addDiagnostics(event, "maxOperations", maxOperations);
+        addDiagnostics(event, "requestedOperations", requestedOperations);
+        addDiagnostics(event, "opLimit", opLimit);
         do {
           // computing changes in large clusters may take a long time
           if (Thread.currentThread().isInterrupted()) {
@@ -154,6 +160,8 @@ public class ComputePlanAction extends TriggerActionBase {
             if (requestedOperations < 0) {
               //uncomment the following to log zero operations
 //              PolicyHelper.logState(cloudManager, initialSuggester);
+              log.debug("-- no more operations suggested, stopping after {} ops...", (opCount - 1));
+              addDiagnostics(event, "noSuggestionsStopAfter", (opCount - 1));
               break;
             } else {
               log.info("Computed plan empty, remained {} requested ops to try.", opCount - opLimit);
@@ -171,6 +179,10 @@ public class ComputePlanAction extends TriggerActionBase {
             operations.add(operation);
             return operations;
           });
+          if (opCount >= opLimit) {
+            log.debug("-- reached limit of maxOps={}, stopping.", opLimit);
+            addDiagnostics(event, "opLimitReached", true);
+          }
         } while (opCount < opLimit);
       } finally {
         releasePolicySession(sessionWrapper, session);
@@ -187,6 +199,14 @@ public class ComputePlanAction extends TriggerActionBase {
 
   }
 
+  private void addDiagnostics(TriggerEvent event, String key, Object value) {
+    if (log.isDebugEnabled()) {
+      Map<String, Object> diag = (Map<String, Object>) event.getProperties()
+          .computeIfAbsent(DIAGNOSTICS, n -> new HashMap<>());
+      diag.put(key, value);
+    }
+  }
+
   protected int getMaxNumOps(TriggerEvent event, AutoScalingConfig autoScalingConfig, ClusterState clusterState) {
     // estimate a maximum default limit that should be sufficient for most purposes:
     // number of nodes * total number of replicas * 3
@@ -203,14 +223,26 @@ public class ComputePlanAction extends TriggerActionBase {
       totalRF.addAndGet(rf * coll.getSlices().size());
     });
     int totalMax = clusterState.getLiveNodes().size() * totalRF.get() * 3;
-    int maxOp = (Integer) autoScalingConfig.getProperties().getOrDefault(AutoScalingParams.MAX_COMPUTE_OPERATIONS, totalMax);
+    addDiagnostics(event, "estimatedMaxOps", totalMax);
+    int maxOp = ((Number) autoScalingConfig.getProperties().getOrDefault(AutoScalingParams.MAX_COMPUTE_OPERATIONS, totalMax)).intValue();
     Object o = event.getProperty(AutoScalingParams.MAX_COMPUTE_OPERATIONS, maxOp);
-    try {
-      return Integer.parseInt(String.valueOf(o));
-    } catch (Exception e) {
-      log.warn("Invalid '{}' event property: {}, using default {}", AutoScalingParams.MAX_COMPUTE_OPERATIONS, o, maxOp);
-      return maxOp;
+    if (o != null) {
+      try {
+        maxOp = Integer.parseInt(String.valueOf(o));
+      } catch (Exception e) {
+        log.warn("Invalid '{}' event property: {}, using default {}", AutoScalingParams.MAX_COMPUTE_OPERATIONS, o, maxOp);
+      }
     }
+    if (maxOp < 0) {
+      // unlimited
+      maxOp = Integer.MAX_VALUE;
+    } else if (maxOp < 1) {
+      // try at least one operation
+      log.debug("-- estimated maxOp={}, resetting to 1...", maxOp);
+      maxOp = 1;
+    }
+    log.debug("-- estimated total max ops={}, effective maxOps={}", totalMax, maxOp);
+    return maxOp;
   }
 
   protected int getRequestedNumOps(TriggerEvent event) {
@@ -276,19 +308,27 @@ public class ComputePlanAction extends TriggerActionBase {
       case MOVEREPLICA:
         Suggester s = session.getSuggester(action)
                 .hint(Suggester.Hint.SRC_NODE, event.getProperty(NODE_NAMES));
-        if (applyCollectionHints(cloudManager, s) == 0) return NoneSuggester.get(session);
+        if (applyCollectionHints(cloudManager, s) == 0) {
+          addDiagnostics(event, "noRelevantCollections", true);
+          return NoneSuggester.get(session);
+        }
         return s;
       case DELETENODE:
         int start = (Integer)event.getProperty(START, 0);
         @SuppressWarnings({"unchecked"})
         List<String> srcNodes = (List<String>) event.getProperty(NODE_NAMES);
         if (srcNodes.isEmpty() || start >= srcNodes.size()) {
+          addDiagnostics(event, "noSourceNodes", true);
           return NoneSuggester.get(session);
         }
         String sourceNode = srcNodes.get(start);
         s = session.getSuggester(action)
                 .hint(Suggester.Hint.SRC_NODE, event.getProperty(NODE_NAMES));
-        if (applyCollectionHints(cloudManager, s) == 0) return NoneSuggester.get(session);
+        if (applyCollectionHints(cloudManager, s) == 0) {
+          log.debug("-- no relevant collections on {}, no operations computed.", srcNodes);
+          addDiagnostics(event, "noRelevantCollections", true);
+          return NoneSuggester.get(session);
+        }
         s.hint(Suggester.Hint.SRC_NODE, Collections.singletonList(sourceNode));
         event.getProperties().put(START, ++start);
         return s;
@@ -340,11 +380,16 @@ public class ComputePlanAction extends TriggerActionBase {
                             .forEach(collShards::add);
                   }
                 });
+        log.debug("-- NODE_ADDED: ADDREPLICA suggester configured with {} collection/shard hints.", collShards.size());
+        addDiagnostics(event, "relevantCollShard", collShards);
         suggester.hint(Suggester.Hint.COLL_SHARD, collShards);
         suggester.hint(Suggester.Hint.REPLICATYPE, replicaType);
         break;
       case MOVEREPLICA:
+        log.debug("-- NODE_ADDED event specified MOVEREPLICA - no hints added.");
+        break;
       case NONE:
+        log.debug("-- NODE_ADDED event specified NONE - no operations suggested.");
         break;
       default:
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
diff --git a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ComputePlanActionTest.java b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ComputePlanActionTest.java
index eab52d5..617f64b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/autoscaling/ComputePlanActionTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/autoscaling/ComputePlanActionTest.java
@@ -582,7 +582,7 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     int numShards = 1;
     int numCollections = 5;
 
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections);
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections, null);
   }
 
   @Test
@@ -591,7 +591,7 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     int numShards = 1;
     int numCollections = 5;
 
-    nodeAddedTriggerWithAddReplicaPreferredOpReplicaType(collectionNamePrefix, numShards, numCollections);
+    nodeAddedTriggerWithAddReplicaPreferredOpReplicaType(collectionNamePrefix, numShards, numCollections, null);
   }
 
   @Test
@@ -601,9 +601,19 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     int numShards = 2;
     int numCollections = 5;
 
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections);
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections, null);
   }
-  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int numShards, int numCollections) throws Exception {
+
+  @Test
+  public void testNodeAddedTriggerWithAddReplicaPreferredOp_2Shard_OpLimit() throws Exception {
+    String collectionNamePrefix = "testNodeAddedTriggerWithAddReplicaPreferredOp_2Shard";
+    int numShards = 2;
+    int numCollections = 5;
+
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections, 1);
+  }
+
+  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int numShards, int numCollections, Integer maxOps) throws Exception {
     String setTriggerCommand = "{" +
         "'set-trigger' : {" +
         "'name' : 'node_added_trigger'," +
@@ -623,10 +633,10 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
         "    ]" +
         "}";
 
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections, setTriggerCommand, setClusterPolicyCommand);
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections, setTriggerCommand, setClusterPolicyCommand, maxOps);
   }
 
-  private void nodeAddedTriggerWithAddReplicaPreferredOpReplicaType(String collectionNamePrefix, int numShards, int numCollections) throws Exception {
+  private void nodeAddedTriggerWithAddReplicaPreferredOpReplicaType(String collectionNamePrefix, int numShards, int numCollections, Integer maxOps) throws Exception {
     String setTriggerCommand = "{" +
         "'set-trigger' : {" +
         "'name' : 'node_added_trigger'," +
@@ -647,13 +657,15 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
         "    ]" +
         "}";
 
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections, setTriggerCommand, setClusterPolicyCommand, 0, 1, 0);
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections, setTriggerCommand, setClusterPolicyCommand, maxOps, 0, 1, 0);
   }
 
-  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int numShards, int numCollections, String setTriggerCommand, String setClusterPolicyCommand) throws Exception {
-    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections, setTriggerCommand, setClusterPolicyCommand, 1, null, null);
+  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int numShards, int numCollections, String setTriggerCommand, String setClusterPolicyCommand, Integer maxOps) throws Exception {
+    nodeAddedTriggerWithAddReplicaPreferredOp(collectionNamePrefix, numShards, numCollections, setTriggerCommand, setClusterPolicyCommand, maxOps, 1, null, null);
   }
-  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int numShards, int numCollections, String setTriggerCommand, String setClusterPolicyCommand, Integer nNrtReplicas, Integer nTlogReplicas, Integer nPullReplicas) throws Exception {
+  private void nodeAddedTriggerWithAddReplicaPreferredOp(String collectionNamePrefix, int numShards, int numCollections, String setTriggerCommand, String setClusterPolicyCommand,
+                                                         Integer maxOps,
+                                                         Integer nNrtReplicas, Integer nTlogReplicas, Integer nPullReplicas) throws Exception {
     CloudSolrClient solrClient = cluster.getSolrClient();
     @SuppressWarnings({"rawtypes"})
     SolrRequest req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setTriggerCommand);
@@ -664,6 +676,16 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     response = solrClient.request(req);
     assertEquals(response.get("result").toString(), "success");
 
+    if (maxOps != null) {
+      String setMaxOpsCommand = "{" +
+          " 'set-properties': {" +
+          "   'maxComputeOperations': " + maxOps +
+          "  }" +
+          "}";
+      req = AutoScalingRequest.create(SolrRequest.METHOD.POST, setMaxOpsCommand);
+      response = solrClient.request(req);
+      assertEquals(response.get("result").toString(), "success");
+    }
 
     CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionNamePrefix + "_0",
         "conf", numShards, nNrtReplicas, nTlogReplicas, nPullReplicas).setMaxShardsPerNode(2);
@@ -682,7 +704,13 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     @SuppressWarnings({"rawtypes"})
     List operations = (List) actionContext.get("operations");
     assertNotNull(operations);
-    assertEquals(numShards, operations.size());
+    int numExpectedOps;
+    if (maxOps != null && maxOps > 0) {
+      numExpectedOps = maxOps;
+    } else {
+      numExpectedOps = numShards;
+    }
+    assertEquals(numExpectedOps, operations.size());
     Set<String> affectedShards = new HashSet<>(2);
     for (Object operation : operations) {
       assertTrue(operation instanceof CollectionAdminRequest.AddReplica);
@@ -691,7 +719,7 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
       assertEquals(collectionNamePrefix + "_0", addReplica.getCollection());
       affectedShards.add(addReplica.getShard());
     }
-    assertEquals(numShards, affectedShards.size());
+    assertEquals(numExpectedOps, affectedShards.size());
 
     for (int i = 1; i < numCollections; i++) {
       create = CollectionAdminRequest.createCollection(collectionNamePrefix + "_" + i,
@@ -711,7 +739,12 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
     actionContext = actionContextPropsRef.get();
     operations = (List) actionContext.get("operations");
     assertNotNull(operations);
-    assertEquals(numCollections * numShards, operations.size());
+    if (maxOps != null && maxOps > 0) {
+      numExpectedOps = maxOps;
+    } else {
+      numExpectedOps = numCollections * numShards;
+    }
+    assertEquals(numExpectedOps, operations.size());
     Set<String> affectedCollections = new HashSet<>(numCollections);
     affectedShards = new HashSet<>(numShards);
     Set<Pair<String, String>> affectedCollShards = new HashSet<>(numCollections * numShards);
@@ -723,9 +756,15 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
       affectedShards.add(addReplica.getShard());
       affectedCollShards.add(new Pair<>(addReplica.getCollection(), addReplica.getShard()));
     }
-    assertEquals(numCollections, affectedCollections.size());
-    assertEquals(numShards, affectedShards.size());
-    assertEquals(numCollections * numShards, affectedCollShards.size());
+    if (maxOps != null && maxOps > 0) {
+      assertEquals(numExpectedOps, affectedCollections.size());
+      assertEquals(numExpectedOps, affectedShards.size());
+      assertEquals(numExpectedOps, affectedCollShards.size());
+    } else {
+      assertEquals(numCollections, affectedCollections.size());
+      assertEquals(numShards, affectedShards.size());
+      assertEquals(numCollections * numShards, affectedCollShards.size());
+    }
   }
 
   @Test