You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by mc...@apache.org on 2019/04/01 17:38:12 UTC

[incubator-pinot] branch master updated: Add more logs to ServiceStatus (#4037)

This is an automated email from the ASF dual-hosted git repository.

mcvsubbu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new e032ebf  Add more logs to ServiceStatus (#4037)
e032ebf is described below

commit e032ebf92ec12b0cb242200fef306f54015e3444
Author: Subbu Subramaniam <mc...@users.noreply.github.com>
AuthorDate: Mon Apr 1 10:38:07 2019 -0700

    Add more logs to ServiceStatus (#4037)
    
    * Add more logs to ServiceStatus
    
    When we declare a service good due to its reaching the percentage threshold, it is useful to
    log information about at least one partition in the resources that are not started up
    correctly.
    
    Also included a max number of resources that we log so that we don't get extra long log lines
    
    * Addressed review comments
    
    * Fixed misleading log message
    
    It is possible that some resources converge between calls to get ServiceStatus after we
    return with a minimum threshold percentage of resources good. So, we need to be careful
    about not logging a misleading message about some resources not converging. We take care
    to log a small number of resources that may be off, so that we don't flood the logs.
---
 .../apache/pinot/common/utils/ServiceStatus.java   | 140 ++++++++++++++-------
 1 file changed, 94 insertions(+), 46 deletions(-)

diff --git a/pinot-common/src/main/java/org/apache/pinot/common/utils/ServiceStatus.java b/pinot-common/src/main/java/org/apache/pinot/common/utils/ServiceStatus.java
index f0ce6f8..3fc8d01 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/utils/ServiceStatus.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/utils/ServiceStatus.java
@@ -24,6 +24,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import org.apache.commons.math3.util.Pair;
 import org.apache.helix.HelixAdmin;
 import org.apache.helix.HelixDataAccessor;
 import org.apache.helix.HelixManager;
@@ -51,6 +52,10 @@ public class ServiceStatus {
   public static String STATUS_DESCRIPTION_NONE = "None";
   public static String STATUS_DESCRIPTION_INIT = "Init";
 
+  private static String _noHelixState = "Helix state does not exist";
+
+  private static int MAX_RESOURCE_NAMES_TO_LOG = 5;
+
   /**
    * Callback that returns the status of the service.
    */
@@ -171,7 +176,7 @@ public class ServiceStatus {
       _minResourcesStartCount = (int)Math.round(Math.ceil(minResourcesStartPercent * _numTotalResourcesToMonitor/100));
 
       LOGGER.info("Monitoring {} resources: {} for start up of instance {}", _numTotalResourcesToMonitor,
-          _resourcesToMonitor, _instanceName);
+          getResourceListAsString(), _instanceName);
     }
 
     public IdealStateMatchServiceStatusCallback(HelixManager helixManager, String clusterName, String instanceName,
@@ -186,7 +191,7 @@ public class ServiceStatus {
 
       _minResourcesStartCount = (int)Math.round(Math.ceil(minResourcesStartPercent * _numTotalResourcesToMonitor/100));
       LOGGER.info("Monitoring {} resources: {} for start up of instance {}", _numTotalResourcesToMonitor,
-          _resourcesToMonitor, _instanceName);
+          getResourceListAsString(), _instanceName);
     }
 
     protected abstract T getState(String resourceName);
@@ -233,64 +238,107 @@ public class ServiceStatus {
           _resourceIterator = _resourcesToMonitor.iterator();
         }
         resourceName = _resourceIterator.next();
-        IdealState idealState = getResourceIdealState(resourceName);
+        Pair<Status, String> statusPair = evaluateResourceStatus(resourceName);
 
-        // If the resource has been removed or disabled, ignore it
-        if (idealState == null || !idealState.isEnabled()) {
+        Status status = statusPair.getFirst();
+        if (status == Status.GOOD) {
+          // Resource is done starting up, remove it from the set
           _resourceIterator.remove();
-          continue;
-        }
+        } else {
+          _statusDescription = String
+              .format("%s, waitingFor=%s, resource=%s, numResourcesLeft=%d, numTotalResources=%d, minStartCount=%d,",
+                  statusPair.getSecond(), getMatchName(), resourceName, _resourcesToMonitor.size(),
+                  _numTotalResourcesToMonitor, _minResourcesStartCount);
 
-        String descriptionSuffix = String
-            .format("waitingFor=%s, resource=%s, numResourcesLeft=%d, numTotalResources=%d", getMatchName(),
-                resourceName, _resourcesToMonitor.size(), _numTotalResourcesToMonitor);
-        T helixState = getState(resourceName);
-        if (helixState == null) {
-          _statusDescription = "Helix state does not exist: " + descriptionSuffix;
-          return Status.STARTING;
+          return status;
         }
+      }
 
-        // Check that all partitions that are supposed to be in any state other than OFFLINE have the same status in the
-        // external view or went to ERROR state (which means that we tried to load the segments/resources but failed for
-        // some reason)
-        Map<String, String> partitionStateMap = getPartitionStateMap(helixState);
-        for (String partitionName : idealState.getPartitionSet()) {
-          String idealStateStatus = idealState.getInstanceStateMap(partitionName).get(_instanceName);
-
-          // Skip this partition if it is not assigned to this instance or if the instance should be offline
-          if (idealStateStatus == null || "OFFLINE".equals(idealStateStatus)) {
-            continue;
-          }
+      // At this point, one of the following conditions hold:
+      // 1. We entered the loop above, and all the remaining resources ended up in GOOOD state.
+      //    In that case _resourcesToMonitor would be empty.
+      // 2. We entered the loop above and cleared most of the remaining resources, but some small
+      //    number are still not converged. In that case, we exited the loop because we have met
+      //    the threshold of resources that need to be GOOD. We will then scan the remaining and
+      //    print some details of the ones that are remaining (upto a limit of MAX_RESOURCE_NAMES_TO_LOG)
+      //    and are still not in converged state. We walk through the remaining ones (and may clear
+      //    mores resources from _resourcesToMonitor that are GOOD state)
+      // 3. We did not execute the loop at all (the percentage threshold satisfied right away). We will do
+      //    the same action as for (2) above.
+      // In all three cases above, we need to return Status.GOOD
 
-          // If the instance state is not ERROR and is not the same as what's expected from the ideal state, then it
-          // hasn't finished starting up
-          String currentStateStatus = partitionStateMap.get(partitionName);
-          if (!idealStateStatus.equals(currentStateStatus)) {
-            if ("ERROR".equals(currentStateStatus)) {
-              LOGGER.error(String.format("Resource: %s, partition: %s is in ERROR state", resourceName, partitionName));
-            } else {
-              _statusDescription = String
-                  .format("partition=%s, expected=%s, found=%s, %s", partitionName,
-                      idealStateStatus, currentStateStatus, descriptionSuffix);
-              return Status.STARTING;
+      if (_resourcesToMonitor.isEmpty()) {
+        _statusDescription = STATUS_DESCRIPTION_NONE;
+        LOGGER.info("Instance {} has finished starting up", _instanceName);
+      } else {
+        int logCount = MAX_RESOURCE_NAMES_TO_LOG;
+        _resourceIterator = _resourcesToMonitor.iterator();
+        while (_resourceIterator.hasNext()) {
+          String resource = _resourceIterator.next();
+          Pair<Status, String> statusPair = evaluateResourceStatus(resource);
+          if (statusPair.getFirst() == Status.GOOD) {
+            _resourceIterator.remove();
+          } else {
+            LOGGER.info("Resource: {}, StatusDescription: {}", resource, statusPair.getSecond());
+            if (--logCount <= 0) {
+              break;
             }
           }
         }
+        _statusDescription = String.format("waitingFor=%s, numResourcesLeft=%d, numTotalResources=%d, minStartCount=%d,"
+                + " resourceList=%s", getMatchName(), _resourcesToMonitor.size(), _numTotalResourcesToMonitor,
+            _minResourcesStartCount, getResourceListAsString());
+        LOGGER.info("Instance {} returning GOOD because {}", _instanceName, _statusDescription);
+      }
+
+      return Status.GOOD;
+    }
 
-        // Resource is done starting up, remove it from the set
-        _resourceIterator.remove();
+    private Pair<Status, String> evaluateResourceStatus(String resourceName) {
+      IdealState idealState = getResourceIdealState(resourceName);
+      // If the resource has been removed or disabled, ignore it
+      if (idealState == null || !idealState.isEnabled()) {
+        return new Pair(Status.GOOD, STATUS_DESCRIPTION_NONE);
       }
 
-      if (_resourcesToMonitor.isEmpty()) {
-        _statusDescription = STATUS_DESCRIPTION_NONE;
-        LOGGER.info("Instance {} has finished starting up", _instanceName);
-      } else {
-        _statusDescription = String.format("waitingFor=%s, numResourcesLeft=%d, numTotalResources=%d, minStartCount=%d",
-            getMatchName(), _resourcesToMonitor.size(), _numTotalResourcesToMonitor, _minResourcesStartCount);
-        LOGGER.info("Instance {} returning GOOD because {}", _statusDescription);
+      T helixState = getState(resourceName);
+      if (helixState == null) {
+        return new Pair(Status.STARTING, _noHelixState);
       }
 
-      return Status.GOOD;
+      // Check that all partitions that are supposed to be in any state other than OFFLINE have the same status in the
+      // external view or went to ERROR state (which means that we tried to load the segments/resources but failed for
+      // some reason)
+      Map<String, String> partitionStateMap = getPartitionStateMap(helixState);
+      for (String partitionName : idealState.getPartitionSet()) {
+        String idealStateStatus = idealState.getInstanceStateMap(partitionName).get(_instanceName);
+
+        // Skip this partition if it is not assigned to this instance or if the instance should be offline
+        if (idealStateStatus == null || "OFFLINE".equals(idealStateStatus)) {
+          continue;
+        }
+
+        // If the instance state is not ERROR and is not the same as what's expected from the ideal state, then it
+        // hasn't finished starting up
+        String currentStateStatus = partitionStateMap.get(partitionName);
+        if (!idealStateStatus.equals(currentStateStatus)) {
+          if ("ERROR".equals(currentStateStatus)) {
+            LOGGER.error(String.format("Resource: %s, partition: %s is in ERROR state", resourceName, partitionName));
+          } else {
+            String description = String.format("partition=%s, expected=%s, found=%s", partitionName,
+                idealStateStatus, currentStateStatus);
+            return new Pair(Status.STARTING, description);
+          }
+        }
+      }
+      return new Pair(Status.GOOD, STATUS_DESCRIPTION_NONE);
+    }
+
+    private String getResourceListAsString() {
+      if (_resourcesToMonitor.size() <= MAX_RESOURCE_NAMES_TO_LOG) {
+        return _resourcesToMonitor.toString();
+      }
+      return "[" + _resourcesToMonitor.iterator().next() + ",...]";
     }
 
     @Override


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org