You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2018/10/18 21:18:00 UTC
hive git commit: HIVE-20752 : In case of LLAP start failure add info
how to find YARN logs (Miklos Gergely via Ashutosh Chauhan)
Repository: hive
Updated Branches:
refs/heads/master f89274ab4 -> a2bbf7925
HIVE-20752 : In case of LLAP start failure add info how to find YARN logs (Miklos Gergely via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/a2bbf792
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/a2bbf792
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/a2bbf792
Branch: refs/heads/master
Commit: a2bbf7925ea84f322ff336851203e656ad5afd7a
Parents: f89274a
Author: Miklos Gergely <mg...@hortonworks.com>
Authored: Thu Oct 18 14:17:22 2018 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Thu Oct 18 14:17:22 2018 -0700
----------------------------------------------------------------------
.../llap/cli/LlapStatusOptionsProcessor.java | 26 +-
.../hive/llap/cli/LlapStatusServiceDriver.java | 426 ++++++++-----------
.../llap/cli/TestLlapStatusServiceDriver.java | 98 +++++
3 files changed, 292 insertions(+), 258 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/a2bbf792/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java
index e88c819..dca0c7b 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusOptionsProcessor.java
@@ -18,7 +18,6 @@
package org.apache.hadoop.hive.llap.cli;
-import java.util.Arrays;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
@@ -29,23 +28,29 @@ import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
+import com.google.common.annotations.VisibleForTesting;
+
public class LlapStatusOptionsProcessor {
private static final String LLAPSTATUS_CONSTANT = "llapstatus";
- private static final long FIND_YARN_APP_TIMEOUT_MS = 20 * 1000l; // 20seconds to wait for app to be visible
-
- private static final long DEFAULT_STATUS_REFRESH_INTERVAL_MS = 1 * 1000l; // 1 seconds wait until subsequent status
- private static final long DEFAULT_WATCH_MODE_TIMEOUT_MS = 5 * 60 * 1000l; // 5 minutes timeout for watch mode
- private static final float DEFAULT_RUNNING_NODES_THRESHOLD = 1.0f;
+ @VisibleForTesting
+ public static final long FIND_YARN_APP_TIMEOUT_MS = 20 * 1000l; // 20 seconds to wait for app to be visible
+ @VisibleForTesting
+ public static final long DEFAULT_STATUS_REFRESH_INTERVAL_MS = 1 * 1000l; // wait 1 second before each subsequent status refresh
+ @VisibleForTesting
+ public static final long DEFAULT_WATCH_MODE_TIMEOUT_MS = 5 * 60 * 1000l; // 5 minutes timeout for watch mode
+ @VisibleForTesting
+ public static final float DEFAULT_RUNNING_NODES_THRESHOLD = 1.0f;
// TODO: why doesn't this use one of the existing options implementations?!
enum OptionConstants {
NAME("name", 'n', "LLAP cluster name", true),
FIND_APP_TIMEOUT("findAppTimeout", 'f',
- "Amount of time(s) that the tool will sleep to wait for the YARN application to start. negative values=wait forever, 0=Do not wait. default=" +
- TimeUnit.SECONDS.convert(FIND_YARN_APP_TIMEOUT_MS, TimeUnit.MILLISECONDS) + "s", true),
+ "Amount of time(s) that the tool will sleep to wait for the YARN application to start. negative values=wait " +
+ "forever, 0=Do not wait. default=" + TimeUnit.SECONDS.convert(FIND_YARN_APP_TIMEOUT_MS, TimeUnit.MILLISECONDS) +
+ "s", true),
OUTPUT_FILE("outputFile", 'o', "File to which output should be written (Default stdout)", true),
WATCH_MODE("watch", 'w', "Watch mode waits until all LLAP daemons are running or subset of the nodes are " +
"running (threshold can be specified via -r option) (Default wait until all nodes are running)", false),
@@ -59,7 +64,8 @@ public class LlapStatusOptionsProcessor {
" in watch mode. Valid only for watch mode. (Default " +
TimeUnit.SECONDS.convert(DEFAULT_STATUS_REFRESH_INTERVAL_MS, TimeUnit.MILLISECONDS) + "s)", true),
WATCH_MODE_TIMEOUT("watchTimeout", 't', "Exit watch mode if the desired state is not attained until the specified" +
- " timeout. (Default " + TimeUnit.SECONDS.convert(DEFAULT_WATCH_MODE_TIMEOUT_MS, TimeUnit.MILLISECONDS) +"s)", true),
+ " timeout. (Default " + TimeUnit.SECONDS.convert(DEFAULT_WATCH_MODE_TIMEOUT_MS, TimeUnit.MILLISECONDS) +"s)",
+ true),
HIVECONF("hiveconf", null, "Use value for given property. Overridden by explicit parameters", "property=value", 2),
HELP("help", 'H', "Print help information", false);
@@ -238,7 +244,7 @@ public class LlapStatusOptionsProcessor {
}
watchTimeoutMs = TimeUnit.MILLISECONDS.convert(watchTimeoutSec, TimeUnit.SECONDS);
}
-
+
boolean isLaunched = !commandLine.hasOption(OptionConstants.NOT_LAUNCHED.getLongOpt());
float runningNodesThreshold = DEFAULT_RUNNING_NODES_THRESHOLD;
http://git-wip-us.apache.org/repos/asf/hive/blob/a2bbf792/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
index e0ada45..a521799 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
@@ -64,13 +64,13 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class LlapStatusServiceDriver {
+ private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class);
+ private static final Logger CONSOLE_LOGGER = LoggerFactory.getLogger("LlapStatusServiceDriverConsole");
private static final EnumSet<State> NO_YARN_SERVICE_INFO_STATES = EnumSet.of(
State.APP_NOT_FOUND, State.COMPLETE, State.LAUNCHING);
private static final EnumSet<State> LAUNCHING_STATES = EnumSet.of(
State.LAUNCHING, State.RUNNING_PARTIAL, State.RUNNING_ALL);
- private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class);
- private static final Logger CONSOLE_LOGGER = LoggerFactory.getLogger("LlapStatusServiceDriverConsole");
// Defining a bunch of configs here instead of in HiveConf. These are experimental, and mainly
// for use when retry handling is fixed in Yarn/Hadoop
@@ -80,18 +80,15 @@ public class LlapStatusServiceDriver {
// The following two keys should ideally be used to control RM connect timeouts. However,
// they don't seem to work. The IPC timeout needs to be set instead.
@InterfaceAudience.Private
- private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS =
- CONF_PREFIX + "yarn.rm.connect.max-wait-ms";
+ private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS = CONF_PREFIX + "yarn.rm.connect.max-wait-ms";
private static final long CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT = 10000l;
@InterfaceAudience.Private
- private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS =
- CONF_PREFIX + "yarn.rm.connect.retry-interval.ms";
+ private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS = CONF_PREFIX + "yarn.rm.connect.retry-interval.ms";
private static final long CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT = 5000l;
// As of Hadoop 2.7 - this is what controls the RM timeout.
@InterfaceAudience.Private
- private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES =
- CONF_PREFIX + "ipc.client.max-retries";
+ private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES = CONF_PREFIX + "ipc.client.max-retries";
private static final int CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT = 2;
@InterfaceAudience.Private
private static final String CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS =
@@ -107,22 +104,20 @@ public class LlapStatusServiceDriver {
private static final String
CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT = "2000, 1";
- private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS =
- CONF_PREFIX + "zk-registry.timeout-ms";
+ private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS = CONF_PREFIX + "zk-registry.timeout-ms";
private static final long CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT = 20000l;
private static final long LOG_SUMMARY_INTERVAL = 15000L; // Log summary every ~15 seconds.
-
private static final String LLAP_KEY = "llap";
+
private final Configuration conf;
- private final Clock clock = new SystemClock();
private String appName = null;
+ private String applicationId = null;
private ServiceClient serviceClient = null;
private Configuration llapRegistryConf = null;
private LlapRegistryService llapRegistry = null;
- @VisibleForTesting
- AppStatusBuilder appStatusBuilder;
+ private AppStatusBuilder appStatusBuilder;
public LlapStatusServiceDriver() {
SessionState ss = SessionState.get();
@@ -145,21 +140,17 @@ public class LlapStatusServiceDriver {
CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT));
conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
- conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS,
- CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT));
+ conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS, CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT));
conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
- conf.getLong(CONFIG_YARN_RM_RETRY_INTERVAL_MS, CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT));
+ conf.getLong(CONFIG_YARN_RM_RETRY_INTERVAL_MS, CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT));
conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY,
- conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES,
- CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT));
+ conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES, CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT));
conf.setLong(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_RETRY_INTERVAL_KEY,
- conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS,
- CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT));
+ conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS, CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT));
- HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, (conf
- .getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS, CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT) +
- "ms"));
+ HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, (
+ conf.getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS, CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT) + "ms"));
llapRegistryConf = new Configuration(conf);
}
@@ -167,11 +158,10 @@ public class LlapStatusServiceDriver {
/**
* Parse command line options.
*
- * @param args
* @return command line options.
*/
- public LlapStatusOptions parseOptions(String[] args) throws
- LlapStatusCliException {
+ @VisibleForTesting
+ public LlapStatusOptions parseOptions(String[] args) throws LlapStatusCliException {
LlapStatusOptionsProcessor optionsProcessor = new LlapStatusOptionsProcessor();
LlapStatusOptions options;
@@ -211,9 +201,7 @@ public class LlapStatusServiceDriver {
LOG.info(message);
return ExitCode.INCORRECT_USAGE.getInt();
}
- if (LOG.isDebugEnabled()) {
- LOG.debug("Using appName: {}", appName);
- }
+ LOG.debug("Using appName: {}", appName);
llapRegistryConf.set(HiveConf.ConfVars.LLAP_DAEMON_SERVICE_HOSTS.varname, "@" + appName);
}
@@ -224,8 +212,7 @@ public class LlapStatusServiceDriver {
}
} catch (Exception e) {
LlapStatusCliException le = new LlapStatusCliException(
- LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED,
- "Failed to create service client", e);
+ LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_CREATE_FAILED, "Failed to create service client", e);
logError(le);
return le.getExitCode().getInt();
}
@@ -233,8 +220,7 @@ public class LlapStatusServiceDriver {
// Get the App report from YARN
ApplicationReport appReport;
try {
- appReport = LlapSliderUtils.getAppReport(appName, serviceClient,
- options.getFindAppTimeoutMs());
+ appReport = LlapSliderUtils.getAppReport(appName, serviceClient, options.getFindAppTimeoutMs());
} catch (LlapStatusCliException e) {
logError(e);
return e.getExitCode().getInt();
@@ -256,11 +242,9 @@ public class LlapStatusServiceDriver {
} else {
// Get information from YARN Service
try {
- ret = populateAppStatusFromServiceStatus(appName, serviceClient,
- appStatusBuilder);
+ ret = populateAppStatusFromServiceStatus(appName, serviceClient, appStatusBuilder);
} catch (LlapStatusCliException e) {
- // In case of failure, send back whatever is constructed so far -
- // which would be from the AppReport
+ // In case of failure, send back whatever is constructed so far - which would be from the AppReport
logError(e);
return e.getExitCode().getInt();
}
@@ -279,14 +263,11 @@ public class LlapStatusServiceDriver {
return ret.getInt();
} finally {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Final AppState: " + appStatusBuilder.toString());
- }
+ LOG.debug("Final AppState: " + appStatusBuilder.toString());
}
}
- public void outputJson(PrintWriter writer) throws
- LlapStatusCliException {
+ public void outputJson(PrintWriter writer) throws LlapStatusCliException {
ObjectMapper mapper = new ObjectMapper();
mapper.configure(SerializationConfig.Feature.FAIL_ON_EMPTY_BEANS, false);
mapper.setSerializationInclusion(JsonSerialize.Inclusion.NON_NULL);
@@ -295,28 +276,26 @@ public class LlapStatusServiceDriver {
writer.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(appStatusBuilder));
} catch (IOException e) {
LOG.warn("Failed to create JSON", e);
- throw new LlapStatusCliException(ExitCode.LLAP_JSON_GENERATION_ERROR, "Failed to create JSON",
- e);
+ throw new LlapStatusCliException(ExitCode.LLAP_JSON_GENERATION_ERROR, "Failed to create JSON", e);
}
}
/**
* Populates parts of the AppStatus
*
- * @param appReport
- * @param appStatusBuilder
* @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible
* @throws LlapStatusCliException
*/
- private ExitCode processAppReport(ApplicationReport appReport,
- AppStatusBuilder appStatusBuilder) throws
- LlapStatusCliException {
+ private ExitCode processAppReport(ApplicationReport appReport, AppStatusBuilder appStatusBuilder)
+ throws LlapStatusCliException {
if (appReport == null) {
appStatusBuilder.setState(State.APP_NOT_FOUND);
LOG.info("No Application Found");
return ExitCode.SUCCESS;
}
+ applicationId = appReport.getApplicationId().toString();
+
// TODO Maybe add the YARN URL for the app.
appStatusBuilder.setAmInfo(
new LlapStatusHelpers.AmInfo().setAppName(appReport.getName()).setAppType(appReport.getApplicationType()));
@@ -328,17 +307,17 @@ public class LlapStatusServiceDriver {
appStatusBuilder.setState(State.LAUNCHING);
return ExitCode.SUCCESS;
case ACCEPTED:
- appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString());
+ appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId);
appStatusBuilder.setState(State.LAUNCHING);
return ExitCode.SUCCESS;
case RUNNING:
- appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString());
+ appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId);
// If the app state is running, get additional information from YARN Service
return ExitCode.SUCCESS;
case FINISHED:
case FAILED:
case KILLED:
- appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString());
+ appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(applicationId);
appStatusBuilder.setAppFinishTime(appReport.getFinishTime());
appStatusBuilder.setState(State.COMPLETE);
// add log links and other diagnostics from YARN Service
@@ -352,33 +331,23 @@ public class LlapStatusServiceDriver {
/**
* Populates information from YARN Service Status.
*
- * @param appName
- * @param serviceClient
- * @param appStatusBuilder
- * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future
- * progress not possible
+ * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible
* @throws LlapStatusCliException
*/
- private ExitCode populateAppStatusFromServiceStatus(String appName,
- ServiceClient serviceClient, AppStatusBuilder appStatusBuilder)
- throws LlapStatusCliException {
+ private ExitCode populateAppStatusFromServiceStatus(String appName, ServiceClient serviceClient,
+ AppStatusBuilder appStatusBuilder) throws LlapStatusCliException {
ExitCode exitCode = ExitCode.YARN_ERROR;
try {
Service service = serviceClient.getStatus(appName);
if (service != null) {
// How to get config paths and AmInfo
ServiceState state = service.getState();
- appStatusBuilder.setAppStartTime(service.getLaunchTime() == null ? 0
- : service.getLaunchTime().getTime());
- appStatusBuilder.setDesiredInstances(
- service.getComponent(LLAP_KEY).getNumberOfContainers() == null ? 0
- : service.getComponent(LLAP_KEY).getNumberOfContainers()
- .intValue());
- appStatusBuilder.setLiveInstances(
- service.getComponent(LLAP_KEY).getContainers().size());
+ appStatusBuilder.setAppStartTime(service.getLaunchTime() == null ? 0 : service.getLaunchTime().getTime());
+ appStatusBuilder.setDesiredInstances(service.getComponent(LLAP_KEY).getNumberOfContainers() == null ? 0
+ : service.getComponent(LLAP_KEY).getNumberOfContainers().intValue());
+ appStatusBuilder.setLiveInstances(service.getComponent(LLAP_KEY).getContainers().size());
for (Container cont : service.getComponent(LLAP_KEY).getContainers()) {
- LlapInstance llapInstance = new LlapInstance(cont.getHostname(),
- cont.getId());
+ LlapInstance llapInstance = new LlapInstance(cont.getHostname(), cont.getId());
appStatusBuilder.addNewRunningLlapInstance(llapInstance);
}
if (state == ServiceState.STABLE) {
@@ -389,8 +358,7 @@ public class LlapStatusServiceDriver {
}
} catch (IOException | YarnException e) {
LlapStatusCliException le = new LlapStatusCliException(
- LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_OTHER,
- "Failed to get service status", e);
+ LlapStatusServiceDriver.ExitCode.SERVICE_CLIENT_ERROR_OTHER, "Failed to get service status", e);
logError(le);
exitCode = le.getExitCode();
}
@@ -400,13 +368,11 @@ public class LlapStatusServiceDriver {
/**
* Populate additional information for containers from the LLAP registry. Must be invoked
* after YARN Service status and diagnostics.
- * @param appStatusBuilder
* @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible
* @throws LlapStatusCliException
*/
- private ExitCode populateAppStatusFromLlapRegistry(
- AppStatusBuilder appStatusBuilder, long watchTimeoutMs) throws
- LlapStatusCliException {
+ private ExitCode populateAppStatusFromLlapRegistry(AppStatusBuilder appStatusBuilder, long watchTimeoutMs)
+ throws LlapStatusCliException {
if (llapRegistry == null) {
try {
@@ -425,9 +391,7 @@ public class LlapStatusServiceDriver {
}
if (serviceInstances == null || serviceInstances.isEmpty()) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("No information found in the LLAP registry");
- }
+ LOG.debug("No information found in the LLAP registry");
appStatusBuilder.setLiveInstances(0);
appStatusBuilder.setState(State.LAUNCHING);
appStatusBuilder.clearRunningLlapInstances();
@@ -461,8 +425,8 @@ public class LlapStatusServiceDriver {
appStatusBuilder.setLiveInstances(validatedInstances.size());
appStatusBuilder.setLaunchingInstances(llapExtraInstances.size());
- if (appStatusBuilder.getDesiredInstances() != null && validatedInstances
- .size() >= appStatusBuilder.getDesiredInstances()) {
+ if (appStatusBuilder.getDesiredInstances() != null &&
+ validatedInstances.size() >= appStatusBuilder.getDesiredInstances()) {
appStatusBuilder.setState(State.RUNNING_ALL);
if (validatedInstances.size() > appStatusBuilder.getDesiredInstances()) {
LOG.warn("Found more entries in LLAP registry, as compared to desired entries");
@@ -493,107 +457,13 @@ public class LlapStatusServiceDriver {
return ExitCode.SUCCESS;
}
- private static String constructCompletedContainerDiagnostics(List<LlapInstance> completedInstances) {
- StringBuilder sb = new StringBuilder();
- if (completedInstances == null || completedInstances.size() == 0) {
- return "";
- } else {
- // TODO HIVE-15865 Ideally sort these by completion time, once that is available.
- boolean isFirst = true;
- for (LlapInstance instance : completedInstances) {
- if (!isFirst) {
- sb.append("\n");
- } else {
- isFirst = false;
- }
-
- if (instance.getYarnContainerExitStatus() ==
- ContainerExitStatus.KILLED_EXCEEDED_PMEM ||
- instance.getYarnContainerExitStatus() ==
- ContainerExitStatus.KILLED_EXCEEDED_VMEM) {
- sb.append("\tKILLED container (by YARN for exceeding memory limits): ");
- } else {
- // TODO HIVE-15865 Handle additional reasons like OS launch failed
- sb.append("\tFAILED container: ");
- }
- sb.append(" ").append(instance.getContainerId());
- sb.append(", Logs at: ").append(instance.getLogUrl());
- }
- }
- return sb.toString();
- }
-
- /**
- * Helper method to construct a diagnostic message from a complete
- * AppStatusBuilder.
- *
- * @return
- */
- private static String constructDiagnostics(
- AppStatusBuilder appStatusBuilder) {
- StringBuilder sb = new StringBuilder();
-
- switch (appStatusBuilder.getState()) {
- case APP_NOT_FOUND:
- sb.append("LLAP status unknown. Awaiting app launch");
- break;
- case LAUNCHING:
- // This is a catch all state - when containers have not started yet, or LLAP has not started yet.
- if (StringUtils.isNotBlank(appStatusBuilder.getAmInfo().getAppId())) {
- sb.append("LLAP Starting up with AppId=")
- .append(appStatusBuilder.getAmInfo().getAppId()).append(".");
- if (appStatusBuilder.getDesiredInstances() != null) {
- sb.append(" Started 0/").append(appStatusBuilder.getDesiredInstances()).append(" instances");
- }
-
- String containerDiagnostics = constructCompletedContainerDiagnostics(
- appStatusBuilder.getCompletedInstances());
- if (StringUtils.isNotEmpty(containerDiagnostics)) {
- sb.append("\n").append(containerDiagnostics);
- }
- } else {
- sb.append("Awaiting LLAP startup");
- }
- break;
- case RUNNING_PARTIAL:
- sb.append("LLAP Starting up with ApplicationId=")
- .append(appStatusBuilder.getAmInfo().getAppId());
- sb.append(" Started").append(appStatusBuilder.getLiveInstances())
- .append("/").append(appStatusBuilder.getDesiredInstances())
- .append(" instances");
- String containerDiagnostics = constructCompletedContainerDiagnostics(
- appStatusBuilder.getCompletedInstances());
- if (StringUtils.isNotEmpty(containerDiagnostics)) {
- sb.append("\n").append(containerDiagnostics);
- }
-
- // TODO HIVE-15865: Include information about pending requests, and last
- // allocation time once YARN Service provides this information.
- break;
- case RUNNING_ALL:
- sb.append("LLAP Application running with ApplicationId=")
- .append(appStatusBuilder.getAmInfo().getAppId());
- break;
- case COMPLETE:
-
- sb.append("LLAP Application already complete. ApplicationId=")
- .append(appStatusBuilder.getAmInfo().getAppId());
- containerDiagnostics = constructCompletedContainerDiagnostics(
- appStatusBuilder.getCompletedInstances());
- if (StringUtils.isNotEmpty(containerDiagnostics)) {
- sb.append("\n").append(containerDiagnostics);
- }
-
- break;
- case UNKNOWN:
- sb.append("LLAP status unknown");
- break;
+ private void close() {
+ if (serviceClient != null) {
+ serviceClient.stop();
}
- if (StringUtils.isNotBlank(appStatusBuilder.getDiagnostics())) {
- sb.append("\n").append(appStatusBuilder.getDiagnostics());
+ if (llapRegistry != null) {
+ llapRegistry.stop();
}
-
- return sb.toString();
}
public enum ExitCode {
@@ -618,18 +488,16 @@ public class LlapStatusServiceDriver {
}
}
-
public static class LlapStatusCliException extends Exception {
final LlapStatusServiceDriver.ExitCode exitCode;
-
public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message) {
super(exitCode.getInt() +": " + message);
this.exitCode = exitCode;
}
public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message, Throwable cause) {
- super(message, cause);
+ super(exitCode.getInt() +": " + message, cause);
this.exitCode = exitCode;
}
@@ -638,18 +506,10 @@ public class LlapStatusServiceDriver {
}
}
-
- private static void logError(Throwable t) {
- LOG.error("FAILED: " + t.getMessage(), t);
- System.err.println("FAILED: " + t.getMessage());
- }
-
-
public static void main(String[] args) {
LOG.info("LLAP status invoked with arguments = {}", Arrays.toString(args));
int ret = ExitCode.SUCCESS.getInt();
Clock clock = SystemClock.getInstance();
- long startTime = clock.getTime();
long lastSummaryLogTime = -1;
LlapStatusServiceDriver statusServiceDriver = null;
@@ -661,8 +521,7 @@ public class LlapStatusServiceDriver {
statusServiceDriver.close();
logError(t);
if (t instanceof LlapStatusCliException) {
- LlapStatusCliException
- ce = (LlapStatusCliException) t;
+ LlapStatusCliException ce = (LlapStatusCliException) t;
ret = ce.getExitCode().getInt();
} else {
ret = ExitCode.INTERNAL_ERROR.getInt();
@@ -691,9 +550,9 @@ public class LlapStatusServiceDriver {
LOG.info("Configured refresh interval: {}s. Watch timeout: {}s. Attempts remaining: {}." +
" Watch mode: {}. Running nodes threshold: {}.",
- TimeUnit.SECONDS.convert(refreshInterval, TimeUnit.MILLISECONDS),
- TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS),
- numAttempts, watchMode, new DecimalFormat("#.###").format(runningNodesThreshold));
+ TimeUnit.SECONDS.convert(refreshInterval, TimeUnit.MILLISECONDS),
+ TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS),
+ numAttempts, watchMode, new DecimalFormat("#.###").format(runningNodesThreshold));
while (numAttempts > 0) {
if (!firstAttempt) {
if (watchMode) {
@@ -712,9 +571,8 @@ public class LlapStatusServiceDriver {
ret = statusServiceDriver.run(options, watchMode ? watchTimeout : 0);
currentState = statusServiceDriver.appStatusBuilder.getState();
try {
- lastSummaryLogTime = LlapStatusServiceDriver
- .maybeLogSummary(clock, lastSummaryLogTime, statusServiceDriver,
- watchMode, watchTimeout, launchingState);
+ lastSummaryLogTime = LlapStatusServiceDriver.maybeLogSummary(clock, lastSummaryLogTime,
+ statusServiceDriver, watchMode, watchTimeout, launchingState);
} catch (Exception e) {
LOG.warn("Failed to log summary", e);
}
@@ -731,8 +589,7 @@ public class LlapStatusServiceDriver {
if (currentState.equals(State.COMPLETE)) {
if (launchingState != null || options.isLaunched()) {
LOG.warn("COMPLETE state reached while waiting for RUNNING state. Failing.");
- System.err.println("Final diagnostics: " +
- statusServiceDriver.appStatusBuilder.getDiagnostics());
+ System.err.println("Final diagnostics: " + statusServiceDriver.appStatusBuilder.getDiagnostics());
break;
} else {
LOG.info("Found a stopped application; assuming it was a previous attempt "
@@ -740,19 +597,15 @@ public class LlapStatusServiceDriver {
}
}
- if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals(
- State.RUNNING_ALL))) {
- if (LOG.isDebugEnabled()) {
- LOG.debug(
- "Current state: {}. Desired state: {}. {}/{} instances.",
- currentState,
- runningNodesThreshold == 1.0f ?
- State.RUNNING_ALL :
- State.RUNNING_PARTIAL,
- statusServiceDriver.appStatusBuilder.getLiveInstances(),
- statusServiceDriver.appStatusBuilder
- .getDesiredInstances());
- }
+ if (!(currentState.equals(State.RUNNING_PARTIAL) || currentState.equals(State.RUNNING_ALL))) {
+ LOG.debug(
+ "Current state: {}. Desired state: {}. {}/{} instances.",
+ currentState,
+ runningNodesThreshold == 1.0f ?
+ State.RUNNING_ALL :
+ State.RUNNING_PARTIAL,
+ statusServiceDriver.appStatusBuilder.getLiveInstances(),
+ statusServiceDriver.appStatusBuilder.getDesiredInstances());
numAttempts--;
continue;
}
@@ -763,17 +616,13 @@ public class LlapStatusServiceDriver {
if (desiredInstances > 0) {
final float ratio = (float) liveInstances / (float) desiredInstances;
if (ratio < runningNodesThreshold) {
- if (LOG.isDebugEnabled()) {
- LOG.debug(
- "Waiting until running nodes threshold is reached. Current: {} Desired: {}." +
- " {}/{} instances.",
- new DecimalFormat("#.###").format(ratio),
- new DecimalFormat("#.###")
- .format(runningNodesThreshold),
- statusServiceDriver.appStatusBuilder.getLiveInstances(),
- statusServiceDriver.appStatusBuilder
- .getDesiredInstances());
- }
+ LOG.debug(
+ "Waiting until running nodes threshold is reached. Current: {} Desired: {}." +
+ " {}/{} instances.",
+ new DecimalFormat("#.###").format(ratio),
+ new DecimalFormat("#.###").format(runningNodesThreshold),
+ statusServiceDriver.appStatusBuilder.getLiveInstances(),
+ statusServiceDriver.appStatusBuilder.getDesiredInstances());
numAttempts--;
continue;
} else {
@@ -805,9 +654,7 @@ public class LlapStatusServiceDriver {
break;
}
// Log final state to CONSOLE_LOGGER
- LlapStatusServiceDriver
- .maybeLogSummary(clock, 0L, statusServiceDriver,
- watchMode, watchTimeout, launchingState);
+ LlapStatusServiceDriver.maybeLogSummary(clock, 0L, statusServiceDriver, watchMode, watchTimeout, launchingState);
CONSOLE_LOGGER.info("\n\n\n");
// print current state before exiting
statusServiceDriver.outputJson(pw);
@@ -815,35 +662,35 @@ public class LlapStatusServiceDriver {
pw.flush();
if (numAttempts == 0 && watchMode && !desiredStateAttained) {
LOG.warn("Watch timeout {}s exhausted before desired state RUNNING is attained.",
- TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS));
+ TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS));
}
} catch (Throwable t) {
logError(t);
if (t instanceof LlapStatusCliException) {
- LlapStatusCliException
- ce = (LlapStatusCliException) t;
+ LlapStatusCliException ce = (LlapStatusCliException) t;
ret = ce.getExitCode().getInt();
} else {
ret = ExitCode.INTERNAL_ERROR.getInt();
}
} finally {
LOG.info("LLAP status finished");
+ if (ret != ExitCode.SUCCESS.exitCode) {
+ LOG.error("LLAP did not start. Check the application log for more info:\n" +
+ "\tyarn logs --applicationId {} -out <path>", statusServiceDriver.applicationId);
+ }
statusServiceDriver.close();
}
- if (LOG.isDebugEnabled()) {
- LOG.debug("Completed processing - exiting with " + ret);
- }
+ LOG.debug("Completed processing - exiting with " + ret);
+
System.exit(ret);
}
- private static long maybeLogSummary(Clock clock, long lastSummaryLogTime,
- LlapStatusServiceDriver statusServiceDriver,
+ private static long maybeLogSummary(Clock clock, long lastSummaryLogTime, LlapStatusServiceDriver statusServiceDriver,
boolean watchMode, long watchTimeout, LlapStatusHelpers.State launchingState) {
long currentTime = clock.getTime();
if (lastSummaryLogTime < currentTime - LOG_SUMMARY_INTERVAL) {
String diagString = null;
- if (launchingState == null && statusServiceDriver.appStatusBuilder.getState() ==
- State.COMPLETE && watchMode) {
+ if (launchingState == null && statusServiceDriver.appStatusBuilder.getState() == State.COMPLETE && watchMode) {
// First known state was COMPLETED. Wait for the app launch to start.
diagString = "Awaiting LLAP launch";
// Clear completed instances in this case. Don't want to provide information from the previous run.
@@ -859,23 +706,106 @@ public class LlapStatusServiceDriver {
} else {
CONSOLE_LOGGER.info("\nLLAPSTATUS");
}
- CONSOLE_LOGGER.info(
- "--------------------------------------------------------------------------------");
+ CONSOLE_LOGGER.info("--------------------------------------------------------------------------------");
}
CONSOLE_LOGGER.info(diagString);
- CONSOLE_LOGGER.info(
- "--------------------------------------------------------------------------------");
+ CONSOLE_LOGGER.info("--------------------------------------------------------------------------------");
lastSummaryLogTime = currentTime;
}
return lastSummaryLogTime;
}
- private void close() {
- if (serviceClient != null) {
- serviceClient.stop();
+  /**
+   * Builds the human-readable diagnostic message for the state held by a complete AppStatusBuilder.
+   *
+   * @param appStatusBuilder status to describe; its state selects the headline of the message
+   * @return the headline, then on separate lines any completed-container details and the builder's diagnostics
+   */
+  private static String constructDiagnostics(AppStatusBuilder appStatusBuilder) {
+    StringBuilder sb = new StringBuilder();
+
+    switch (appStatusBuilder.getState()) {
+      case APP_NOT_FOUND:
+        sb.append("LLAP status unknown. Awaiting app launch");
+        break;
+      case LAUNCHING:
+        // This is a catch all state - when containers have not started yet, or LLAP has not started yet.
+        if (StringUtils.isNotBlank(appStatusBuilder.getAmInfo().getAppId())) {
+          sb.append("LLAP Starting up with AppId=").append(appStatusBuilder.getAmInfo().getAppId()).append(".");
+          if (appStatusBuilder.getDesiredInstances() != null) {
+            sb.append(" Started 0/").append(appStatusBuilder.getDesiredInstances()).append(" instances");
+          }
+          String launchDiagnostics = constructCompletedContainerDiagnostics(appStatusBuilder.getCompletedInstances());
+          if (StringUtils.isNotEmpty(launchDiagnostics)) {
+            sb.append("\n").append(launchDiagnostics);
+          }
+        } else {
+          sb.append("Awaiting LLAP startup");
+        }
+        break;
+      case RUNNING_PARTIAL:
+        sb.append("LLAP Starting up with ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId());
+        // Keep a space between "Started" and the count ("Started 2/5"), as the LAUNCHING message does.
+        sb.append(" Started ").append(appStatusBuilder.getLiveInstances()).append("/")
+            .append(appStatusBuilder.getDesiredInstances()).append(" instances");
+        String partialDiagnostics = constructCompletedContainerDiagnostics(appStatusBuilder.getCompletedInstances());
+        if (StringUtils.isNotEmpty(partialDiagnostics)) {
+          sb.append("\n").append(partialDiagnostics);
+        }
+        // TODO HIVE-15865: Include information about pending requests, and last
+        // allocation time once YARN Service provides this information.
+        break;
+      case RUNNING_ALL:
+        sb.append("LLAP Application running with ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId());
+        break;
+      case COMPLETE:
+        sb.append("LLAP Application already complete. ApplicationId=").append(appStatusBuilder.getAmInfo().getAppId());
+        String completeDiagnostics = constructCompletedContainerDiagnostics(appStatusBuilder.getCompletedInstances());
+        if (StringUtils.isNotEmpty(completeDiagnostics)) {
+          sb.append("\n").append(completeDiagnostics);
+        }
+        break;
+      case UNKNOWN:
+        sb.append("LLAP status unknown");
+        break;
}
- if (llapRegistry != null) {
- llapRegistry.stop();
+    if (StringUtils.isNotBlank(appStatusBuilder.getDiagnostics())) {
+      sb.append("\n").append(appStatusBuilder.getDiagnostics());
}
+
+    return sb.toString();
+  }
+
+  /** Lists every completed container on its own tab-indented line; yields "" when there are none. */
+  private static String constructCompletedContainerDiagnostics(List<LlapInstance> completedInstances) {
+    if (completedInstances == null || completedInstances.isEmpty()) {
+      return "";
+    }
+
+    // TODO HIVE-15865 Ideally sort these by completion time, once that is available.
+    StringBuilder report = new StringBuilder();
+    for (LlapInstance container : completedInstances) {
+      // Newlines go between entries only, so nothing precedes the very first one.
+      if (report.length() > 0) {
+        report.append("\n");
+      }
+
+      boolean killedForMemory =
+          container.getYarnContainerExitStatus() == ContainerExitStatus.KILLED_EXCEEDED_PMEM
+              || container.getYarnContainerExitStatus() == ContainerExitStatus.KILLED_EXCEEDED_VMEM;
+      // TODO HIVE-15865 Handle additional reasons like OS launch failed
+      String verdict = killedForMemory
+          ? "\tKILLED container (by YARN for exceeding memory limits): "
+          : "\tFAILED container: ";
+
+      report.append(verdict).append(" ").append(container.getContainerId())
+          .append(", Logs at: ").append(container.getLogUrl());
+    }
+    return report.toString();
+  }
+
+ private static void logError(Throwable t) { // reports a failure both to the log and on stderr
+ LOG.error("FAILED: " + t.getMessage(), t); // t is handed over as well, not just its message
+ System.err.println("FAILED: " + t.getMessage());
}
}
http://git-wip-us.apache.org/repos/asf/hive/blob/a2bbf792/llap-server/src/test/org/apache/hadoop/hive/llap/cli/TestLlapStatusServiceDriver.java
----------------------------------------------------------------------
diff --git a/llap-server/src/test/org/apache/hadoop/hive/llap/cli/TestLlapStatusServiceDriver.java b/llap-server/src/test/org/apache/hadoop/hive/llap/cli/TestLlapStatusServiceDriver.java
new file mode 100644
index 0000000..54166d5
--- /dev/null
+++ b/llap-server/src/test/org/apache/hadoop/hive/llap/cli/TestLlapStatusServiceDriver.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.llap.cli;
+
+import org.apache.hadoop.hive.llap.cli.LlapStatusOptionsProcessor.LlapStatusOptions;
+import org.apache.hadoop.hive.llap.cli.LlapStatusServiceDriver.ExitCode;
+import org.apache.hadoop.hive.llap.cli.LlapStatusServiceDriver.LlapStatusCliException;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import static junit.framework.TestCase.assertEquals;
+
+import java.util.Properties;
+
+// TODO: write unit tests for the main logic of this class - needs refactoring first, current design isn't testable.
+/**
+ * Tests the command line parsing of {@link LlapStatusServiceDriver}: the defaults applied when no arguments
+ * are given, and the rejection of out-of-range option values as incorrect usage.
+ */
+public class TestLlapStatusServiceDriver {
+  @Rule
+  public ExpectedException thrown = ExpectedException.none();
+
+  /** With an empty argument list every option must report its default. */
+  @Test
+  public void testArgumentParsingDefault() throws LlapStatusCliException {
+    LlapStatusServiceDriver driver = new LlapStatusServiceDriver();
+    LlapStatusOptions parseOptions = driver.parseOptions(new String[] {});
+
+    // assertEquals takes (message, expected, actual); the expected default therefore comes first.
+    assertEquals("findAppTimeout should be the default value if not specified otherwise",
+        LlapStatusOptionsProcessor.FIND_YARN_APP_TIMEOUT_MS, parseOptions.getFindAppTimeoutMs());
+
+    assertEquals("refreshInterval should be the default value if not specified otherwise",
+        LlapStatusOptionsProcessor.DEFAULT_STATUS_REFRESH_INTERVAL_MS, parseOptions.getRefreshIntervalMs());
+
+    assertEquals("watchTimeout should be the default value if not specified otherwise",
+        LlapStatusOptionsProcessor.DEFAULT_WATCH_MODE_TIMEOUT_MS, parseOptions.getWatchTimeoutMs());
+
+    assertEquals("runningNodesThreshold should be the default value if not specified otherwise",
+        LlapStatusOptionsProcessor.DEFAULT_RUNNING_NODES_THRESHOLD, parseOptions.getRunningNodesThreshold());
+
+    assertEquals("hiveConf should be empty properties if not specified otherwise", new Properties(),
+        parseOptions.getConf());
+
+    assertEquals("isLaunched should be the true if not specified otherwise", true, parseOptions.isLaunched());
+    assertEquals("watchMode should be the false if not specified otherwise", false, parseOptions.isWatchMode());
+  }
+
+  /** A negative refresh interval must be reported as incorrect usage. */
+  @Test
+  public void testNegativeRefreshInterval() throws LlapStatusCliException {
+    expectIncorrectUsage("--refreshInterval", "-1");
+  }
+
+  /** A negative watch timeout must be reported as incorrect usage. */
+  @Test
+  public void testNegativeWatchTimeout() throws LlapStatusCliException {
+    expectIncorrectUsage("--watchTimeout", "-1");
+  }
+
+  /** A negative running nodes threshold must be reported as incorrect usage. */
+  @Test
+  public void testNegativeRunningNodesThreshold() throws LlapStatusCliException {
+    expectIncorrectUsage("--runningNodesThreshold", "-1");
+  }
+
+  /** A running nodes threshold above one must be reported as incorrect usage. */
+  @Test
+  public void testRunningNodesThresholdOverOne() throws LlapStatusCliException {
+    expectIncorrectUsage("--runningNodesThreshold", "1.1");
+  }
+
+  /** Parses args (one array element per command line token) and expects an INCORRECT_USAGE failure. */
+  private void expectIncorrectUsage(String... args) throws LlapStatusCliException {
+    thrown.expect(LlapStatusCliException.class);
+    thrown.expectMessage(ExitCode.INCORRECT_USAGE.getInt() + ": Incorrect usage");
+    new LlapStatusServiceDriver().parseOptions(args);
+  }
+}