You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by sn...@apache.org on 2019/08/21 14:49:44 UTC

[hadoop] branch branch-3.2 updated: YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko

This is an automated email from the ASF dual-hosted git repository.

snemeth pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new 6980f17  YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko
6980f17 is described below

commit 6980f1740fe4037653a4095ed42dfe5b84d24850
Author: Szilard Nemeth <sn...@apache.org>
AuthorDate: Wed Aug 21 16:49:34 2019 +0200

    YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko
---
 .../apache/hadoop/yarn/conf/YarnConfiguration.java |  14 +++
 .../src/main/resources/yarn-default.xml            |  11 +++
 .../linux/resources/ResourcesExceptionUtil.java    |  42 ++++++++
 .../resources/gpu/GpuResourceHandlerImpl.java      |  13 ++-
 .../resourceplugin/ResourcePluginManager.java      |   7 +-
 .../resourceplugin/gpu/GpuDiscoverer.java          | 108 ++++++++++++---------
 .../gpu/GpuNodeResourceUpdateHandler.java          |  13 ++-
 .../resourceplugin/gpu/GpuResourcePlugin.java      |  35 ++++++-
 .../resourceplugin/gpu/NvidiaBinaryHelper.java     |  63 ++++++++++++
 .../resources/gpu/TestGpuResourceHandler.java      |  16 +--
 .../resourceplugin/gpu/TestGpuDiscoverer.java      |  47 ++++-----
 11 files changed, 275 insertions(+), 94 deletions(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 79593ea..04a7003 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1612,6 +1612,20 @@ public class YarnConfiguration extends Configuration {
   public static final String NM_RESOURCE_PLUGINS =
       NM_PREFIX + "resource-plugins";
 
+
+  /**
+   * Specifies whether the Node Manager should fail fast, i.e. abort its
+   * initialization, if a certain device (GPU, FPGA, etc) was not found in
+   * the system. If set to "true", then an exception will be thrown if a
+   * device is missing or an error occurred during discovery.
+   */
+  @Private
+  public static final String NM_RESOURCE_PLUGINS_FAIL_FAST =
+      NM_RESOURCE_PLUGINS + ".fail-fast";
+
+  @Private
+  public static final boolean DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST = true;
+
   /**
    * Prefix for gpu configurations. Work in progress: This configuration
    * parameter may be changed/removed in the future.
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 887a7c3..f99977e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -3800,6 +3800,17 @@
 
   <property>
     <description>
+      Specifies whether the Node Manager should fail fast, i.e. abort its
+      initialization, if a certain device (GPU, FPGA, etc) was not found in
+      the system. If set to "true", then an exception will be thrown if a
+      device is missing or an error occurred during discovery.
+    </description>
+    <name>yarn.nodemanager.resource-plugins.fail-fast</name>
+    <value></value>
+  </property>
+
+  <property>
+    <description>
       Specify GPU devices which can be managed by YARN NodeManager, split by comma
       Number of GPU devices will be reported to RM to make scheduling decisions.
       Set to auto (default) let YARN automatically discover GPU resource from
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java
new file mode 100644
index 0000000..f270f42
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
+
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_RESOURCE_PLUGINS_FAIL_FAST;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+
+/**
+ * Propagates a YarnException only if NM_RESOURCE_PLUGINS_FAIL_FAST is true.
+ */
+public final class ResourcesExceptionUtil {
+  private ResourcesExceptionUtil() {}
+
+  /** Rethrows {@code e} when fail-fast is enabled in {@code conf}. */
+  public static void throwIfNecessary(YarnException e, Configuration conf)
+      throws YarnException {
+    final boolean failFast = conf.getBoolean(NM_RESOURCE_PLUGINS_FAIL_FAST,
+        DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST);
+    if (failFast) {
+      throw e;
+    }
+  }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
index bcade9e..00c8a85 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
@@ -18,6 +18,12 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
 
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -36,10 +42,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
 public class GpuResourceHandlerImpl implements ResourceHandler {
   final static Log LOG = LogFactory
       .getLog(GpuResourceHandlerImpl.class);
@@ -75,7 +77,8 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
         String message = "GPU is enabled on the NodeManager, but couldn't find "
             + "any usable GPU devices, please double check configuration!";
         LOG.error(message);
-        throw new ResourceHandlerException(message);
+        throwIfNecessary(new ResourceHandlerException(message),
+            configuration);
       }
     } catch (YarnException e) {
       LOG.error("Exception when trying to get usable GPU device", e);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
index 4ace3ae..9e7652c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
@@ -60,7 +60,7 @@ public class ResourcePluginManager {
 
     Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
     if (plugins != null) {
-      pluginMap = initializePlugins(context, plugins);
+      pluginMap = initializePlugins(conf, context, plugins);
     }
 
     configuredPlugins = Collections.unmodifiableMap(pluginMap);
@@ -77,8 +77,7 @@ public class ResourcePluginManager {
     return plugins;
   }
 
-
-  private Map<String, ResourcePlugin> initializePlugins(
+  private Map<String, ResourcePlugin> initializePlugins(Configuration conf,
       Context context, String[] plugins) throws YarnException {
     Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
 
@@ -91,7 +90,7 @@ public class ResourcePluginManager {
         if (resourceName.equals(GPU_URI)) {
           final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
           final GpuNodeResourceUpdateHandler updateHandler =
-              new GpuNodeResourceUpdateHandler(gpuDiscoverer);
+              new GpuNodeResourceUpdateHandler(gpuDiscoverer, conf);
           plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
         } else if (resourceName.equals(FPGA_URI)) {
           plugin = new FpgaResourcePlugin();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
index ce76722..f710ff0 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -18,14 +18,19 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
@@ -34,13 +39,10 @@ import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInfo
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 
 
 @InterfaceAudience.Private
@@ -57,11 +59,10 @@ public class GpuDiscoverer {
   private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
       "/usr/bin", "/bin", "/usr/local/nvidia/bin");
 
-  // command should not run more than 10 sec.
-  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
   private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
 
   private Configuration conf = null;
+  private NvidiaBinaryHelper nvidiaBinaryHelper;
   private String pathOfGpuBinary = null;
   private Map<String, String> environment = new HashMap<>();
 
@@ -110,24 +111,17 @@ public class GpuDiscoverer {
    * @return GpuDeviceInformation
    * @throws YarnException when any error happens
    */
-  synchronized GpuDeviceInformation getGpuDeviceInformation()
+  public synchronized GpuDeviceInformation getGpuDeviceInformation()
       throws YarnException {
-    validateConfOrThrowException();
-
     if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
       String msg = getErrorMessageOfScriptExecutionThresholdReached();
       LOG.error(msg);
       throw new YarnException(msg);
     }
 
-    String output;
     try {
-      output = Shell.execCommand(environment,
-          new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
-      GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
-      lastDiscoveredGpuInformation = parser.parseXml(output);
-      numOfErrorExecutionSinceLastSucceed = 0;
-      return lastDiscoveredGpuInformation;
+      lastDiscoveredGpuInformation =
+          nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
     } catch (IOException e) {
       numOfErrorExecutionSinceLastSucceed++;
       String msg = getErrorMessageOfScriptExecution(e.getMessage());
@@ -138,17 +132,18 @@ public class GpuDiscoverer {
     } catch (YarnException e) {
       numOfErrorExecutionSinceLastSucceed++;
       String msg = getFailedToParseErrorMessage(e.getMessage());
-      if (LOG.isDebugEnabled()) {
-        LOG.warn(msg, e);
-      }
+      LOG.debug(msg, e);
       throw e;
     }
+
+    return lastDiscoveredGpuInformation;
   }
 
-  private boolean IsAutoDiscoveryEnabled() {
+  private boolean isAutoDiscoveryEnabled() {
     String allowedDevicesStr = conf.get(
         YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
         YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
+
     return allowedDevicesStr.equals(
         YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
   }
@@ -157,13 +152,12 @@ public class GpuDiscoverer {
    * Get list of GPU devices usable by YARN.
    *
    * @return List of GPU devices
-   * @throws YarnException when any issue happens
    */
   public synchronized List<GpuDevice> getGpusUsableByYarn()
       throws YarnException {
     validateConfOrThrowException();
 
-    if (IsAutoDiscoveryEnabled()) {
+    if (isAutoDiscoveryEnabled()) {
       return parseGpuDevicesFromAutoDiscoveredGpuInfo();
     } else {
       if (gpuDevicesFromUser == null) {
@@ -219,16 +213,27 @@ public class GpuDiscoverer {
       if (device.trim().length() > 0) {
         String[] splitByColon = device.trim().split(":");
         if (splitByColon.length != 2) {
-          throw GpuDeviceSpecificationException.
-              createWithWrongValueSpecified(device, devices);
+          throwIfNecessary(GpuDeviceSpecificationException
+              .createWithWrongValueSpecified(device, devices), conf);
+          LOG.warn("Wrong GPU specification string {}, ignored", device);
+          continue; // skip it: only index:minor pairs can be parsed below
+        }
+        GpuDevice gpuDevice;
+        try {
+          gpuDevice = parseGpuDevice(splitByColon);
+        } catch (NumberFormatException e) {
+          throwIfNecessary(GpuDeviceSpecificationException
+              .createWithWrongValueSpecified(device, devices, e), conf);
+          LOG.warn("Cannot parse GPU device numbers: {}", device);
+          continue;
         }
 
-        GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
         if (!gpuDevices.contains(gpuDevice)) {
           gpuDevices.add(gpuDevice);
         } else {
-          throw GpuDeviceSpecificationException
-              .createWithDuplicateValueSpecified(device, devices);
+          throwIfNecessary(GpuDeviceSpecificationException
+              .createWithDuplicateValueSpecified(device, devices), conf);
+          LOG.warn("GPU device is duplicated: {}", device);
         }
       }
     }
@@ -237,22 +242,18 @@ public class GpuDiscoverer {
     return gpuDevices;
   }
 
-  private GpuDevice parseGpuDevice(String device, String[] splitByColon,
-      String allowedDevicesStr) throws YarnException {
-    try {
-      int index = Integer.parseInt(splitByColon[0]);
-      int minorNumber = Integer.parseInt(splitByColon[1]);
-      return new GpuDevice(index, minorNumber);
-    } catch (NumberFormatException e) {
-      throw GpuDeviceSpecificationException.
-          createWithWrongValueSpecified(device, allowedDevicesStr, e);
-    }
+  private GpuDevice parseGpuDevice(String[] splitByColon) {
+    final int gpuIndex = Integer.parseInt(splitByColon[0]);
+    final int gpuMinorNumber = Integer.parseInt(splitByColon[1]);
+    return new GpuDevice(gpuIndex, gpuMinorNumber);
+  }
 
-  public synchronized void initialize(Configuration config)
-      throws YarnException {
+
+  public synchronized void initialize(Configuration config,
+      NvidiaBinaryHelper nvidiaHelper) throws YarnException {
     this.conf = config;
-    if (IsAutoDiscoveryEnabled()) {
+    this.nvidiaBinaryHelper = nvidiaHelper;
+    if (isAutoDiscoveryEnabled()) {
       numOfErrorExecutionSinceLastSucceed = 0;
       lookUpAutoDiscoveryBinary(config);
 
@@ -286,7 +287,18 @@ public class GpuDiscoverer {
       binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
     } else {
       binaryPath = configuredBinaryFile;
+      // Path exists but is not named like the expected binary: reject it
+      String fileName = binaryPath.getName();
+      if (!DEFAULT_BINARY_NAME.equals(fileName)) {
+        String msg = String.format("Please check the configuration value of"
+             + " %s. It should point to an %s binary.",
+             YarnConfiguration.NM_GPU_PATH_TO_EXEC,
+             DEFAULT_BINARY_NAME);
+        throwIfNecessary(new YarnException(msg), config);
+        LOG.warn(msg);
+      }
     }
+
     pathOfGpuBinary = binaryPath.getAbsolutePath();
   }
 
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
index 4b2258d..afb0d7e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
@@ -18,6 +18,9 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.ResourceInformation;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
@@ -36,9 +39,12 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
   private static final Logger LOG =
       LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
   private final GpuDiscoverer gpuDiscoverer;
+  private Configuration conf;
 
-  public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
+  public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer,
+      Configuration conf) {
     this.gpuDiscoverer = gpuDiscoverer;
+    this.conf = conf;
   }
 
   @Override
@@ -51,7 +57,8 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
           "but could not find any usable GPUs on the NodeManager!";
       LOG.error(message);
       // No gpu can be used by YARN.
-      throw new YarnException(message);
+      throwIfNecessary(new YarnException(message), conf);
+      return;
     }
 
     long nUsableGpus = usableGpus.size();
@@ -59,7 +66,7 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
     Map<String, ResourceInformation> configuredResourceTypes =
         ResourceUtils.getResourceTypes();
     if (!configuredResourceTypes.containsKey(GPU_URI)) {
-      throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
+      LOG.warn("Found " + nUsableGpus + " usable GPUs, however "
           + GPU_URI
           + " resource-type is not configured inside"
           + " resource-types.xml, please configure it to enable GPU feature or"
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
index 2b06f31..d44160e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
+import java.util.List;
+
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
@@ -32,8 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
-
-import java.util.List;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -44,6 +44,10 @@ public class GpuResourcePlugin implements ResourcePlugin {
 
   private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
   private final GpuDiscoverer gpuDiscoverer;
+  public static final int MAX_REPEATED_ERROR_ALLOWED = 10;
+
+  private int numOfErrorExecutionSinceLastSucceed = 0;
+
   private GpuResourceHandlerImpl gpuResourceHandler = null;
   private DockerCommandPlugin dockerCommandPlugin = null;
 
@@ -55,7 +59,8 @@ public class GpuResourcePlugin implements ResourcePlugin {
 
   @Override
   public void initialize(Context context) throws YarnException {
-    this.gpuDiscoverer.initialize(context.getConf());
+    this.gpuDiscoverer.initialize(context.getConf(),
+        new NvidiaBinaryHelper());
     this.dockerCommandPlugin =
         GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
             context.getConf());
@@ -89,12 +94,21 @@ public class GpuResourcePlugin implements ResourcePlugin {
 
   @Override
   public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
-    GpuDeviceInformation gpuDeviceInformation =
-        gpuDiscoverer.getGpuDeviceInformation();
+    GpuDeviceInformation gpuDeviceInformation;
 
     //At this point the gpu plugin is already enabled
     checkGpuResourceHandler();
 
+    checkErrorCount();
+    try{
+      gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
+      numOfErrorExecutionSinceLastSucceed = 0;
+    } catch (YarnException e) {
+      LOG.error(e.getMessage(), e);
+      numOfErrorExecutionSinceLastSucceed++;
+      throw e;
+    }
+
     GpuResourceAllocator gpuResourceAllocator =
         gpuResourceHandler.getGpuAllocator();
     List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
@@ -116,6 +130,17 @@ public class GpuResourcePlugin implements ResourcePlugin {
     }
   }
 
+  /** Throws once the repeated-error limit has been reached. */
+  private void checkErrorCount() throws YarnException {
+    if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
+      final String errorText = "Failed to execute GPU device information"
+          + " detection script for " + MAX_REPEATED_ERROR_ALLOWED
+          + " times, skip following executions.";
+      LOG.error(errorText);
+      throw new YarnException(errorText);
+    }
+  }
+
   @Override
   public String toString() {
     return GpuResourcePlugin.class.getName();
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
new file mode 100644
index 0000000..8efc32a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import java.io.IOException;
+import java.util.HashMap;
+
+import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
+
+/**
+ * Thin wrapper around the GPU discovery binary ("nvidia-smi"): runs it
+ * and turns the XML it prints into a GpuDeviceInformation object.
+ */
+public class NvidiaBinaryHelper {
+  /** Upper bound for one run of the binary: 10 seconds, in milliseconds. */
+  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
+
+  /**
+   * Runs the binary with the "-x" and "-q" arguments and parses its output.
+   * @param pathOfGpuBinary The path of the binary
+   * @return the GpuDeviceInformation parsed from the binary's output
+   * @throws IOException if executing the binary fails
+   * @throws YarnException if pathOfGpuBinary is null or parsing fails
+   */
+  synchronized GpuDeviceInformation getGpuDeviceInformation(
+      String pathOfGpuBinary) throws IOException, YarnException {
+    final GpuDeviceInformationParser xmlParser =
+        new GpuDeviceInformationParser();
+
+    if (null == pathOfGpuBinary) {
+      final String hint =
+          "Failed to find GPU discovery executable, please double check "
+              + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.";
+      throw new YarnException(hint);
+    }
+
+    final String[] command = {pathOfGpuBinary, "-x", "-q"};
+    final String rawXml = Shell.execCommand(new HashMap<>(), command,
+        MAX_EXEC_TIMEOUT_MS);
+    return xmlParser.parseXml(rawXml);
+  }
+}
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
index efd28ee..5cb508a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java
@@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
@@ -116,11 +117,13 @@ public class TestGpuResourceHandler {
   @Rule
   public ExpectedException expected = ExpectedException.none();
 
+  private NvidiaBinaryHelper nvidiaBinaryHelper;
+
   @Before
   public void setup() throws IOException {
     createTestDataDirectory();
-
     TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI);
+    nvidiaBinaryHelper = new NvidiaBinaryHelper();
 
     mockCGroupsHandler = mock(CGroupsHandler.class);
     mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class);
@@ -146,13 +149,14 @@ public class TestGpuResourceHandler {
   @After
   public void cleanupTestFiles() throws IOException {
     FileUtils.deleteDirectory(testDataDirectory);
+    nvidiaBinaryHelper = new NvidiaBinaryHelper();
   }
 
   @Test
   public void testBootstrapWithRealGpuDiscoverer() throws Exception {
     Configuration conf = createDefaultConfig();
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     gpuResourceHandler.bootstrap(conf);
 
@@ -170,7 +174,7 @@ public class TestGpuResourceHandler {
   public void testBootstrapWithMockGpuDiscoverer() throws Exception {
     GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class);
     Configuration conf = new YarnConfiguration();
-    mockDiscoverer.initialize(conf);
+    mockDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     expected.expect(ResourceHandlerException.class);
     gpuResourceHandler.bootstrap(conf);
@@ -270,7 +274,7 @@ public class TestGpuResourceHandler {
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
 
     gpuDiscoverer = new GpuDiscoverer();
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
     Context nmContext = createMockNmContext(conf);
     gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
         mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
@@ -379,7 +383,7 @@ public class TestGpuResourceHandler {
   public void testAllocationWithoutAllowedGpus() throws Exception {
     Configuration conf = createDefaultConfig();
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     try {
       gpuResourceHandler.bootstrap(conf);
@@ -460,7 +464,7 @@ public class TestGpuResourceHandler {
         new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
         mockPrivilegedExecutor, gpuDiscoverer);
 
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     gpuNULLStateResourceHandler.bootstrap(conf);
     verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler);
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
index f0f100c..8261895 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java
@@ -64,6 +64,7 @@ public class TestGpuDiscoverer {
   private static final String BASH_SHEBANG = "#!/bin/bash\n\n";
   private static final String TEST_PARENT_DIR = new File("target/temp/" +
       TestGpuDiscoverer.class.getName()).getAbsolutePath();
+  private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper();
 
   @Rule
   public ExpectedException exception = ExpectedException.none();
@@ -150,7 +151,7 @@ public class TestGpuDiscoverer {
       Configuration conf) throws YarnException {
     conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     return discoverer;
   }
 
@@ -163,14 +164,14 @@ public class TestGpuDiscoverer {
     // test case 1, check default setting.
     Configuration conf = new Configuration(false);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary());
     assertNvidiaIsOnPath(discoverer);
 
     // test case 2, check mandatory set path.
     File fakeBinary = setupFakeBinary(conf);
     discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     assertEquals(fakeBinary.getAbsolutePath(),
         discoverer.getPathOfGpuBinary());
     assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
@@ -179,7 +180,7 @@ public class TestGpuDiscoverer {
     // but binary doesn't exist so default path will be used.
     fakeBinary.delete();
     discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     assertEquals(DEFAULT_BINARY_NAME,
         discoverer.getPathOfGpuBinary());
     assertNvidiaIsOnPath(discoverer);
@@ -310,14 +311,14 @@ public class TestGpuDiscoverer {
   }
 
   @Test
-  public void testGpuDiscover() throws YarnException {
+  public void testGpuDiscover() throws YarnException, IOException {
     // Since this is more of a performance unit test, only run if
     // RunUserLimitThroughput is set (-DRunUserLimitThroughput=true)
     Assume.assumeTrue(
         Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
     Configuration conf = new Configuration(false);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
 
     assertTrue(info.getGpus().size() > 0);
@@ -331,7 +332,7 @@ public class TestGpuDiscoverer {
     Configuration conf = createConfigWithAllowedDevices("1:2");
 
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
     assertEquals(1, usableGpuDevices.size());
 
@@ -346,7 +347,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -354,7 +355,7 @@ public class TestGpuDiscoverer {
   public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
     Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
 
     List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
     assertEquals(4, usableGpuDevices.size());
@@ -379,7 +380,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -390,7 +391,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -401,7 +402,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -412,7 +413,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -423,7 +424,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -434,7 +435,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -445,7 +446,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -456,7 +457,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -467,7 +468,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -478,7 +479,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -488,7 +489,7 @@ public class TestGpuDiscoverer {
     conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
     GpuDiscoverer plugin = new GpuDiscoverer();
     try {
-      plugin.initialize(conf);
+      plugin.initialize(conf, binaryHelper);
       plugin.getGpusUsableByYarn();
       fail("Illegal format, should fail.");
     } catch (YarnException e) {
@@ -501,15 +502,15 @@ public class TestGpuDiscoverer {
   }
 
   @Test
-  public void testScriptNotCalled() throws YarnException {
+  public void testScriptNotCalled() throws YarnException, IOException {
     Configuration conf = new Configuration();
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
 
     GpuDiscoverer gpuSpy = spy(new GpuDiscoverer());
 
-    gpuSpy.initialize(conf);
+    gpuSpy.initialize(conf, binaryHelper);
     gpuSpy.getGpusUsableByYarn();
 
     verify(gpuSpy, never()).getGpuDeviceInformation();
   }
-}
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org