You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/07/02 14:34:42 UTC

svn commit: r1498904 - in /manifoldcf/trunk: ./ connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/ connectors/hdfs/connector/src/main/...

Author: kwright
Date: Tue Jul  2 12:34:41 2013
New Revision: 1498904

URL: http://svn.apache.org/r1498904
Log:
Fix for CONNECTORS-742.  Also revamped hdfs connectors to break up URI into host and port components.

Added:
    manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-hopcount.PNG   (with props)
    manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-paths.PNG   (with props)
    manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-repository-configure-server.PNG   (with props)
Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java
    manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java
    manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java
    manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java
    manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
    manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java
    manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties
    manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties
    manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties
    manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties
    manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html
    manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js
    manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html
    manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
    manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-configure-server.PNG

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Jul  2 12:34:41 2013
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 1.3-dev =====================
 
+CONNECTORS-742: Document HDFS connector.
+(Karl Wright)
+
 CONNECTORS-741: Document HDFS output connector.
 (Karl Wright)
 

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java Tue Jul  2 12:34:41 2013
@@ -32,8 +32,9 @@ public class HDFSOutputConfig extends HD
 
   /** Parameters used for the configuration */
   final private static ParameterEnum[] CONFIGURATIONLIST = {
-	  ParameterEnum.NAMENODE,
-	  ParameterEnum.USER
+    ParameterEnum.NAMENODEHOST,
+    ParameterEnum.NAMENODEPORT,
+    ParameterEnum.USER
   };
 
   /** Build a set of ElasticSearchParameters by reading ConfigParams. If the

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java Tue Jul  2 12:34:41 2013
@@ -130,14 +130,20 @@ public class HDFSOutputConnector extends
 
   /** Set up a session */
   protected void getSession() throws ManifoldCFException, ServiceInterruption {
-    String nameNode = params.getParameter(ParameterEnum.NAMENODE.name());
-    if (nameNode == null)
-      throw new ManifoldCFException("Namenode must be specified");
+    String nameNodeHost = params.getParameter(ParameterEnum.NAMENODEHOST.name());
+    if (nameNodeHost == null)
+      throw new ManifoldCFException("Namenodehost must be specified");
+
+    String nameNodePort = params.getParameter(ParameterEnum.NAMENODEPORT.name());
+    if (nameNodePort == null)
+      throw new ManifoldCFException("Namenodeport must be specified");
     
     String user = params.getParameter(ParameterEnum.USER.name());
     if (user == null)
       throw new ManifoldCFException("User must be specified");
     
+    String nameNode = "hdfs://"+nameNodeHost+":"+nameNodePort;
+
     /*
      * make Configuration
      */

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java Tue Jul  2 12:34:41 2013
@@ -27,8 +27,11 @@ public class HDFSOutputConstant
 
   // Configuration parameters
 
-  /** Name node */
-  public static final String PARAM_NAMENODE = "namenode";
+  /** Name node host */
+  public static final String PARAM_NAMENODEHOST = "namenodehost";
+
+  /** Name node port */
+  public static final String PARAM_NAMENODEPORT = "namenodeport";
 
   /** User */
   public static final String PARAM_USER = "user";

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java Tue Jul  2 12:34:41 2013
@@ -24,7 +24,8 @@ import java.util.Map;
 
 /** Parameters constants */
 public enum ParameterEnum {
-  NAMENODE("hdfs://localhost:9000"),
+  NAMENODEHOST("localhost"),
+  NAMENODEPORT("9000"),
   USER(""),
   ROOTPATH("");
 

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java Tue Jul  2 12:34:41 2013
@@ -33,7 +33,6 @@ import org.apache.manifoldcf.core.common
 import org.apache.manifoldcf.core.common.XThreadStringBuffer;
 import org.apache.manifoldcf.core.extmimemap.ExtensionMimeMap;
 
-import java.security.GeneralSecurityException;
 import java.util.*;
 import java.io.*;
 import java.net.URI;
@@ -55,7 +54,8 @@ public class HDFSRepositoryConnector ext
   // Activities list
   protected static final String[] activitiesList = new String[]{ACTIVITY_READ};
 
-  protected String nameNode = null;
+  protected String nameNodeHost = null;
+  protected String nameNodePort = null;
   protected String user = null;
   protected Configuration config = null;
   protected HDFSSession session = null;
@@ -122,8 +122,8 @@ public class HDFSRepositoryConnector ext
   public void connect(ConfigParams configParams) {
     super.connect(configParams);
 
-    nameNode = configParams.getParameter("namenode");
-    
+    nameNodeHost = configParams.getParameter("namenodehost");
+    nameNodePort = configParams.getParameter("namenodeport");
     user = configParams.getParameter("user");
     
     /*
@@ -133,7 +133,7 @@ public class HDFSRepositoryConnector ext
     try {
       Thread.currentThread().setContextClassLoader(org.apache.hadoop.conf.Configuration.class.getClassLoader());
       config = new Configuration();
-      config.set("fs.default.name", nameNode);
+      config.set("fs.default.name", makeNameNodeURI(nameNodeHost, nameNodePort));
     } finally {
       Thread.currentThread().setContextClassLoader(ocl);
     }
@@ -144,34 +144,34 @@ public class HDFSRepositoryConnector ext
    */
   @Override
   public void disconnect() throws ManifoldCFException {
-    if (session != null) {
-      try {
-        session.close();
-      } catch (IOException e) {
-    	throw new ManifoldCFException(e);  
-      } finally {
-        session = null;
-        lastSessionFetch = -1L;
-      }
-    }
-  
-    config.clear();
+    closeSession();
     config = null;
     user = null;
-    nameNode = null;
+    nameNodeHost = null;
+    nameNodePort = null;
     super.disconnect();
   }
 
+  protected static String makeNameNodeURI(String host, String port) {
+    return "hdfs://"+host+":"+port;
+  }
+  
   /**
    * Set up a session
    */
   protected void getSession() throws ManifoldCFException, ServiceInterruption {
     if (session == null) {
-      if (StringUtils.isEmpty(nameNode)) {
-        throw new ManifoldCFException("Parameter namenode required but not set");
+      if (StringUtils.isEmpty(nameNodeHost)) {
+        throw new ManifoldCFException("Parameter namenodehost required but not set");
+      }
+      if (Logging.connectors.isDebugEnabled()) {
+        Logging.connectors.debug("HDFS: NameNodeHost = '" + nameNodeHost + "'");
+      }
+      if (StringUtils.isEmpty(nameNodePort)) {
+        throw new ManifoldCFException("Parameter namenodeport required but not set");
       }
       if (Logging.connectors.isDebugEnabled()) {
-        Logging.connectors.debug("HDFS: NameNode = '" + nameNode + "'");
+        Logging.connectors.debug("HDFS: NameNodePort = '" + nameNodePort + "'");
       }
 
       if (StringUtils.isEmpty(user)) {
@@ -190,8 +190,10 @@ public class HDFSRepositoryConnector ext
         if (thr != null) {
           if (thr instanceof IOException) {
             throw (IOException) thr;
-          } else if (thr instanceof GeneralSecurityException) {
-            throw (GeneralSecurityException) thr;
+          } else if (thr instanceof URISyntaxException) {
+            throw (URISyntaxException) thr;
+          } else if (thr instanceof RuntimeException) {
+            throw (RuntimeException) thr;
           } else {
             throw (Error) thr;
           }
@@ -205,9 +207,9 @@ public class HDFSRepositoryConnector ext
       } catch (InterruptedIOException e) {
         t.interrupt();
         throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
-      } catch (GeneralSecurityException e) {
-        Logging.connectors.error("HDFS: " +  "General security error initializing transport: " + e.getMessage(), e);
-        handleGeneralSecurityException(e);
+      } catch (URISyntaxException e) {
+        Logging.connectors.error("HDFS: URI syntax exception: " + e.getMessage(), e);
+        handleURISyntaxException(e);
       } catch (IOException e) {
         Logging.connectors.warn("HDFS: IO error: " + e.getMessage(), e);
         handleIOException(e);
@@ -243,17 +245,7 @@ public class HDFSRepositoryConnector ext
     CheckConnectionThread t = new CheckConnectionThread();
     try {
       t.start();
-      t.join();
-      Throwable thr = t.getException();
-      if (thr != null) {
-        if (thr instanceof IOException) {
-          throw (IOException) thr;
-        } else if (thr instanceof RuntimeException) {
-          throw (RuntimeException) thr;
-        } else {
-          throw (Error) thr;
-        }
-      }
+      t.finishUp();
       return;
     } catch (InterruptedException e) {
       t.interrupt();
@@ -281,15 +273,26 @@ public class HDFSRepositoryConnector ext
 
     long currentTime = System.currentTimeMillis();
     if (currentTime >= lastSessionFetch + timeToRelease) {
-      if (session != null) {
-        try {
-          session.close();
-        } catch (IOException e) {
-          throw new ManifoldCFException(e);  
-        } finally {
-          session = null;
-          lastSessionFetch = -1L;
-        }
+      closeSession();
+    }
+  }
+
+  protected void closeSession()
+    throws ManifoldCFException {
+    if (session != null) {
+      try {
+        // This can in theory throw an IOException, so it is possible it is doing socket
+        // communication.  In practice, it's unlikely that there's any real IO, so I'm
+        // NOT putting it in a background thread for now.
+        session.close();
+      } catch (InterruptedIOException e) {
+        throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+      } catch (IOException e) {
+        Logging.connectors.warn("HDFS: Error closing connection: "+e.getMessage(),e);
+        // Eat the exception
+      } finally {
+        session = null;
+        lastSessionFetch = -1L;
       }
     }
   }
@@ -344,40 +347,12 @@ public class HDFSRepositoryConnector ext
       if (sn.getType().equals("startpoint")) {
         path = sn.getAttributeValue("path");
   
-        getSession();
-        GetSeedsThread t = new GetSeedsThread(path);
-        try {
-          t.start();
-          boolean wasInterrupted = false;
-          try {
-            XThreadStringBuffer seedBuffer = t.getBuffer();
+        FileStatus[] statuses = getChildren(new Path(path));
 
-            // Pick up the paths, and add them to the activities, before we join with the child thread.
-            while (true) {
-              // The only kind of exceptions this can throw are going to shut the process down.
-              String docPath = seedBuffer.fetch();
-              if (docPath ==  null) {
-                break;
-              }
-              // Add the pageID to the queue
-              activities.addSeedDocument(docPath);
-            }
-          } catch (InterruptedException e) {
-            wasInterrupted = true;
-            throw e;
-          } catch (ManifoldCFException e) {
-            if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
-              wasInterrupted = true;
-            }
-            throw e;
-          } finally {
-            if (!wasInterrupted) {
-              t.finishUp();
-            }
+        for (FileStatus fileStatus : statuses) {
+          if (fileStatus.isDir()) {
+            activities.addSeedDocument(fileStatus.getPath().toUri().toString());
           }
-        } catch (InterruptedException e) {
-          t.interrupt();
-          throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
         }
       }
       i++;
@@ -408,17 +383,9 @@ public class HDFSRepositoryConnector ext
   {
     String[] rval = new String[documentIdentifiers.length];
     for (int i = 0; i < rval.length; i++) {
-      getSession();
-      GetObjectThread objt = new GetObjectThread(documentIdentifiers[i]);
-      try {
-        objt.start();
-        objt.finishUp();
-      } catch (InterruptedException e) {
-        objt.interrupt();
-        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
-      }
-
-      FileStatus fileStatus = objt.getResponse();
+      String documentIdentifier = documentIdentifiers[i];
+      
+      FileStatus fileStatus = getObject(new Path(documentIdentifier));
       if (fileStatus != null) {
         if (fileStatus.isDir()) {
           long lastModified = fileStatus.getModificationTime();
@@ -469,150 +436,134 @@ public class HDFSRepositoryConnector ext
   public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities, DocumentSpecification spec, boolean[] scanOnly)
     throws ManifoldCFException, ServiceInterruption {
     for (int i = 0; i < documentIdentifiers.length; i++) {
-      long startTime = System.currentTimeMillis();
-      String errorCode = "FAILED";
-      String errorDesc = StringUtils.EMPTY;
-      long fileSize = 0;
-      boolean doLog = false;
       String version = versions[i];
       String documentIdentifier = documentIdentifiers[i];
         
-      try {
-        if (Logging.connectors.isDebugEnabled()) {
-          Logging.connectors.debug("HDFS: Processing document identifier '" + documentIdentifier + "'");
-        }
-        getSession();
-        GetObjectThread objt = new GetObjectThread(documentIdentifier);
-        try {
-          objt.start();
-          objt.finishUp();
-        } catch (InterruptedException e) {
-          objt.interrupt();
-          throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
-            ManifoldCFException.INTERRUPTED);
-        }
+      if (Logging.connectors.isDebugEnabled()) {
+        Logging.connectors.debug("HDFS: Processing document identifier '" + documentIdentifier + "'");
+      }
+      FileStatus fileStatus = getObject(new Path(documentIdentifier));
         
-        FileStatus fileStatus = objt.getResponse();
+      if (fileStatus == null) {
+        // It is no longer there, so delete right away
+        activities.deleteDocument(documentIdentifier,version);
+        continue;
+      }
         
-        if (fileStatus == null) {
-        	continue;
+      if (fileStatus.isDir()) {
+        /*
+          * Queue up stuff for directory
+          */
+        String entityReference = documentIdentifier;
+        FileStatus[] fileStatuses = getChildren(fileStatus.getPath());
+        if (fileStatuses == null) {
+          // Directory was deleted, so remove
+          activities.deleteDocument(documentIdentifier,version);
+          continue;
         }
-        
-        if (fileStatus.isDir()) {
-          /*
-           * Queue up stuff for directory
-           */
-          String entityReference = documentIdentifier;
-          try {
-            FileStatus[] fileStatuses = session.getFileSystem().listStatus(fileStatus.getPath());
-            if (fileStatuses != null) {
-              int j = 0;
-              while (j < fileStatuses.length) {
-                FileStatus fs = fileStatuses[j++];
-                String canonicalPath = fs.getPath().toString();
-                if (checkInclude(session.getFileSystem().getUri().toString(),fs,canonicalPath,spec)) {
-                  activities.addDocumentReference(canonicalPath,documentIdentifier,RELATIONSHIP_CHILD);
-                }
-              }
-            }
-          } catch (IOException e) {
-            errorCode = "IO ERROR";
-            errorDesc = e.getMessage();
-            throw new ManifoldCFException("IO Error: "+e.getMessage(),e);
-          } finally {
-            activities.recordActivity(new Long(startTime),ACTIVITY_READ,null,entityReference,errorCode,errorDesc,null);
+        for (int j = 0; j < fileStatuses.length; j++) {
+          FileStatus fs = fileStatuses[j++];
+          String canonicalPath = fs.getPath().toString();
+          if (checkInclude(session.getUri().toString(),fs,canonicalPath,spec)) {
+            activities.addDocumentReference(canonicalPath,documentIdentifier,RELATIONSHIP_CHILD);
           }
-        } else {
-          /*
-           * its a file
-           */
-          if (!scanOnly[i]) {
-            doLog = true;
-            if (!checkIngest(session.getFileSystem().getUri().toString(),fileStatus,spec)) {
-              continue;
-            }
+        }
+      } else {
+        if (scanOnly[i])
+          continue;
+        if (!checkIngest(session.getUri().toString(),fileStatus,spec))
+          continue;
 
-            /*
-             * get filepathtouri value
-             */
-            String convertPath = null;
-            if (version.length() > 0 && version.startsWith("+"))
-            {
-              StringBuilder unpack = new StringBuilder();
-              unpack(unpack, version, 1, '+');
-              convertPath = unpack.toString();
-            }
+        // Get the WGet conversion path out of the version string
+        String convertPath = null;
+        if (version.length() > 0 && version.startsWith("+"))
+        {
+          StringBuilder unpack = new StringBuilder();
+          unpack(unpack, version, 1, '+');
+          convertPath = unpack.toString();
+        }
 
-            // Length in bytes
-            fileSize = fileStatus.getLen();
-            
-            RepositoryDocument data = new RepositoryDocument();
+        // It is a file to be indexed.
+        
+        // Prepare the metadata part of RepositoryDocument
+        RepositoryDocument data = new RepositoryDocument();
 
-            data.setFileName(fileStatus.getPath().getName());
-            data.setMimeType(mapExtensionToMimeType(fileStatus.getPath().getName()));
-            data.setModifiedDate(new Date(fileStatus.getModificationTime()));
-
-            String uri;
-            if (convertPath != null) {
-              uri = convertToWGETURI(convertPath);
-            } else {
-              uri = fileStatus.getPath().toUri().toString();
-            }
-            data.addField("uri",uri);
+        data.setFileName(fileStatus.getPath().getName());
+        data.setMimeType(mapExtensionToMimeType(fileStatus.getPath().getName()));
+        data.setModifiedDate(new Date(fileStatus.getModificationTime()));
+
+        String uri;
+        if (convertPath != null) {
+          uri = convertToWGETURI(convertPath);
+        } else {
+          uri = fileStatus.getPath().toUri().toString();
+        }
+        data.addField("uri",uri);
+
+        // Make sure we have a session
+        getSession();
+
+        // We will record document fetch as an activity
+        long startTime = System.currentTimeMillis();
+        String errorCode = "FAILED";
+        String errorDesc = StringUtils.EMPTY;
+        long fileSize = 0;
 
-            getSession();
-            BackgroundStreamThread t = new BackgroundStreamThread(documentIdentifier);
+        try {
+          BackgroundStreamThread t = new BackgroundStreamThread(new Path(documentIdentifier));
+          try {
+            t.start();
+            boolean wasInterrupted = false;
             try {
-              t.start();
-              boolean wasInterrupted = false;
+              InputStream is = t.getSafeInputStream();
               try {
-                InputStream is = t.getSafeInputStream();
-                try {
-                  data.setBinary(is, fileSize);
-                  activities.ingestDocument(documentIdentifier,version,uri,data);
-                } finally {
-                  is.close();
-                }
-              } catch (java.net.SocketTimeoutException e) {
-                throw e;
-              } catch (InterruptedIOException e) {
-                wasInterrupted = true;
-                throw e;
-              } catch (ManifoldCFException e) {
-                if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
-                  wasInterrupted = true;
-                }
-                throw e;
+                data.setBinary(is, fileStatus.getLen()); // fileSize is still 0 here; it is only set once the fetch succeeds
+                activities.ingestDocument(documentIdentifier,version,uri,data);
               } finally {
-                if (!wasInterrupted) {
-                  // This does a join
-                  t.finishUp();
-                }
+                is.close();
               }
-
-              // No errors.  Record the fact that we made it.
-              errorCode = "OK";
-            } catch (InterruptedException e) {
-              // We were interrupted out of the join, most likely.  Before we abandon the thread,
-              // send a courtesy interrupt.
-              t.interrupt();
-              throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
             } catch (java.net.SocketTimeoutException e) {
-              errorCode = "IO ERROR";
-              errorDesc = e.getMessage();
-              handleIOException(e);
+              throw e;
             } catch (InterruptedIOException e) {
-              t.interrupt();
-              throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
-            } catch (IOException e) {
-              errorCode = "IO ERROR";
-              errorDesc = e.getMessage();
-              handleIOException(e);
+              wasInterrupted = true;
+              throw e;
+            } catch (ManifoldCFException e) {
+              if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
+                wasInterrupted = true;
+              }
+              throw e;
+            } finally {
+              if (!wasInterrupted) {
+                // This does a join
+                t.finishUp();
+              }
             }
+
+            // No errors.  Record the fact that we made it.
+            errorCode = "OK";
+            // Length we did in bytes
+            fileSize = fileStatus.getLen();
+
+          } catch (InterruptedException e) {
+            // We were interrupted out of the join, most likely.  Before we abandon the thread,
+            // send a courtesy interrupt.
+            t.interrupt();
+            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+          } catch (java.net.SocketTimeoutException e) {
+            errorCode = "IO ERROR";
+            errorDesc = e.getMessage();
+            handleIOException(e);
+          } catch (InterruptedIOException e) {
+            t.interrupt();
+            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+          } catch (IOException e) {
+            errorCode = "IO ERROR";
+            errorDesc = e.getMessage();
+            handleIOException(e);
           }
+        } finally {
+          activities.recordActivity(new Long(startTime),ACTIVITY_READ,new Long(fileSize),documentIdentifier,errorCode,errorDesc,null);
         }
-      } finally {
-        activities.recordActivity(new Long(startTime),ACTIVITY_READ,new Long(fileSize),documentIdentifier,errorCode,errorDesc,null);
       }
     }
   }
@@ -643,11 +594,25 @@ public class HDFSRepositoryConnector ext
 "<!--\n"+
 "function checkConfigForSave()\n"+
 "{\n"+
-"  if (editconnection.namenode.value == \"\")\n"+
+"  if (editconnection.namenodehost.value == \"\")\n"+
+"  {\n"+
+"    alert(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.NameNodeHostCannotBeNull")+"\");\n"+
+"    SelectTab(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.ServerTabName")+"\");\n"+
+"    editconnection.namenodehost.focus();\n"+
+"    return false;\n"+
+"  }\n"+
+"  if (editconnection.namenodeport.value == \"\")\n"+
+"  {\n"+
+"    alert(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.NameNodePortCannotBeNull")+"\");\n"+
+"    SelectTab(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.ServerTabName")+"\");\n"+
+"    editconnection.namenodeport.focus();\n"+
+"    return false;\n"+
+"  }\n"+
+"  if (!isInteger(editconnection.namenodeport.value))\n"+
 "  {\n"+
-"    alert(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.NameNodeURICannotBeNull")+"\");\n"+
+"    alert(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.NameNodePortMustBeAnInteger")+"\");\n"+
 "    SelectTab(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.ServerTabName")+"\");\n"+
-"    editconnection.namenode.focus();\n"+
+"    editconnection.namenodeport.focus();\n"+
 "    return false;\n"+
 "  }\n"+
 "  if (editconnection.user.value == \"\")\n"+
@@ -677,13 +642,19 @@ public class HDFSRepositoryConnector ext
   public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters, String tabName)
     throws ManifoldCFException, IOException
   {
-    String nameNode = parameters.getParameter("namenode");
-    if (nameNode == null) {
-    	nameNode = "hdfs://localhost:9000";
+    String nameNodeHost = parameters.getParameter("namenodehost");
+    if (nameNodeHost == null) {
+      nameNodeHost = "localhost";
+    }
+    
+    String nameNodePort = parameters.getParameter("namenodeport");
+    if (nameNodePort == null) {
+      nameNodePort = "9000";
     }
+
     String user = parameters.getParameter("user");
     if (user == null) {
-    	user = "";
+      user = "";
     }
     
     if (tabName.equals(Messages.getString(locale,"HDFSRepositoryConnector.ServerTabName")))
@@ -691,15 +662,21 @@ public class HDFSRepositoryConnector ext
       out.print(
 "<table class=\"displaytable\">\n"+
 "  <tr>\n"+
-"    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNode") + "</nobr></td>\n"+
+"    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNodeHost") + "</nobr></td>\n"+
+"    <td class=\"value\">\n"+
+"      <input name=\"namenodehost\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodeHost)+"\"/>\n"+
+"    </td>\n"+
+"  </tr>\n"+
+"  <tr>\n"+
+"    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNodePort") + "</nobr></td>\n"+
 "    <td class=\"value\">\n"+
-"      <input name=\"namenode\" type=\"text\" size=\"48\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNode)+"\"/>\n"+
+"      <input name=\"namenodeport\" type=\"text\" size=\"5\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodePort)+"\"/>\n"+
 "    </td>\n"+
 "  </tr>\n"+
 "  <tr>\n"+
 "    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.User") + "</nobr></td>\n"+
 "    <td class=\"value\">\n"+
-"      <input name=\"user\" type=\"text\" size=\"48\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(user)+"\"/>\n"+
+"      <input name=\"user\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(user)+"\"/>\n"+
 "    </td>\n"+
 "  </tr>\n"+
 "</table>\n"
@@ -709,7 +686,8 @@ public class HDFSRepositoryConnector ext
     {
       // Server tab hiddens
       out.print(
-"<input type=\"hidden\" name=\"namenode\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNode)+"\"/>\n"+
+"<input type=\"hidden\" name=\"namenodehost\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodeHost)+"\"/>\n"+
+"<input type=\"hidden\" name=\"namenodeport\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodePort)+"\"/>\n"+
 "<input type=\"hidden\" name=\"user\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(user)+"\"/>\n"
       );
     }
@@ -728,9 +706,14 @@ public class HDFSRepositoryConnector ext
   public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext, ConfigParams parameters)
     throws ManifoldCFException
   {
-    String nameNode = variableContext.getParameter("namenode");
-    if (nameNode != null) {
-      parameters.setParameter("namenode", nameNode);
+    String nameNodeHost = variableContext.getParameter("namenodehost");
+    if (nameNodeHost != null) {
+      parameters.setParameter("namenodehost", nameNodeHost);
+    }
+
+    String nameNodePort = variableContext.getParameter("namenodeport");
+    if (nameNodePort != null) {
+      parameters.setParameter("namenodeport", nameNodePort);
     }
 
     String user = variableContext.getParameter("user");
@@ -752,21 +735,19 @@ public class HDFSRepositoryConnector ext
   public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters)
     throws ManifoldCFException, IOException
   {
-    String nameNode = parameters.getParameter("namenode");
-    if (nameNode == null) {
-      nameNode = "hdfs://localhost:9000";
-    }
-    
+    String nameNodeHost = parameters.getParameter("namenodehost");
+    String nameNodePort = parameters.getParameter("namenodeport");
     String user = parameters.getParameter("user");
-    if (user == null) {
-      user = "user";
-    }
     
     out.print(
 "<table class=\"displaytable\">\n"+
 "  <tr>\n"+
-"    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNode") + "</nobr></td>\n"+
-"    <td class=\"value\">\n"+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNode)+"</td>\n"+
+"    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNodeHost") + "</nobr></td>\n"+
+"    <td class=\"value\">\n"+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodeHost)+"</td>\n"+
+"  </tr>\n"+
+"  <tr>\n"+
+"    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNodePort") + "</nobr></td>\n"+
+"    <td class=\"value\">\n"+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodePort)+"</td>\n"+
 "  </tr>\n"+
 "  <tr>\n"+
 "    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.User") + "</nobr></td>\n"+
@@ -1689,9 +1670,9 @@ public class HDFSRepositoryConnector ext
    * @throws ManifoldCFException
    * @throws ServiceInterruption
    */
-  private static void handleGeneralSecurityException(GeneralSecurityException e) throws ManifoldCFException, ServiceInterruption {
-    // Permanent problem: can't initialize transport layer
-    throw new ManifoldCFException("HDFS exception: "+e.getMessage(), e);
+  private static void handleURISyntaxException(URISyntaxException e) throws ManifoldCFException, ServiceInterruption {
+    // Permanent problem
+    throw new ManifoldCFException("HDFS bad namenode specification: "+e.getMessage(), e);
   }
 
   protected class CheckConnectionThread extends Thread {
@@ -1710,8 +1691,18 @@ public class HDFSRepositoryConnector ext
       }
     }
 
-    public Throwable getException() {
-      return exception;
+    /**
+     * Wait for this thread to terminate, then rethrow on the calling thread
+     * whatever Throwable was stored in the {@code exception} field.
+     *
+     * @throws InterruptedException if the calling thread is interrupted while joining.
+     * @throws IOException if the stored Throwable is an IOException.
+     * @throws RuntimeException if the stored Throwable is a RuntimeException, or
+     *         wrapping any other checked Throwable that is not an IOException.
+     */
+    public void finishUp() throws InterruptedException, IOException {
+      join();
+      Throwable thr = exception;
+      if (thr != null) {
+        if (thr instanceof IOException) {
+          throw (IOException) thr;
+        } else if (thr instanceof RuntimeException) {
+          throw (RuntimeException) thr;
+        } else if (thr instanceof Error) {
+          throw (Error) thr;
+        } else {
+          // Any other checked Throwable: wrap it instead of blind-casting to Error,
+          // which would fail with ClassCastException and hide the real cause.
+          // Same handling as the other finishUp() methods in this class.
+          throw new RuntimeException("Unhandled exception of type: "+thr.getClass().getName(),thr);
+        }
+      }
     }
   }
 
@@ -1726,7 +1717,7 @@ public class HDFSRepositoryConnector ext
     public void run() {
       try {
         // Create a session
-        session = new HDFSSession(nameNode, config, user);
+        session = new HDFSSession(makeNameNodeURI(nameNodeHost,nameNodePort), config, user);
       } catch (Throwable e) {
         this.exception = e;
       }
@@ -1737,34 +1728,46 @@ public class HDFSRepositoryConnector ext
     }
   }
 
-  protected class GetSeedsThread extends Thread {
+  /**
+   * Obtain the status entries for a path by running a GetChildrenThread and
+   * waiting for it to finish.
+   *
+   * getSession() is called first; the work itself happens on the background
+   * thread, whose run() calls session.listStatus(path).
+   *
+   * @param path the path handed to the GetChildrenThread.
+   * @return the thread's result, which is null when HDFSSession.listStatus()
+   *         caught a FileNotFoundException; null is also returned if
+   *         handleIOException() returns normally after an IOException.
+   * @throws ManifoldCFException with type INTERRUPTED if the wait on the thread
+   *         is interrupted.
+   * @throws ServiceInterruption declared here; presumably raised by getSession()
+   *         or handleIOException(), which are not visible in this hunk.
+   */
+  protected FileStatus[] getChildren(Path path)
+    throws ManifoldCFException, ServiceInterruption {
+    getSession();
+    try {
+      GetChildrenThread t = new GetChildrenThread(path);
+      try {
+        t.start();
+        // finishUp() joins the thread and rethrows anything run() captured
+        t.finishUp();
+      } catch (InterruptedException e) {
+        // Interrupt the background thread before reporting the interruption
+        t.interrupt();
+        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+      }
+      return t.getResult();
+    } catch (IOException e) {
+      handleIOException(e);
+    }
+    // Reached only if handleIOException() returned without throwing
+    return null;
+  }
+
+  protected class GetChildrenThread extends Thread {
     protected Throwable exception = null;
-    protected final String path;
-    protected final XThreadStringBuffer seedBuffer;
+    protected FileStatus[] result = null;
+    protected final Path path;
 
-    public GetSeedsThread(String path) {
+    public GetChildrenThread(Path path) {
       super();
       this.path = path;
-      this.seedBuffer = new XThreadStringBuffer();
       setDaemon(true);
     }
 
     @Override
     public void run() {
       try {
-        session.getSeeds(seedBuffer, path);
-        seedBuffer.signalDone();
+        result = session.listStatus(path);
       } catch (Throwable e) {
         this.exception = e;
       }
     }
 
-    public XThreadStringBuffer getBuffer() {
-      return seedBuffer;
-    }
-
-    public void finishUp() throws InterruptedException {
-      seedBuffer.abandon();
+    public void finishUp() throws InterruptedException, IOException {
       join();
       Throwable thr = exception;
       if (thr != null) {
@@ -1772,19 +1775,45 @@ public class HDFSRepositoryConnector ext
           throw (RuntimeException) thr;
         } else if (thr instanceof Error) {
           throw (Error) thr;
+        } else if (thr instanceof IOException) {
+          throw (IOException) thr;
         } else {
           throw new RuntimeException("Unhandled exception of type: "+thr.getClass().getName(),thr);
         }
       }
     }
+    
+    public FileStatus[] getResult() {
+      return result;
+    }
   }
 
+  /**
+   * Obtain the FileStatus for a path by running a GetObjectThread and waiting
+   * for it to finish.
+   *
+   * getSession() is called first; the lookup itself happens on the background
+   * thread (presumably via session.getObject(path) -- its run() body is not
+   * visible in this hunk).
+   *
+   * @param path the path handed to the GetObjectThread.
+   * @return the thread's response; null is also returned if handleIOException()
+   *         returns normally after an IOException.
+   * @throws ManifoldCFException with type INTERRUPTED if the wait on the thread
+   *         is interrupted.
+   * @throws ServiceInterruption declared here; presumably raised by getSession()
+   *         or handleIOException(), which are not visible in this hunk.
+   */
+  protected FileStatus getObject(Path path)
+    throws ManifoldCFException, ServiceInterruption {
+    getSession();
+    try {
+      GetObjectThread objt = new GetObjectThread(path);
+      try {
+        objt.start();
+        // finishUp() joins the thread and rethrows anything it captured
+        objt.finishUp();
+      } catch (InterruptedException e) {
+        // Interrupt the background thread before reporting the interruption
+        objt.interrupt();
+        throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+      }
+
+      return objt.getResponse();
+    } catch (IOException e) {
+      handleIOException(e);
+    }
+    // Reached only if handleIOException() returned without throwing
+    return null;
+  }
+  
   protected class GetObjectThread extends Thread {
-    protected final String nodeId;
+    protected final Path nodeId;
     protected Throwable exception = null;
     protected FileStatus response = null;
 
-    public GetObjectThread(String nodeId) {
+    public GetObjectThread(Path nodeId) {
       super();
       setDaemon(true);
       this.nodeId = nodeId;
@@ -1798,7 +1827,7 @@ public class HDFSRepositoryConnector ext
       }
     }
 
-    public void finishUp() throws InterruptedException {
+    public void finishUp() throws InterruptedException, IOException {
       join();
       Throwable thr = exception;
       if (thr != null) {
@@ -1806,6 +1835,8 @@ public class HDFSRepositoryConnector ext
           throw (RuntimeException) thr;
         } else if (thr instanceof Error) {
           throw (Error) thr;
+        } else if (thr instanceof IOException) {
+          throw (IOException) thr;
         } else {
           throw new RuntimeException("Unhandled exception of type: "+thr.getClass().getName(),thr);
         }
@@ -1816,21 +1847,18 @@ public class HDFSRepositoryConnector ext
       return response;
     }
 
-    public Throwable getException() {
-      return exception;
-    }
   }
 
   protected class BackgroundStreamThread extends Thread
   {
-    protected final String nodeId;
+    protected final Path nodeId;
     
     protected boolean abortThread = false;
     protected Throwable responseException = null;
     protected InputStream sourceStream = null;
     protected XThreadInputStream threadStream = null;
     
-    public BackgroundStreamThread(String nodeId)
+    public BackgroundStreamThread(Path nodeId)
     {
       super();
       setDaemon(true);

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java Tue Jul  2 12:34:41 2013
@@ -69,42 +69,33 @@ public class HDFSSession {
     return info;
   }
 
-  public void getSeeds(XThreadStringBuffer idBuffer, String path)
-    throws IOException, InterruptedException {
-
-    /*
-     * need to add root dir so that single files such as /file1 will still get read
-     */
-    idBuffer.add(path);
-    
-    /*
-     * gets a list of the contents of the entire folder: subfolders + files
-     */
-    FileStatus[] fileStatuses = fileSystem.listStatus(new Path(path));
-    for (FileStatus fileStatus : fileStatuses) {
-      /*
-       * only add the directories as seeds, we'll add the files later
-       */
-      if (fileStatus.isDir()) {
-        idBuffer.add(fileStatus.getPath().toUri().toString());
-      }
+  public FileStatus[] listStatus(Path path)
+    throws IOException {
+    try {
+      return fileSystem.listStatus(path);
+    } catch (FileNotFoundException e) {
+      return null;
     }
   }
   
-  public FileSystem getFileSystem() {
-	  return fileSystem;
+  public URI getUri() {
+    return fileSystem.getUri();
   }
-  
-  public FileStatus getObject(String id) throws IOException {
+
+  public FileStatus getObject(Path path) throws IOException {
     try {
-      return fileSystem.getFileStatus(new Path(id));
+      return fileSystem.getFileStatus(path);
     } catch(FileNotFoundException e) {
       return null;
     }
   }
 
-  public FSDataInputStream getFSDataInputStream(String id) throws IOException {
-    return fileSystem.open(new Path(id));
+  public FSDataInputStream getFSDataInputStream(Path path) throws IOException {
+    try {
+      return fileSystem.open(path);
+    } catch (FileNotFoundException e) {
+      return null;
+    }
   }
   
   public void close() throws IOException {

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties Tue Jul  2 12:34:41 2013
@@ -14,9 +14,12 @@
 # limitations under the License.
 
 HDFSOutputConnector.ServerTabName=Server
-HDFSOutputConnector.NameNode=Name Node:
+HDFSOutputConnector.NameNodeHost=Name node host:
+HDFSOutputConnector.NameNodePort=Name node port:
 HDFSOutputConnector.User=User:
-HDFSOutputConnector.NameNodeURICannotBeNull=Name node URI cannot be null
+HDFSOutputConnector.NameNodeHostCannotBeNull=Name node host cannot be null
+HDFSOutputConnector.NameNodePortCannotBeNull=Name node port cannot be null
+HDFSOutputConnector.NameNodePortMustBeAnInteger=Name node port must be an integer
 HDFSOutputConnector.UserCannotBeNull=User cannot be null
 
 HDFSOutputConnector.PathTabName=Output Path

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties Tue Jul  2 12:34:41 2013
@@ -14,9 +14,12 @@
 # limitations under the License.
 
 HDFSOutputConnector.ServerTabName=サーバー
-HDFSOutputConnector.NameNode=ネームノード:
+HDFSOutputConnector.NameNodeHost=Name node host:
+HDFSOutputConnector.NameNodePort=Name node port:
 HDFSOutputConnector.User=ユーザー:
-HDFSOutputConnector.NameNodeURICannotBeNull=Name node URI cannot be null
+HDFSOutputConnector.NameNodeHostCannotBeNull=Name node host cannot be null
+HDFSOutputConnector.NameNodePortCannotBeNull=Name node port cannot be null
+HDFSOutputConnector.NameNodePortMustBeAnInteger=Name node port must be an integer
 HDFSOutputConnector.UserCannotBeNull=User cannot be null
 
 HDFSOutputConnector.PathTabName=出力パス

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties Tue Jul  2 12:34:41 2013
@@ -14,9 +14,12 @@
 # limitations under the License.
 
 HDFSRepositoryConnector.ServerTabName=Server
-HDFSRepositoryConnector.NameNode=Name Node:
+HDFSRepositoryConnector.NameNodeHost=Name node host:
+HDFSRepositoryConnector.NameNodePort=Name node port:
 HDFSRepositoryConnector.User=User:
-HDFSRepositoryConnector.NameNodeURICannotBeNull=Name node URI cannot be null
+HDFSRepositoryConnector.NameNodeHostCannotBeNull=Name node host cannot be null
+HDFSRepositoryConnector.NameNodePortCannotBeNull=Name node port cannot be null
+HDFSRepositoryConnector.NameNodePortMustBeAnInteger=Name node port must be an integer
 HDFSRepositoryConnector.UserCannotBeNull=User cannot be null
 
 HDFSRepositoryConnector.Paths=Repository Paths
@@ -45,7 +48,3 @@ HDFSRepositoryConnector.InsertNewMatchFo
 HDFSRepositoryConnector.DeletePath=Delete path #
 HDFSRepositoryConnector.AddNewMatchForPath=Add new match for path #
 HDFSRepositoryConnector.AddNewPath=Add new path
-
-HDFSRepositoryConnector.FilePathToURITab=Convert file path to URI
-HDFSRepositoryConnector.FilePathToURI=Convert file path to URI:
-HDFSRepositoryConnector.FilePathToURIExample=EX) $REPOSITORY_PATH/http/localhost/index.html => http://localhost/index.html

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties Tue Jul  2 12:34:41 2013
@@ -14,9 +14,12 @@
 # limitations under the License.
 
 HDFSRepositoryConnector.ServerTabName=サーバー
-HDFSRepositoryConnector.NameNode=ネームノード:
+HDFSRepositoryConnector.NameNodeHost=Name node host:
+HDFSRepositoryConnector.NameNodePort=Name node port:
 HDFSRepositoryConnector.User=ユーザー:
-HDFSRepositoryConnector.NameNodeURICannotBeNull=Name node URI cannot be null
+HDFSRepositoryConnector.NameNodeHostCannotBeNull=Name node host cannot be null
+HDFSRepositoryConnector.NameNodePortCannotBeNull=Name node port cannot be null
+HDFSRepositoryConnector.NameNodePortMustBeAnInteger=Name node port must be an integer
 HDFSRepositoryConnector.UserCannotBeNull=User cannot be null
 
 HDFSRepositoryConnector.Paths=リポジトリパス
@@ -45,7 +48,3 @@ HDFSRepositoryConnector.InsertNewMatchFo
 HDFSRepositoryConnector.DeletePath=パスを削除: #
 HDFSRepositoryConnector.AddNewMatchForPath=パス用に新しいパターンを追加: #
 HDFSRepositoryConnector.AddNewPath=新しいパスを追加
-
-HDFSRepositoryConnector.FilePathToURITab=ファイルパスをURIへ変換する
-HDFSRepositoryConnector.FilePathToURI=ファイルパスをURIへ変換する:
-HDFSRepositoryConnector.FilePathToURIExample=例) $REPOSITORY_PATH/http/localhost/index.html => http://localhost/index.html

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html Tue Jul  2 12:34:41 2013
@@ -18,18 +18,25 @@
 #if($TABNAME == $ResourceBundle.getString('HDFSOutputConnector.ServerTabName'))
 
 <table class="displaytable">
+  <tr><td class="separator" colspan="2"><hr/></td></tr>
   <tr>
-    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNode'))</nobr></td>
-    <td class="value"><input name="namenode" type="text" value="$Encoder.attributeEscape($NAMENODE)" size="48" /></td>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodeHost'))</nobr></td>
+    <td class="value"><input name="namenodehost" type="text" value="$Encoder.attributeEscape($NAMENODEHOST)" size="32" /></td>
+  </tr>
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodePort'))</nobr></td>
+    <td class="value"><input name="namenodeport" type="text" value="$Encoder.attributeEscape($NAMENODEPORT)" size="5" /></td>
   </tr>
   <tr>
     <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.User'))</nobr></td>
-    <td class="value"><input name="user" type="text" value="$Encoder.attributeEscape($USER)" size="48" /></td>
+    <td class="value"><input name="user" type="text" value="$Encoder.attributeEscape($USER)" size="32" /></td>
   </tr>
 </table>
 
 #else
 
-<input type="hidden" name="namenode" value="$Encoder.attributeEscape($NAMENODE)" />
+<input type="hidden" name="namenodehost" value="$Encoder.attributeEscape($NAMENODEHOST)" />
+<input type="hidden" name="namenodeport" value="$Encoder.attributeEscape($NAMENODEPORT)" />
+<input type="hidden" name="user" value="$Encoder.attributeEscape($USER)" />
 
 #end

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js Tue Jul  2 12:34:41 2013
@@ -19,11 +19,25 @@
 <!--
 function checkConfigForSave()
 {
-  if (editconnection.namenode.value == "")
+  if (editconnection.namenodehost.value == "")
   {
-    alert("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodeURICannotBeNull'))");
+    alert("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodeHostCannotBeNull'))");
     SelectTab("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.ServerTabName'))");
-    editconnection.namenode.focus();
+    editconnection.namenodehost.focus();
+    return false;
+  }
+  if (editconnection.namenodeport.value == "")
+  {
+    alert("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodePortCannotBeNull'))");
+    SelectTab("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.ServerTabName'))");
+    editconnection.namenodeport.focus();
+    return false;
+  }
+  if (!isInteger(editconnection.namenodeport.value))
+  {
+    alert("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodePortMustBeAnInteger'))");
+    SelectTab("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.ServerTabName'))");
+    editconnection.namenodeport.focus();
     return false;
   }
   if (editconnection.user.value == "")

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html Tue Jul  2 12:34:41 2013
@@ -17,8 +17,12 @@
 
 <table class="displaytable">
   <tr>
-    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNode'))</nobr></td>
-    <td class="value">$Encoder.bodyEscape($NAMENODE)</td>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodeHost'))</nobr></td>
+    <td class="value">$Encoder.bodyEscape($NAMENODEHOST)</td>
+  </tr>
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodePort'))</nobr></td>
+    <td class="value">$Encoder.bodyEscape($NAMENODEPORT)</td>
   </tr>
   <tr>
     <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.User'))</nobr></td>

Modified: manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml (original)
+++ manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml Tue Jul  2 12:34:41 2013
@@ -1072,7 +1072,7 @@ curl -XGET http://localhost:9200/index/_
                        will parse file names that were created by <em>wget</em>, or by the wget-compatible File System Output Connector, and turn these back
                        into full URL's to external web content.</p>
                 <p>This connection type has no support for any kind of document security.</p>
-                <p>The file system repository connection type provides no configuration tabs beyond the standard ones.  However, please consider setting a "Maximum connections per
+                <p>The File System repository connection type provides no configuration tabs beyond the standard ones.  However, please consider setting a "Maximum connections per
                        JVM" value on the "Throttling" tab to at least one per worker thread, or 30, for best performance.</p>
                 <p>Jobs created using a file-system-type repository connection
                        have two tabs in addition to the standard repertoire: the "Hop Filters" tab, and the "Repository Paths" tab.</p>
@@ -1082,7 +1082,7 @@ curl -XGET http://localhost:9200/index/_
                 <br/><br/>
                 <figure src="images/en_US/filesystem-job-hopcount.PNG" alt="File System Connection, Hop Filters tab" width="80%"/>
                 <br/><br/>
-                <p>In the case of the file system connection type, there is only one variety of relationship between documents, which is called a "child" relationship.  If you want to
+                <p>In the case of the File System connection type, there is only one variety of relationship between documents, which is called a "child" relationship.  If you want to
                        restrict the document set by how far away a document is from the path root, enter the maximum allowed number of hops in the text box.  Leaving the box blank
                        indicates that no such filtering will take place.</p>
                 <p>On this same tab, you can tell the Framework what to do should there be changes in the distance from the root to a document.  The choice "Delete unreachable
@@ -1103,6 +1103,43 @@ curl -XGET http://localhost:9200/index/_
                        a match file specification (e.g. "*.txt"), and click the "Add" button.</p>
             </section>
             
+            <section id="hdfsrepository">
+                <title>HDFS Repository Connection (WGET sensitive)</title>
+                <p>The HDFS repository connection operates much like the File System Repository Connection, except it reads data from the Hadoop File System rather than a
+                       local disk.  It, too, is capable of understanding directories written in the manner of the Unix utility called <em>wget</em>.  In this mode, the HDFS Repository Connector
+                       will parse file names that were created by <em>wget</em>, or by the wget-compatible HDFS Output Connector, and turn these back
+                       into full URLs pointing to external web content.</p>
+                <p>This connection type has no support for any kind of document security.</p>
+                <p>The HDFS repository connection type has an additional configuration tab above and beyond the standard ones, called "Server".  This is what it looks like:</p>
+                <br/><br/>
+                <figure src="images/en_US/hdfs-repository-configure-server.PNG" alt="HDFS Connection, Server tab" width="80%"/>
+                <br/><br/>
+                <p>Enter the HDFS name node URI and the user name, then click the "Save" button.</p>
+                <p>Jobs created using an HDFS repository connection
+                       have two tabs in addition to the standard repertoire: the "Hop Filters" tab, and the "Repository Paths" tab.</p>
+                <p>The "Hop Filters" tab allows you to restrict the document set by the number of child hops from the path root.  This is what it looks like:</p>
+                <br/><br/>
+                <figure src="images/en_US/hdfs-job-hopcount.PNG" alt="HDFS Connection, Hop Filters tab" width="80%"/>
+                <br/><br/>
+                <p>In the case of the HDFS connection type, there is only one variety of relationship between documents, which is called a "child" relationship.  If you want to
+                       restrict the document set by how far away a document is from the path root, enter the maximum allowed number of hops in the text box.  Leaving the box blank
+                       indicates that no such filtering will take place.</p>
+                <p>On this same tab, you can tell the Framework what to do should there be changes in the distance from the root to a document.  The choice "Delete unreachable
+                       documents" requires the Framework to recalculate the distance to every potentially affected document whenever a change takes place.  This may require
+                       expensive bookkeeping, however, so you also have the option of ignoring such changes.  There are two varieties of this latter option - you can ignore the changes
+                       for now, with the option of turning back on the aggressive bookkeeping at a later time, or you can decide not to ever allow changes to propagate, in which case
+                       the Framework will discard the necessary bookkeeping information permanently.</p>
+                <p>The "Repository Paths" tab looks like this:</p>
+                <br/><br/>
+                <figure src="images/en_US/hdfs-job-paths.PNG" alt="HDFS Connection, Repository Paths tab" width="80%"/>
+                <br/><br/>
+                <p>This tab allows you to type in a set of paths which function as the roots of the crawl.  For each desired path, type in the path, select whether the root should
+                       behave as a WGET repository or not, and click the "Add" button to add it to the list.</p>
+                <p>Each root path has a set of rules which determines whether a document is included or not in the set for the job.  Once you have added the root path to the list, you
+                       may then add rules to it.  Each rule has a match expression, an indication of whether the rule is intended to match files or directories, and an action (include or exclude).
+                       Rules are evaluated from top to bottom, and the first rule that matches the file name is the one that is chosen.  To add a rule, select the desired pulldowns, type in 
+                       a match file specification (e.g. "*.txt"), and click the "Add" button.</p>
+            </section>
 
             <section id="rssrepository">
                 <title>Generic RSS Repository Connection</title>

Modified: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-configure-server.PNG
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-configure-server.PNG?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
Binary files - no diff available.

Added: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-hopcount.PNG
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-hopcount.PNG?rev=1498904&view=auto
==============================================================================
Binary file - no diff available.

Propchange: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-hopcount.PNG
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-paths.PNG
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-paths.PNG?rev=1498904&view=auto
==============================================================================
Binary file - no diff available.

Propchange: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-paths.PNG
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-repository-configure-server.PNG
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-repository-configure-server.PNG?rev=1498904&view=auto
==============================================================================
Binary file - no diff available.

Propchange: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-repository-configure-server.PNG
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream