You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/07/02 14:34:42 UTC
svn commit: r1498904 - in /manifoldcf/trunk: ./
connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/
connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/
connectors/hdfs/connector/src/main/...
Author: kwright
Date: Tue Jul 2 12:34:41 2013
New Revision: 1498904
URL: http://svn.apache.org/r1498904
Log:
Fix for CONNECTORS-742. Also revamped hdfs connectors to break up URI into host and port components.
Added:
manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-hopcount.PNG (with props)
manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-paths.PNG (with props)
manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-repository-configure-server.PNG (with props)
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java
manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java
manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java
manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java
manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java
manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties
manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties
manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties
manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties
manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html
manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js
manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html
manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-configure-server.PNG
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Jul 2 12:34:41 2013
@@ -3,6 +3,9 @@ $Id$
======================= 1.3-dev =====================
+CONNECTORS-742: Document HDFS connector.
+(Karl Wright)
+
CONNECTORS-741: Document HDFS output connector.
(Karl Wright)
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConfig.java Tue Jul 2 12:34:41 2013
@@ -32,8 +32,9 @@ public class HDFSOutputConfig extends HD
/** Parameters used for the configuration */
final private static ParameterEnum[] CONFIGURATIONLIST = {
- ParameterEnum.NAMENODE,
- ParameterEnum.USER
+ ParameterEnum.NAMENODEHOST,
+ ParameterEnum.NAMENODEPORT,
+ ParameterEnum.USER
};
/** Build a set of ElasticSearchParameters by reading ConfigParams. If the
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConnector.java Tue Jul 2 12:34:41 2013
@@ -130,14 +130,20 @@ public class HDFSOutputConnector extends
/** Set up a session */
protected void getSession() throws ManifoldCFException, ServiceInterruption {
- String nameNode = params.getParameter(ParameterEnum.NAMENODE.name());
- if (nameNode == null)
- throw new ManifoldCFException("Namenode must be specified");
+ String nameNodeHost = params.getParameter(ParameterEnum.NAMENODEHOST.name());
+ if (nameNodeHost == null)
+ throw new ManifoldCFException("Namenodehost must be specified");
+
+ String nameNodePort = params.getParameter(ParameterEnum.NAMENODEPORT.name());
+ if (nameNodePort == null)
+ throw new ManifoldCFException("Namenodeport must be specified");
String user = params.getParameter(ParameterEnum.USER.name());
if (user == null)
throw new ManifoldCFException("User must be specified");
+ String nameNode = "hdfs://"+nameNodeHost+":"+nameNodePort;
+
/*
* make Configuration
*/
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/HDFSOutputConstant.java Tue Jul 2 12:34:41 2013
@@ -27,8 +27,11 @@ public class HDFSOutputConstant
// Configuration parameters
- /** Name node */
- public static final String PARAM_NAMENODE = "namenode";
+ /** Name node host */
+ public static final String PARAM_NAMENODEHOST = "namenodehost";
+
+ /** Name node port */
+ public static final String PARAM_NAMENODEPORT = "namenodeport";
/** User */
public static final String PARAM_USER = "user";
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/agents/output/hdfs/ParameterEnum.java Tue Jul 2 12:34:41 2013
@@ -24,7 +24,8 @@ import java.util.Map;
/** Parameters constants */
public enum ParameterEnum {
- NAMENODE("hdfs://localhost:9000"),
+ NAMENODEHOST("localhost"),
+ NAMENODEPORT("9000"),
USER(""),
ROOTPATH("");
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java Tue Jul 2 12:34:41 2013
@@ -33,7 +33,6 @@ import org.apache.manifoldcf.core.common
import org.apache.manifoldcf.core.common.XThreadStringBuffer;
import org.apache.manifoldcf.core.extmimemap.ExtensionMimeMap;
-import java.security.GeneralSecurityException;
import java.util.*;
import java.io.*;
import java.net.URI;
@@ -55,7 +54,8 @@ public class HDFSRepositoryConnector ext
// Activities list
protected static final String[] activitiesList = new String[]{ACTIVITY_READ};
- protected String nameNode = null;
+ protected String nameNodeHost = null;
+ protected String nameNodePort = null;
protected String user = null;
protected Configuration config = null;
protected HDFSSession session = null;
@@ -122,8 +122,8 @@ public class HDFSRepositoryConnector ext
public void connect(ConfigParams configParams) {
super.connect(configParams);
- nameNode = configParams.getParameter("namenode");
-
+ nameNodeHost = configParams.getParameter("namenodehost");
+ nameNodePort = configParams.getParameter("namenodeport");
user = configParams.getParameter("user");
/*
@@ -133,7 +133,7 @@ public class HDFSRepositoryConnector ext
try {
Thread.currentThread().setContextClassLoader(org.apache.hadoop.conf.Configuration.class.getClassLoader());
config = new Configuration();
- config.set("fs.default.name", nameNode);
+ config.set("fs.default.name", makeNameNodeURI(nameNodeHost, nameNodePort));
} finally {
Thread.currentThread().setContextClassLoader(ocl);
}
@@ -144,34 +144,34 @@ public class HDFSRepositoryConnector ext
*/
@Override
public void disconnect() throws ManifoldCFException {
- if (session != null) {
- try {
- session.close();
- } catch (IOException e) {
- throw new ManifoldCFException(e);
- } finally {
- session = null;
- lastSessionFetch = -1L;
- }
- }
-
- config.clear();
+ closeSession();
config = null;
user = null;
- nameNode = null;
+ nameNodeHost = null;
+ nameNodePort = null;
super.disconnect();
}
+ protected static String makeNameNodeURI(String host, String port) {
+ return "hdfs://"+host+":"+port;
+ }
+
/**
* Set up a session
*/
protected void getSession() throws ManifoldCFException, ServiceInterruption {
if (session == null) {
- if (StringUtils.isEmpty(nameNode)) {
- throw new ManifoldCFException("Parameter namenode required but not set");
+ if (StringUtils.isEmpty(nameNodeHost)) {
+ throw new ManifoldCFException("Parameter namenodehost required but not set");
+ }
+ if (Logging.connectors.isDebugEnabled()) {
+ Logging.connectors.debug("HDFS: NameNodeHost = '" + nameNodeHost + "'");
+ }
+ if (StringUtils.isEmpty(nameNodePort)) {
+ throw new ManifoldCFException("Parameter namenodeport required but not set");
}
if (Logging.connectors.isDebugEnabled()) {
- Logging.connectors.debug("HDFS: NameNode = '" + nameNode + "'");
+ Logging.connectors.debug("HDFS: NameNodePort = '" + nameNodePort + "'");
}
if (StringUtils.isEmpty(user)) {
@@ -190,8 +190,10 @@ public class HDFSRepositoryConnector ext
if (thr != null) {
if (thr instanceof IOException) {
throw (IOException) thr;
- } else if (thr instanceof GeneralSecurityException) {
- throw (GeneralSecurityException) thr;
+ } else if (thr instanceof URISyntaxException) {
+ throw (URISyntaxException) thr;
+ } else if (thr instanceof RuntimeException) {
+ throw (RuntimeException) thr;
} else {
throw (Error) thr;
}
@@ -205,9 +207,9 @@ public class HDFSRepositoryConnector ext
} catch (InterruptedIOException e) {
t.interrupt();
throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
- } catch (GeneralSecurityException e) {
- Logging.connectors.error("HDFS: " + "General security error initializing transport: " + e.getMessage(), e);
- handleGeneralSecurityException(e);
+ } catch (URISyntaxException e) {
+ Logging.connectors.error("HDFS: URI syntax exception: " + e.getMessage(), e);
+ handleURISyntaxException(e);
} catch (IOException e) {
Logging.connectors.warn("HDFS: IO error: " + e.getMessage(), e);
handleIOException(e);
@@ -243,17 +245,7 @@ public class HDFSRepositoryConnector ext
CheckConnectionThread t = new CheckConnectionThread();
try {
t.start();
- t.join();
- Throwable thr = t.getException();
- if (thr != null) {
- if (thr instanceof IOException) {
- throw (IOException) thr;
- } else if (thr instanceof RuntimeException) {
- throw (RuntimeException) thr;
- } else {
- throw (Error) thr;
- }
- }
+ t.finishUp();
return;
} catch (InterruptedException e) {
t.interrupt();
@@ -281,15 +273,26 @@ public class HDFSRepositoryConnector ext
long currentTime = System.currentTimeMillis();
if (currentTime >= lastSessionFetch + timeToRelease) {
- if (session != null) {
- try {
- session.close();
- } catch (IOException e) {
- throw new ManifoldCFException(e);
- } finally {
- session = null;
- lastSessionFetch = -1L;
- }
+ closeSession();
+ }
+ }
+
+ protected void closeSession()
+ throws ManifoldCFException {
+ if (session != null) {
+ try {
+ // This can in theory throw an IOException, so it is possible it is doing socket
+ // communication. In practice, it's unlikely that there's any real IO, so I'm
+ // NOT putting it in a background thread for now.
+ session.close();
+ } catch (InterruptedIOException e) {
+ throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ } catch (IOException e) {
+ Logging.connectors.warn("HDFS: Error closing connection: "+e.getMessage(),e);
+ // Eat the exception
+ } finally {
+ session = null;
+ lastSessionFetch = -1L;
}
}
}
@@ -344,40 +347,12 @@ public class HDFSRepositoryConnector ext
if (sn.getType().equals("startpoint")) {
path = sn.getAttributeValue("path");
- getSession();
- GetSeedsThread t = new GetSeedsThread(path);
- try {
- t.start();
- boolean wasInterrupted = false;
- try {
- XThreadStringBuffer seedBuffer = t.getBuffer();
+ FileStatus[] statuses = getChildren(new Path(path));
- // Pick up the paths, and add them to the activities, before we join with the child thread.
- while (true) {
- // The only kind of exceptions this can throw are going to shut the process down.
- String docPath = seedBuffer.fetch();
- if (docPath == null) {
- break;
- }
- // Add the pageID to the queue
- activities.addSeedDocument(docPath);
- }
- } catch (InterruptedException e) {
- wasInterrupted = true;
- throw e;
- } catch (ManifoldCFException e) {
- if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
- wasInterrupted = true;
- }
- throw e;
- } finally {
- if (!wasInterrupted) {
- t.finishUp();
- }
+ for (FileStatus fileStatus : (statuses == null ? new FileStatus[0] : statuses)) { // getChildren may return null; treat as no children
+ if (fileStatus.isDir()) {
+ activities.addSeedDocument(fileStatus.getPath().toUri().toString());
}
- } catch (InterruptedException e) {
- t.interrupt();
- throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
}
}
i++;
@@ -408,17 +383,9 @@ public class HDFSRepositoryConnector ext
{
String[] rval = new String[documentIdentifiers.length];
for (int i = 0; i < rval.length; i++) {
- getSession();
- GetObjectThread objt = new GetObjectThread(documentIdentifiers[i]);
- try {
- objt.start();
- objt.finishUp();
- } catch (InterruptedException e) {
- objt.interrupt();
- throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
- }
-
- FileStatus fileStatus = objt.getResponse();
+ String documentIdentifier = documentIdentifiers[i];
+
+ FileStatus fileStatus = getObject(new Path(documentIdentifier));
if (fileStatus != null) {
if (fileStatus.isDir()) {
long lastModified = fileStatus.getModificationTime();
@@ -469,150 +436,134 @@ public class HDFSRepositoryConnector ext
public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities, DocumentSpecification spec, boolean[] scanOnly)
throws ManifoldCFException, ServiceInterruption {
for (int i = 0; i < documentIdentifiers.length; i++) {
- long startTime = System.currentTimeMillis();
- String errorCode = "FAILED";
- String errorDesc = StringUtils.EMPTY;
- long fileSize = 0;
- boolean doLog = false;
String version = versions[i];
String documentIdentifier = documentIdentifiers[i];
- try {
- if (Logging.connectors.isDebugEnabled()) {
- Logging.connectors.debug("HDFS: Processing document identifier '" + documentIdentifier + "'");
- }
- getSession();
- GetObjectThread objt = new GetObjectThread(documentIdentifier);
- try {
- objt.start();
- objt.finishUp();
- } catch (InterruptedException e) {
- objt.interrupt();
- throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
- ManifoldCFException.INTERRUPTED);
- }
+ if (Logging.connectors.isDebugEnabled()) {
+ Logging.connectors.debug("HDFS: Processing document identifier '" + documentIdentifier + "'");
+ }
+ FileStatus fileStatus = getObject(new Path(documentIdentifier));
- FileStatus fileStatus = objt.getResponse();
+ if (fileStatus == null) {
+ // It is no longer there, so delete right away
+ activities.deleteDocument(documentIdentifier,version);
+ continue;
+ }
- if (fileStatus == null) {
- continue;
+ if (fileStatus.isDir()) {
+ /*
+ * Queue up stuff for directory
+ */
+ String entityReference = documentIdentifier;
+ FileStatus[] fileStatuses = getChildren(fileStatus.getPath());
+ if (fileStatuses == null) {
+ // Directory was deleted, so remove
+ activities.deleteDocument(documentIdentifier,version);
+ continue;
}
-
- if (fileStatus.isDir()) {
- /*
- * Queue up stuff for directory
- */
- String entityReference = documentIdentifier;
- try {
- FileStatus[] fileStatuses = session.getFileSystem().listStatus(fileStatus.getPath());
- if (fileStatuses != null) {
- int j = 0;
- while (j < fileStatuses.length) {
- FileStatus fs = fileStatuses[j++];
- String canonicalPath = fs.getPath().toString();
- if (checkInclude(session.getFileSystem().getUri().toString(),fs,canonicalPath,spec)) {
- activities.addDocumentReference(canonicalPath,documentIdentifier,RELATIONSHIP_CHILD);
- }
- }
- }
- } catch (IOException e) {
- errorCode = "IO ERROR";
- errorDesc = e.getMessage();
- throw new ManifoldCFException("IO Error: "+e.getMessage(),e);
- } finally {
- activities.recordActivity(new Long(startTime),ACTIVITY_READ,null,entityReference,errorCode,errorDesc,null);
+ for (int j = 0; j < fileStatuses.length; j++) {
+ FileStatus fs = fileStatuses[j]; // plain index: the loop header already advances j, so no child is skipped
+ String canonicalPath = fs.getPath().toString();
+ if (checkInclude(session.getUri().toString(),fs,canonicalPath,spec)) {
+ activities.addDocumentReference(canonicalPath,documentIdentifier,RELATIONSHIP_CHILD);
}
- } else {
- /*
- * its a file
- */
- if (!scanOnly[i]) {
- doLog = true;
- if (!checkIngest(session.getFileSystem().getUri().toString(),fileStatus,spec)) {
- continue;
- }
+ }
+ } else {
+ if (scanOnly[i])
+ continue;
+ if (!checkIngest(session.getUri().toString(),fileStatus,spec))
+ continue;
- /*
- * get filepathtouri value
- */
- String convertPath = null;
- if (version.length() > 0 && version.startsWith("+"))
- {
- StringBuilder unpack = new StringBuilder();
- unpack(unpack, version, 1, '+');
- convertPath = unpack.toString();
- }
+ // Get the WGet conversion path out of the version string
+ String convertPath = null;
+ if (version.length() > 0 && version.startsWith("+"))
+ {
+ StringBuilder unpack = new StringBuilder();
+ unpack(unpack, version, 1, '+');
+ convertPath = unpack.toString();
+ }
- // Length in bytes
- fileSize = fileStatus.getLen();
-
- RepositoryDocument data = new RepositoryDocument();
+ // It is a file to be indexed.
+
+ // Prepare the metadata part of RepositoryDocument
+ RepositoryDocument data = new RepositoryDocument();
- data.setFileName(fileStatus.getPath().getName());
- data.setMimeType(mapExtensionToMimeType(fileStatus.getPath().getName()));
- data.setModifiedDate(new Date(fileStatus.getModificationTime()));
-
- String uri;
- if (convertPath != null) {
- uri = convertToWGETURI(convertPath);
- } else {
- uri = fileStatus.getPath().toUri().toString();
- }
- data.addField("uri",uri);
+ data.setFileName(fileStatus.getPath().getName());
+ data.setMimeType(mapExtensionToMimeType(fileStatus.getPath().getName()));
+ data.setModifiedDate(new Date(fileStatus.getModificationTime()));
+
+ String uri;
+ if (convertPath != null) {
+ uri = convertToWGETURI(convertPath);
+ } else {
+ uri = fileStatus.getPath().toUri().toString();
+ }
+ data.addField("uri",uri);
+
+ // Make sure we have a session
+ getSession();
+
+ // We will record document fetch as an activity
+ long startTime = System.currentTimeMillis();
+ String errorCode = "FAILED";
+ String errorDesc = StringUtils.EMPTY;
+ long fileSize = 0;
- getSession();
- BackgroundStreamThread t = new BackgroundStreamThread(documentIdentifier);
+ try {
+ BackgroundStreamThread t = new BackgroundStreamThread(new Path(documentIdentifier));
+ try {
+ t.start();
+ boolean wasInterrupted = false;
try {
- t.start();
- boolean wasInterrupted = false;
+ InputStream is = t.getSafeInputStream();
try {
- InputStream is = t.getSafeInputStream();
- try {
- data.setBinary(is, fileSize);
- activities.ingestDocument(documentIdentifier,version,uri,data);
- } finally {
- is.close();
- }
- } catch (java.net.SocketTimeoutException e) {
- throw e;
- } catch (InterruptedIOException e) {
- wasInterrupted = true;
- throw e;
- } catch (ManifoldCFException e) {
- if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
- wasInterrupted = true;
- }
- throw e;
+ data.setBinary(is, fileStatus.getLen()); // use the real length: fileSize is still 0 here and is only set after a successful ingest
+ activities.ingestDocument(documentIdentifier,version,uri,data);
} finally {
- if (!wasInterrupted) {
- // This does a join
- t.finishUp();
- }
+ is.close();
}
-
- // No errors. Record the fact that we made it.
- errorCode = "OK";
- } catch (InterruptedException e) {
- // We were interrupted out of the join, most likely. Before we abandon the thread,
- // send a courtesy interrupt.
- t.interrupt();
- throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
} catch (java.net.SocketTimeoutException e) {
- errorCode = "IO ERROR";
- errorDesc = e.getMessage();
- handleIOException(e);
+ throw e;
} catch (InterruptedIOException e) {
- t.interrupt();
- throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
- } catch (IOException e) {
- errorCode = "IO ERROR";
- errorDesc = e.getMessage();
- handleIOException(e);
+ wasInterrupted = true;
+ throw e;
+ } catch (ManifoldCFException e) {
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
+ wasInterrupted = true;
+ }
+ throw e;
+ } finally {
+ if (!wasInterrupted) {
+ // This does a join
+ t.finishUp();
+ }
}
+
+ // No errors. Record the fact that we made it.
+ errorCode = "OK";
+ // Length we did in bytes
+ fileSize = fileStatus.getLen();
+
+ } catch (InterruptedException e) {
+ // We were interrupted out of the join, most likely. Before we abandon the thread,
+ // send a courtesy interrupt.
+ t.interrupt();
+ throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+ } catch (java.net.SocketTimeoutException e) {
+ errorCode = "IO ERROR";
+ errorDesc = e.getMessage();
+ handleIOException(e);
+ } catch (InterruptedIOException e) {
+ t.interrupt();
+ throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+ } catch (IOException e) {
+ errorCode = "IO ERROR";
+ errorDesc = e.getMessage();
+ handleIOException(e);
}
+ } finally {
+ activities.recordActivity(new Long(startTime),ACTIVITY_READ,new Long(fileSize),documentIdentifier,errorCode,errorDesc,null);
}
- } finally {
- activities.recordActivity(new Long(startTime),ACTIVITY_READ,new Long(fileSize),documentIdentifier,errorCode,errorDesc,null);
}
}
}
@@ -643,11 +594,25 @@ public class HDFSRepositoryConnector ext
"<!--\n"+
"function checkConfigForSave()\n"+
"{\n"+
-" if (editconnection.namenode.value == \"\")\n"+
+" if (editconnection.namenodehost.value == \"\")\n"+
+" {\n"+
+" alert(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.NameNodeHostCannotBeNull")+"\");\n"+
+" SelectTab(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.ServerTabName")+"\");\n"+
+" editconnection.namenodehost.focus();\n"+
+" return false;\n"+
+" }\n"+
+" if (editconnection.namenodeport.value == \"\")\n"+
+" {\n"+
+" alert(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.NameNodePortCannotBeNull")+"\");\n"+
+" SelectTab(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.ServerTabName")+"\");\n"+
+" editconnection.namenodeport.focus();\n"+
+" return false;\n"+
+" }\n"+
+" if (!isInteger(editconnection.namenodeport.value))\n"+
" {\n"+
-" alert(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.NameNodeURICannotBeNull")+"\");\n"+
+" alert(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.NameNodePortMustBeAnInteger")+"\");\n"+
" SelectTab(\""+Messages.getBodyJavascriptString(locale,"HDFSRepositoryConnector.ServerTabName")+"\");\n"+
-" editconnection.namenode.focus();\n"+
+" editconnection.namenodeport.focus();\n"+
" return false;\n"+
" }\n"+
" if (editconnection.user.value == \"\")\n"+
@@ -677,13 +642,19 @@ public class HDFSRepositoryConnector ext
public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters, String tabName)
throws ManifoldCFException, IOException
{
- String nameNode = parameters.getParameter("namenode");
- if (nameNode == null) {
- nameNode = "hdfs://localhost:9000";
+ String nameNodeHost = parameters.getParameter("namenodehost");
+ if (nameNodeHost == null) {
+ nameNodeHost = "localhost";
+ }
+
+ String nameNodePort = parameters.getParameter("namenodeport");
+ if (nameNodePort == null) {
+ nameNodePort = "9000";
}
+
String user = parameters.getParameter("user");
if (user == null) {
- user = "";
+ user = "";
}
if (tabName.equals(Messages.getString(locale,"HDFSRepositoryConnector.ServerTabName")))
@@ -691,15 +662,21 @@ public class HDFSRepositoryConnector ext
out.print(
"<table class=\"displaytable\">\n"+
" <tr>\n"+
-" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNode") + "</nobr></td>\n"+
+" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNodeHost") + "</nobr></td>\n"+
+" <td class=\"value\">\n"+
+" <input name=\"namenodehost\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodeHost)+"\"/>\n"+
+" </td>\n"+
+" </tr>\n"+
+" <tr>\n"+
+" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNodePort") + "</nobr></td>\n"+
" <td class=\"value\">\n"+
-" <input name=\"namenode\" type=\"text\" size=\"48\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNode)+"\"/>\n"+
+" <input name=\"namenodeport\" type=\"text\" size=\"5\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodePort)+"\"/>\n"+
" </td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.User") + "</nobr></td>\n"+
" <td class=\"value\">\n"+
-" <input name=\"user\" type=\"text\" size=\"48\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(user)+"\"/>\n"+
+" <input name=\"user\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(user)+"\"/>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
@@ -709,7 +686,8 @@ public class HDFSRepositoryConnector ext
{
// Server tab hiddens
out.print(
-"<input type=\"hidden\" name=\"namenode\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNode)+"\"/>\n"+
+"<input type=\"hidden\" name=\"namenodehost\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodeHost)+"\"/>\n"+
+"<input type=\"hidden\" name=\"namenodeport\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodePort)+"\"/>\n"+
"<input type=\"hidden\" name=\"user\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(user)+"\"/>\n"
);
}
@@ -728,9 +706,14 @@ public class HDFSRepositoryConnector ext
public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext, ConfigParams parameters)
throws ManifoldCFException
{
- String nameNode = variableContext.getParameter("namenode");
- if (nameNode != null) {
- parameters.setParameter("namenode", nameNode);
+ String nameNodeHost = variableContext.getParameter("namenodehost");
+ if (nameNodeHost != null) {
+ parameters.setParameter("namenodehost", nameNodeHost);
+ }
+
+ String nameNodePort = variableContext.getParameter("namenodeport");
+ if (nameNodePort != null) {
+ parameters.setParameter("namenodeport", nameNodePort);
}
String user = variableContext.getParameter("user");
@@ -752,21 +735,19 @@ public class HDFSRepositoryConnector ext
public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters)
throws ManifoldCFException, IOException
{
- String nameNode = parameters.getParameter("namenode");
- if (nameNode == null) {
- nameNode = "hdfs://localhost:9000";
- }
-
+ String nameNodeHost = parameters.getParameter("namenodehost");
+ String nameNodePort = parameters.getParameter("namenodeport");
String user = parameters.getParameter("user");
- if (user == null) {
- user = "user";
- }
out.print(
"<table class=\"displaytable\">\n"+
" <tr>\n"+
-" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNode") + "</nobr></td>\n"+
-" <td class=\"value\">\n"+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNode)+"</td>\n"+
+" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNodeHost") + "</nobr></td>\n"+
+" <td class=\"value\">\n"+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodeHost)+"</td>\n"+
+" </tr>\n"+
+" <tr>\n"+
+" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.NameNodePort") + "</nobr></td>\n"+
+" <td class=\"value\">\n"+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nameNodePort)+"</td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"HDFSRepositoryConnector.User") + "</nobr></td>\n"+
@@ -1689,9 +1670,9 @@ public class HDFSRepositoryConnector ext
* @throws ManifoldCFException
* @throws ServiceInterruption
*/
- private static void handleGeneralSecurityException(GeneralSecurityException e) throws ManifoldCFException, ServiceInterruption {
- // Permanent problem: can't initialize transport layer
- throw new ManifoldCFException("HDFS exception: "+e.getMessage(), e);
+ private static void handleURISyntaxException(URISyntaxException e) throws ManifoldCFException, ServiceInterruption {
+ // Permanent problem
+ throw new ManifoldCFException("HDFS bad namenode specification: "+e.getMessage(), e);
}
protected class CheckConnectionThread extends Thread {
@@ -1710,8 +1691,18 @@ public class HDFSRepositoryConnector ext
}
}
- public Throwable getException() {
- return exception;
+ public void finishUp() throws InterruptedException, IOException {
+ join();
+ Throwable thr = exception;
+ if (thr != null) {
+ if (thr instanceof IOException) {
+ throw (IOException) thr;
+ } else if (thr instanceof RuntimeException) {
+ throw (RuntimeException) thr;
+ } else {
+ throw (Error) thr;
+ }
+ }
}
}
@@ -1726,7 +1717,7 @@ public class HDFSRepositoryConnector ext
public void run() {
try {
// Create a session
- session = new HDFSSession(nameNode, config, user);
+ session = new HDFSSession(makeNameNodeURI(nameNodeHost,nameNodePort), config, user);
} catch (Throwable e) {
this.exception = e;
}
@@ -1737,34 +1728,46 @@ public class HDFSRepositoryConnector ext
}
}
- protected class GetSeedsThread extends Thread {
+ protected FileStatus[] getChildren(Path path)
+ throws ManifoldCFException, ServiceInterruption {
+ getSession();
+ try {
+ GetChildrenThread t = new GetChildrenThread(path);
+ try {
+ t.start();
+ t.finishUp();
+ } catch (InterruptedException e) {
+ t.interrupt();
+ throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+ }
+ return t.getResult();
+ } catch (IOException e) {
+ handleIOException(e);
+ }
+ return null;
+ }
+
+ protected class GetChildrenThread extends Thread {
protected Throwable exception = null;
- protected final String path;
- protected final XThreadStringBuffer seedBuffer;
+ protected FileStatus[] result = null;
+ protected final Path path;
- public GetSeedsThread(String path) {
+ public GetChildrenThread(Path path) {
super();
this.path = path;
- this.seedBuffer = new XThreadStringBuffer();
setDaemon(true);
}
@Override
public void run() {
try {
- session.getSeeds(seedBuffer, path);
- seedBuffer.signalDone();
+ result = session.listStatus(path);
} catch (Throwable e) {
this.exception = e;
}
}
- public XThreadStringBuffer getBuffer() {
- return seedBuffer;
- }
-
- public void finishUp() throws InterruptedException {
- seedBuffer.abandon();
+ public void finishUp() throws InterruptedException, IOException {
join();
Throwable thr = exception;
if (thr != null) {
@@ -1772,19 +1775,45 @@ public class HDFSRepositoryConnector ext
throw (RuntimeException) thr;
} else if (thr instanceof Error) {
throw (Error) thr;
+ } else if (thr instanceof IOException) {
+ throw (IOException) thr;
} else {
throw new RuntimeException("Unhandled exception of type: "+thr.getClass().getName(),thr);
}
}
}
+
+ public FileStatus[] getResult() {
+ return result;
+ }
}
+ protected FileStatus getObject(Path path)
+ throws ManifoldCFException, ServiceInterruption {
+ getSession();
+ try {
+ GetObjectThread objt = new GetObjectThread(path);
+ try {
+ objt.start();
+ objt.finishUp();
+ } catch (InterruptedException e) {
+ objt.interrupt();
+ throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+ }
+
+ return objt.getResponse();
+ } catch (IOException e) {
+ handleIOException(e);
+ }
+ return null;
+ }
+
protected class GetObjectThread extends Thread {
- protected final String nodeId;
+ protected final Path nodeId;
protected Throwable exception = null;
protected FileStatus response = null;
- public GetObjectThread(String nodeId) {
+ public GetObjectThread(Path nodeId) {
super();
setDaemon(true);
this.nodeId = nodeId;
@@ -1798,7 +1827,7 @@ public class HDFSRepositoryConnector ext
}
}
- public void finishUp() throws InterruptedException {
+ public void finishUp() throws InterruptedException, IOException {
join();
Throwable thr = exception;
if (thr != null) {
@@ -1806,6 +1835,8 @@ public class HDFSRepositoryConnector ext
throw (RuntimeException) thr;
} else if (thr instanceof Error) {
throw (Error) thr;
+ } else if (thr instanceof IOException) {
+ throw (IOException) thr;
} else {
throw new RuntimeException("Unhandled exception of type: "+thr.getClass().getName(),thr);
}
@@ -1816,21 +1847,18 @@ public class HDFSRepositoryConnector ext
return response;
}
- public Throwable getException() {
- return exception;
- }
}
protected class BackgroundStreamThread extends Thread
{
- protected final String nodeId;
+ protected final Path nodeId;
protected boolean abortThread = false;
protected Throwable responseException = null;
protected InputStream sourceStream = null;
protected XThreadInputStream threadStream = null;
- public BackgroundStreamThread(String nodeId)
+ public BackgroundStreamThread(Path nodeId)
{
super();
setDaemon(true);
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSSession.java Tue Jul 2 12:34:41 2013
@@ -69,42 +69,33 @@ public class HDFSSession {
return info;
}
- public void getSeeds(XThreadStringBuffer idBuffer, String path)
- throws IOException, InterruptedException {
-
- /*
- * need to add root dir so that single files such as /file1 will still get read
- */
- idBuffer.add(path);
-
- /*
- * gets a list of the contents of the entire folder: subfolders + files
- */
- FileStatus[] fileStatuses = fileSystem.listStatus(new Path(path));
- for (FileStatus fileStatus : fileStatuses) {
- /*
- * only add the directories as seeds, we'll add the files later
- */
- if (fileStatus.isDir()) {
- idBuffer.add(fileStatus.getPath().toUri().toString());
- }
+ public FileStatus[] listStatus(Path path)
+ throws IOException {
+ try {
+ return fileSystem.listStatus(path);
+ } catch (FileNotFoundException e) {
+ return null;
}
}
- public FileSystem getFileSystem() {
- return fileSystem;
+ public URI getUri() {
+ return fileSystem.getUri();
}
-
- public FileStatus getObject(String id) throws IOException {
+
+ public FileStatus getObject(Path path) throws IOException {
try {
- return fileSystem.getFileStatus(new Path(id));
+ return fileSystem.getFileStatus(path);
} catch(FileNotFoundException e) {
return null;
}
}
- public FSDataInputStream getFSDataInputStream(String id) throws IOException {
- return fileSystem.open(new Path(id));
+ public FSDataInputStream getFSDataInputStream(Path path) throws IOException {
+ try {
+ return fileSystem.open(path);
+ } catch (FileNotFoundException e) {
+ return null;
+ }
}
public void close() throws IOException {
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_en_US.properties Tue Jul 2 12:34:41 2013
@@ -14,9 +14,12 @@
# limitations under the License.
HDFSOutputConnector.ServerTabName=Server
-HDFSOutputConnector.NameNode=Name Node:
+HDFSOutputConnector.NameNodeHost=Name node host:
+HDFSOutputConnector.NameNodePort=Name node port:
HDFSOutputConnector.User=User:
-HDFSOutputConnector.NameNodeURICannotBeNull=Name node URI cannot be null
+HDFSOutputConnector.NameNodeHostCannotBeNull=Name node host cannot be null
+HDFSOutputConnector.NameNodePortCannotBeNull=Name node port cannot be null
+HDFSOutputConnector.NameNodePortMustBeAnInteger=Name node port must be an integer
HDFSOutputConnector.UserCannotBeNull=User cannot be null
HDFSOutputConnector.PathTabName=Output Path
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/hdfs/common_ja_JP.properties Tue Jul 2 12:34:41 2013
@@ -14,9 +14,12 @@
# limitations under the License.
HDFSOutputConnector.ServerTabName=サーバー
-HDFSOutputConnector.NameNode=ネームノード：
+HDFSOutputConnector.NameNodeHost=Name node host:
+HDFSOutputConnector.NameNodePort=Name node port:
HDFSOutputConnector.User=ユーザー:
-HDFSOutputConnector.NameNodeURICannotBeNull=Name node URI cannot be null
+HDFSOutputConnector.NameNodeHostCannotBeNull=Name node host cannot be null
+HDFSOutputConnector.NameNodePortCannotBeNull=Name node port cannot be null
+HDFSOutputConnector.NameNodePortMustBeAnInteger=Name node port must be an integer
HDFSOutputConnector.UserCannotBeNull=User cannot be null
HDFSOutputConnector.PathTabName=出力パス
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_en_US.properties Tue Jul 2 12:34:41 2013
@@ -14,9 +14,12 @@
# limitations under the License.
HDFSRepositoryConnector.ServerTabName=Server
-HDFSRepositoryConnector.NameNode=Name Node:
+HDFSRepositoryConnector.NameNodeHost=Name node host:
+HDFSRepositoryConnector.NameNodePort=Name node port:
HDFSRepositoryConnector.User=User:
-HDFSRepositoryConnector.NameNodeURICannotBeNull=Name node URI cannot be null
+HDFSRepositoryConnector.NameNodeHostCannotBeNull=Name node host cannot be null
+HDFSRepositoryConnector.NameNodePortCannotBeNull=Name node port cannot be null
+HDFSRepositoryConnector.NameNodePortMustBeAnInteger=Name node port must be an integer
HDFSRepositoryConnector.UserCannotBeNull=User cannot be null
HDFSRepositoryConnector.Paths=Repository Paths
@@ -45,7 +48,3 @@ HDFSRepositoryConnector.InsertNewMatchFo
HDFSRepositoryConnector.DeletePath=Delete path #
HDFSRepositoryConnector.AddNewMatchForPath=Add new match for path #
HDFSRepositoryConnector.AddNewPath=Add new path
-
-HDFSRepositoryConnector.FilePathToURITab=Convert file path to URI
-HDFSRepositoryConnector.FilePathToURI=Convert file path to URI:
-HDFSRepositoryConnector.FilePathToURIExample=EX) $REPOSITORY_PATH/http/localhost/index.html => http://localhost/index.html
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/hdfs/common_ja_JP.properties Tue Jul 2 12:34:41 2013
@@ -14,9 +14,12 @@
# limitations under the License.
HDFSRepositoryConnector.ServerTabName=サーバー
-HDFSRepositoryConnector.NameNode=ネームノード：
+HDFSRepositoryConnector.NameNodeHost=Name node host:
+HDFSRepositoryConnector.NameNodePort=Name node port:
HDFSRepositoryConnector.User=ユーザー：
-HDFSRepositoryConnector.NameNodeURICannotBeNull=Name node URI cannot be null
+HDFSRepositoryConnector.NameNodeHostCannotBeNull=Name node host cannot be null
+HDFSRepositoryConnector.NameNodePortCannotBeNull=Name node port cannot be null
+HDFSRepositoryConnector.NameNodePortMustBeAnInteger=Name node port must be an integer
HDFSRepositoryConnector.UserCannotBeNull=User cannot be null
HDFSRepositoryConnector.Paths=リポジトリパス
@@ -45,7 +48,3 @@ HDFSRepositoryConnector.InsertNewMatchFo
HDFSRepositoryConnector.DeletePath=パスを削除： #
HDFSRepositoryConnector.AddNewMatchForPath=パス用に新しいパターンを追加： #
HDFSRepositoryConnector.AddNewPath=新しいパスを追加
-
-HDFSRepositoryConnector.FilePathToURITab=ファイルパスをURIへ変換する
-HDFSRepositoryConnector.FilePathToURI=ファイルパスをURIへ変換する：
-HDFSRepositoryConnector.FilePathToURIExample=例) $REPOSITORY_PATH/http/localhost/index.html => http://localhost/index.html
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.html Tue Jul 2 12:34:41 2013
@@ -18,18 +18,25 @@
#if($TABNAME == $ResourceBundle.getString('HDFSOutputConnector.ServerTabName'))
<table class="displaytable">
+ <tr><td class="separator" colspan="2"><hr/></td></tr>
<tr>
- <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNode'))</nobr></td>
- <td class="value"><input name="namenode" type="text" value="$Encoder.attributeEscape($NAMENODE)" size="48" /></td>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodeHost'))</nobr></td>
+ <td class="value"><input name="namenodehost" type="text" value="$Encoder.attributeEscape($NAMENODEHOST)" size="32" /></td>
+ </tr>
+ <tr>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodePort'))</nobr></td>
+ <td class="value"><input name="namenodeport" type="text" value="$Encoder.attributeEscape($NAMENODEPORT)" size="5" /></td>
</tr>
<tr>
<td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.User'))</nobr></td>
- <td class="value"><input name="user" type="text" value="$Encoder.attributeEscape($USER)" size="48" /></td>
+ <td class="value"><input name="user" type="text" value="$Encoder.attributeEscape($USER)" size="32" /></td>
</tr>
</table>
#else
-<input type="hidden" name="namenode" value="$Encoder.attributeEscape($NAMENODE)" />
+<input type="hidden" name="namenodehost" value="$Encoder.attributeEscape($NAMENODEHOST)" />
+<input type="hidden" name="namenodeport" value="$Encoder.attributeEscape($NAMENODEPORT)" />
+<input type="hidden" name="user" value="$Encoder.attributeEscape($USER)" />
#end
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/editConfiguration.js Tue Jul 2 12:34:41 2013
@@ -19,11 +19,25 @@
<!--
function checkConfigForSave()
{
- if (editconnection.namenode.value == "")
+ if (editconnection.namenodehost.value == "")
{
- alert("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodeURICannotBeNull'))");
+ alert("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodeHostCannotBeNull'))");
SelectTab("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.ServerTabName'))");
- editconnection.namenode.focus();
+ editconnection.namenodehost.focus();
+ return false;
+ }
+ if (editconnection.namenodeport.value == "")
+ {
+ alert("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodePortCannotBeNull'))");
+ SelectTab("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.ServerTabName'))");
+ editconnection.namenodeport.focus();
+ return false;
+ }
+ if (!isInteger(editconnection.namenodeport.value))
+ {
+ alert("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodePortMustBeAnInteger'))");
+ SelectTab("$Encoder.bodyJavascriptEscape($ResourceBundle.getString('HDFSOutputConnector.ServerTabName'))");
+ editconnection.namenodeport.focus();
return false;
}
if (editconnection.user.value == "")
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/resources/org/apache/manifoldcf/agents/output/hdfs/viewConfiguration.html Tue Jul 2 12:34:41 2013
@@ -17,8 +17,12 @@
<table class="displaytable">
<tr>
- <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNode'))</nobr></td>
- <td class="value">$Encoder.bodyEscape($NAMENODE)</td>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodeHost'))</nobr></td>
+ <td class="value">$Encoder.bodyEscape($NAMENODEHOST)</td>
+ </tr>
+ <tr>
+ <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.NameNodePort'))</nobr></td>
+ <td class="value">$Encoder.bodyEscape($NAMENODEPORT)</td>
</tr>
<tr>
<td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('HDFSOutputConnector.User'))</nobr></td>
Modified: manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
--- manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml (original)
+++ manifoldcf/trunk/site/src/documentation/content/xdocs/en_US/end-user-documentation.xml Tue Jul 2 12:34:41 2013
@@ -1072,7 +1072,7 @@ curl -XGET http://localhost:9200/index/_
will parse file names that were created by <em>wget</em>, or by the wget-compatible File System Output Connector, and turn these back
into full URL's to external web content.</p>
<p>This connection type has no support for any kind of document security.</p>
- <p>The file system repository connection type provides no configuration tabs beyond the standard ones. However, please consider setting a "Maximum connections per
+ <p>The File System repository connection type provides no configuration tabs beyond the standard ones. However, please consider setting a "Maximum connections per
JVM" value on the "Throttling" tab to at least one per worker thread, or 30, for best performance.</p>
<p>Jobs created using a file-system-type repository connection
have two tabs in addition to the standard repertoire: the "Hop Filters" tab, and the "Repository Paths" tab.</p>
@@ -1082,7 +1082,7 @@ curl -XGET http://localhost:9200/index/_
<br/><br/>
<figure src="images/en_US/filesystem-job-hopcount.PNG" alt="File System Connection, Hop Filters tab" width="80%"/>
<br/><br/>
- <p>In the case of the file system connection type, there is only one variety of relationship between documents, which is called a "child" relationship. If you want to
+ <p>In the case of the File System connection type, there is only one variety of relationship between documents, which is called a "child" relationship. If you want to
restrict the document set by how far away a document is from the path root, enter the maximum allowed number of hops in the text box. Leaving the box blank
indicates that no such filtering will take place.</p>
<p>On this same tab, you can tell the Framework what to do should there be changes in the distance from the root to a document. The choice "Delete unreachable
@@ -1103,6 +1103,43 @@ curl -XGET http://localhost:9200/index/_
a match file specification (e.g. "*.txt"), and click the "Add" button.</p>
</section>
+ <section id="hdfsrepository">
+ <title>HDFS Repository Connection (WGET sensitive)</title>
+ <p>The HDFS repository connection operates much like the File System Repository Connection, except it reads data from the Hadoop File System rather than a
+ local disk. It, too, is capable of understanding directories written in the manner of the Unix utility called <em>wget</em>. In the latter mode, the HDFS Repository Connector
+ will parse file names that were created by <em>wget</em>, or by the wget-compatible HDFS Output Connector, and turn these back
+ into full URLs pointing to external web content.</p>
+ <p>This connection type has no support for any kind of document security.</p>
+ <p>The HDFS repository connection type has an additional configuration tab above and beyond the standard ones, called "Server". This is what it looks like:</p>
+ <br/><br/>
+ <figure src="images/en_US/hdfs-repository-configure-server.PNG" alt="HDFS Connection, Server tab" width="80%"/>
+ <br/><br/>
+ <p>Enter the HDFS name node host and port, and the user name, and click the "Save" button.</p>
+ <p>Jobs created using an HDFS repository connection type
+ have two tabs in addition to the standard repertoire: the "Hop Filters" tab, and the "Repository Paths" tab.</p>
+ <p>The "Hop Filters" tab allows you to restrict the document set by the number of child hops from the path root. This is what it looks like:</p>
+ <br/><br/>
+ <figure src="images/en_US/hdfs-job-hopcount.PNG" alt="HDFS Connection, Hop Filters tab" width="80%"/>
+ <br/><br/>
+ <p>In the case of the HDFS connection type, there is only one variety of relationship between documents, which is called a "child" relationship. If you want to
+ restrict the document set by how far away a document is from the path root, enter the maximum allowed number of hops in the text box. Leaving the box blank
+ indicates that no such filtering will take place.</p>
+ <p>On this same tab, you can tell the Framework what to do should there be changes in the distance from the root to a document. The choice "Delete unreachable
+ documents" requires the Framework to recalculate the distance to every potentially affected document whenever a change takes place. This may require
+ expensive bookkeeping, however, so you also have the option of ignoring such changes. There are two varieties of this latter option - you can ignore the changes
+ for now, with the option of turning back on the aggressive bookkeeping at a later time, or you can decide not to ever allow changes to propagate, in which case
+ the Framework will discard the necessary bookkeeping information permanently.</p>
+ <p>The "Repository Paths" tab looks like this:</p>
+ <br/><br/>
+ <figure src="images/en_US/hdfs-job-paths.PNG" alt="HDFS Connection, Repository Paths tab" width="80%"/>
+ <br/><br/>
+ <p>This tab allows you to type in a set of paths which function as the roots of the crawl. For each desired path, type in the path, select whether the root should
+ behave as a WGET repository or not, and click the "Add" button to add it to the list.</p>
+ <p>Each root path has a set of rules which determines whether a document is included or not in the set for the job. Once you have added the root path to the list, you
+ may then add rules to it. Each rule has a match expression, an indication of whether the rule is intended to match files or directories, and an action (include or exclude).
+ Rules are evaluated from top to bottom, and the first rule that matches the file name is the one that is chosen. To add a rule, select the desired pulldowns, type in
+ a match file specification (e.g. "*.txt"), and click the "Add" button.</p>
+ </section>
<section id="rssrepository">
<title>Generic RSS Repository Connection</title>
Modified: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-configure-server.PNG
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-configure-server.PNG?rev=1498904&r1=1498903&r2=1498904&view=diff
==============================================================================
Binary files - no diff available.
Added: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-hopcount.PNG
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-hopcount.PNG?rev=1498904&view=auto
==============================================================================
Binary file - no diff available.
Propchange: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-hopcount.PNG
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-paths.PNG
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-paths.PNG?rev=1498904&view=auto
==============================================================================
Binary file - no diff available.
Propchange: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-job-paths.PNG
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-repository-configure-server.PNG
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-repository-configure-server.PNG?rev=1498904&view=auto
==============================================================================
Binary file - no diff available.
Propchange: manifoldcf/trunk/site/src/documentation/resources/images/en_US/hdfs-repository-configure-server.PNG
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream