You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by mo...@apache.org on 2014/10/30 14:20:41 UTC
svn commit: r1635490 - in /manifoldcf/trunk/connectors:
googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/
gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/
hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/
Author: molgun
Date: Thu Oct 30 13:20:40 2014
New Revision: 1635490
URL: http://svn.apache.org/r1635490
Log:
CONNECTORS-1077: changes for HDFS, plus some modifications to GridFS and GoogleDrive
Modified:
manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
Modified: manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java?rev=1635490&r1=1635489&r2=1635490&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java Thu Oct 30 13:20:40 2014
@@ -1202,9 +1202,7 @@ public class GoogleDriveRepositoryConnec
// No errors. Record the fact that we made it.
fileSize = new Long(fileLength);
- if (doLog){
- activities.recordActivity(new Long(startTime), ACTIVITY_READ, fileSize, nodeId, "OK", null, null);
- }
+ errorCode = "OK";
} catch (InterruptedException e) {
t.interrupt();
throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
Modified: manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java?rev=1635490&r1=1635489&r2=1635490&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java Thu Oct 30 13:20:40 2014
@@ -535,8 +535,7 @@ public class GridFSRepositoryConnector e
}
gfs.getDB().getMongo().getConnector().close();
session = null;
- activities.recordActivity(startTime, ACTIVITY_FETCH,
- fileLenght, _id, "OK", null, null);
+ errorCode = "OK";
} else {
Logging.connectors.warn("GridFS: Document " + _id + " has a invalid URL: " + urlValue + " - skipping.");
errorCode = activities.BAD_URL;
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java?rev=1635490&r1=1635489&r2=1635490&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java Thu Oct 30 13:20:40 2014
@@ -378,6 +378,11 @@ public class HDFSRepositoryConnector ext
sb.append("-");
sb.append(new Long(lastModified).toString());
versionString = sb.toString();
+ // We will record document fetch as an activity
+ long startTime = System.currentTimeMillis();
+ String errorCode = null;
+ String errorDesc = null;
+ long fileSize = 0;
if (activities.checkDocumentNeedsReindexing(documentIdentifier,versionString)) {
// Process file!
@@ -391,106 +396,112 @@ public class HDFSRepositoryConnector ext
String fileName = fileStatus.getPath().getName();
String mimeType = mapExtensionToMimeType(fileStatus.getPath().getName());
Date modifiedDate = new Date(fileStatus.getModificationTime());
- String uri;
- if (convertPath != null) {
- uri = convertToWGETURI(convertPath);
- } else {
- uri = fileStatus.getPath().toUri().toString();
- }
+ try {
+ String uri;
+ if (convertPath != null) {
+ uri = convertToWGETURI(convertPath);
+ } else {
+ uri = fileStatus.getPath().toUri().toString();
+ }
- if (!activities.checkLengthIndexable(fileLength))
- {
- activities.noDocument(documentIdentifier,versionString);
- continue;
- }
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ errorCode = activities.EXCLUDED_LENGTH;
+ errorDesc = "Excluding document because of file length ('"+fileLength+"')";
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
- if (!activities.checkURLIndexable(uri))
- {
- activities.noDocument(documentIdentifier,versionString);
- continue;
- }
+ if (!activities.checkURLIndexable(uri))
+ {
+ errorCode = activities.EXCLUDED_URL;
+ errorDesc = "Excluding document because of URL ('"+uri+"')";
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
- if (!activities.checkMimeTypeIndexable(mimeType))
- {
- activities.noDocument(documentIdentifier,versionString);
- continue;
- }
+ if (!activities.checkMimeTypeIndexable(mimeType))
+ {
+ errorCode = activities.EXCLUDED_MIMETYPE;
+ errorDesc = "Excluding document because of mime type ("+mimeType+")";
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
- if (!activities.checkDateIndexable(modifiedDate))
- {
- activities.noDocument(documentIdentifier,versionString);
- continue;
- }
+ if (!activities.checkDateIndexable(modifiedDate))
+ {
+ errorCode = activities.EXCLUDED_DATE;
+ errorDesc = "Excluding document because of date ("+modifiedDate+")";
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
- // Prepare the metadata part of RepositoryDocument
- RepositoryDocument data = new RepositoryDocument();
+ // Prepare the metadata part of RepositoryDocument
+ RepositoryDocument data = new RepositoryDocument();
- data.setFileName(fileName);
- data.setMimeType(mimeType);
- data.setModifiedDate(modifiedDate);
-
- data.addField("uri",uri);
-
- // We will record document fetch as an activity
- long startTime = System.currentTimeMillis();
- String errorCode = "FAILED";
- String errorDesc = StringUtils.EMPTY;
- long fileSize = 0;
+ data.setFileName(fileName);
+ data.setMimeType(mimeType);
+ data.setModifiedDate(modifiedDate);
- try {
- BackgroundStreamThread t = new BackgroundStreamThread(getSession(),new Path(documentIdentifier));
- try {
- t.start();
- boolean wasInterrupted = false;
+ data.addField("uri",uri);
+
+
+
+ BackgroundStreamThread t = new BackgroundStreamThread(getSession(),new Path(documentIdentifier));
try {
- InputStream is = t.getSafeInputStream();
- try {
- data.setBinary(is, fileSize);
- activities.ingestDocumentWithException(documentIdentifier,versionString,uri,data);
- } finally {
- is.close();
- }
+ t.start();
+ boolean wasInterrupted = false;
+ try {
+ InputStream is = t.getSafeInputStream();
+ try {
+ data.setBinary(is, fileSize);
+ activities.ingestDocumentWithException(documentIdentifier,versionString,uri,data);
+ } finally {
+ is.close();
+ }
+ } catch (java.net.SocketTimeoutException e) {
+ throw e;
+ } catch (InterruptedIOException e) {
+ wasInterrupted = true;
+ throw e;
+ } catch (ManifoldCFException e) {
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
+ wasInterrupted = true;
+ }
+ throw e;
+ } finally {
+ if (!wasInterrupted) {
+ // This does a join
+ t.finishUp();
+ }
+ }
+
+ // No errors. Record the fact that we made it.
+ errorCode = "OK";
+ // Length we did in bytes
+ fileSize = fileStatus.getLen();
+
+ } catch (InterruptedException e) {
+ // We were interrupted out of the join, most likely. Before we abandon the thread,
+ // send a courtesy interrupt.
+ t.interrupt();
+ throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
} catch (java.net.SocketTimeoutException e) {
- throw e;
+ errorCode = "IOERROR";
+ errorDesc = e.getMessage();
+ handleIOException(e);
} catch (InterruptedIOException e) {
- wasInterrupted = true;
- throw e;
- } catch (ManifoldCFException e) {
- if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
- wasInterrupted = true;
- }
- throw e;
- } finally {
- if (!wasInterrupted) {
- // This does a join
- t.finishUp();
- }
+ t.interrupt();
+ throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+ } catch (IOException e) {
+ errorCode = "IOERROR";
+ errorDesc = e.getMessage();
+ handleIOException(e);
}
-
- // No errors. Record the fact that we made it.
- errorCode = "OK";
- // Length we did in bytes
- fileSize = fileStatus.getLen();
-
- } catch (InterruptedException e) {
- // We were interrupted out of the join, most likely. Before we abandon the thread,
- // send a courtesy interrupt.
- t.interrupt();
- throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
- } catch (java.net.SocketTimeoutException e) {
- errorCode = "IO ERROR";
- errorDesc = e.getMessage();
- handleIOException(e);
- } catch (InterruptedIOException e) {
- t.interrupt();
- throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
- } catch (IOException e) {
- errorCode = "IO ERROR";
- errorDesc = e.getMessage();
- handleIOException(e);
- }
} finally {
- activities.recordActivity(new Long(startTime),ACTIVITY_READ,new Long(fileSize),documentIdentifier,errorCode,errorDesc,null);
+ if(errorCode != null){
+ activities.recordActivity(new Long(startTime),ACTIVITY_READ,new Long(fileSize),documentIdentifier,errorCode,errorDesc,null);
+ }
}
}
}