You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/08 14:33:23 UTC
svn commit: r1630083 - in /manifoldcf/branches/CONNECTORS-1067/connectors:
filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/
googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/
Author: kwright
Date: Wed Oct 8 12:33:22 2014
New Revision: 1630083
URL: http://svn.apache.org/r1630083
Log:
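Update file connector and Google Drive connector to check length, URL, date, and MIME type indexability with the output connector (calling noDocument when excluded) before building the RepositoryDocument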
Modified:
manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
Modified: manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java?rev=1630083&r1=1630082&r2=1630083&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java Wed Oct 8 12:33:22 2014
@@ -330,6 +330,45 @@ public class FileConnector extends org.a
// We still need to check based on file data.
if (checkIngest(file,spec))
{
+ String fileName = file.getName();
+ Date modifiedDate = new Date(file.lastModified());
+ String mimeType = mapExtensionToMimeType(fileName);
+ String uri;
+ if (convertPath != null) {
+ // WGET-compatible input; convert back to external URI
+ uri = convertToWGETURI(convertPath);
+ } else {
+ uri = convertToURI(documentIdentifier);
+ }
+
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because length was excluded by output connector.");
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkURLIndexable(uri))
+ {
+ Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because URL was excluded by output connector.");
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkDateIndexable(modifiedDate))
+ {
+ Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because date ("+modifiedDate+") was excluded by output connector.");
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(mimeType))
+ {
+ Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because mime type ('"+mimeType+"') was excluded by output connector.");
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
long startTime = System.currentTimeMillis();
String errorCode = "OK";
String errorDesc = null;
@@ -345,17 +384,13 @@ public class FileConnector extends org.a
{
RepositoryDocument data = new RepositoryDocument();
data.setBinary(is,fileLength);
- String fileName = file.getName();
data.setFileName(fileName);
- data.setMimeType(mapExtensionToMimeType(fileName));
- data.setModifiedDate(new Date(file.lastModified()));
- String uri;
+ data.setMimeType(mimeType);
+ data.setModifiedDate(modifiedDate);
if (convertPath != null) {
// WGET-compatible input; convert back to external URI
- uri = convertToWGETURI(convertPath);
data.addField("uri",uri);
} else {
- uri = convertToURI(documentIdentifier);
data.addField("uri",file.toString());
}
// MHL for other metadata
Modified: manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java?rev=1630083&r1=1630082&r2=1630083&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java Wed Oct 8 12:33:22 2014
@@ -1090,18 +1090,51 @@ public class GoogleDriveRepositoryConnec
Logging.connectors.debug("GOOGLEDRIVE: its a file");
}
- // We always direct to the PDF except for Spreadsheets
- String documentURI = null;
- if (!googleFile.getMimeType().equals("application/vnd.google-apps.spreadsheet")) {
- documentURI = getUrl(googleFile, "application/pdf");
- } else {
- documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- }
-
// Get the file length
- Long fileLength = Objects.firstNonNull(googleFile.getFileSize(), 0L);
- if (fileLength != null) {
+ Long fileLengthLong = Objects.firstNonNull(googleFile.getFileSize(), 0L);
+ if (fileLengthLong != null) {
+ // Now do standard stuff
+ long fileLength = fileLengthLong.longValue();
+ String mimeType = googleFile.getMimeType();
+ DateTime createdDateObject = googleFile.getCreatedDate();
+ DateTime modifiedDateObject = googleFile.getModifiedDate();
+ String extension = googleFile.getFileExtension();
+ String title = googleFile.getTitle();
+ Date createdDate = (createdDateObject==null)?null:new Date(createdDateObject.getValue());
+ Date modifiedDate = (modifiedDateObject==null)?null:new Date(modifiedDateObject.getValue());
+ // We always direct to the PDF except for Spreadsheets
+ String documentURI = null;
+ if (!mimeType.equals("application/vnd.google-apps.spreadsheet")) {
+ documentURI = getUrl(googleFile, "application/pdf");
+ } else {
+ documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ }
+
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ activities.noDocument(nodeId,version);
+ continue;
+ }
+
+ if (!activities.checkURLIndexable(documentURI))
+ {
+ activities.noDocument(nodeId,version);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(mimeType))
+ {
+ activities.noDocument(nodeId,version);
+ continue;
+ }
+
+ if (!activities.checkDateIndexable(modifiedDate))
+ {
+ activities.noDocument(nodeId,version);
+ continue;
+ }
+
RepositoryDocument rd = new RepositoryDocument();
if (acls != null) {
@@ -1112,19 +1145,12 @@ public class GoogleDriveRepositoryConnec
}
}
- // Now do standard stuff
- String mimeType = googleFile.getMimeType();
- DateTime createdDate = googleFile.getCreatedDate();
- DateTime modifiedDate = googleFile.getModifiedDate();
- String extension = googleFile.getFileExtension();
- String title = googleFile.getTitle();
-
if (mimeType != null)
rd.setMimeType(mimeType);
if (createdDate != null)
- rd.setCreatedDate(new Date(createdDate.getValue()));
+ rd.setCreatedDate(createdDate);
if (modifiedDate != null)
- rd.setModifiedDate(new Date(modifiedDate.getValue()));
+ rd.setModifiedDate(modifiedDate);
if (extension != null)
{
if (title == null)