You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/08 14:33:23 UTC

svn commit: r1630083 - in /manifoldcf/branches/CONNECTORS-1067/connectors: filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/ googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/

Author: kwright
Date: Wed Oct  8 12:33:22 2014
New Revision: 1630083

URL: http://svn.apache.org/r1630083
Log:
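Update the file system connector and the Google Drive connector to check length, URL, MIME type, and date indexability with the output connector (calling noDocument when excluded) before building the repository document.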

Modified:
    manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
    manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java

Modified: manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java?rev=1630083&r1=1630082&r2=1630083&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-1067/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java Wed Oct  8 12:33:22 2014
@@ -330,6 +330,45 @@ public class FileConnector extends org.a
             // We still need to check based on file data.
             if (checkIngest(file,spec))
             {
+              String fileName = file.getName();
+              Date modifiedDate = new Date(file.lastModified());
+              String mimeType = mapExtensionToMimeType(fileName);
+              String uri;
+              if (convertPath != null) {
+                // WGET-compatible input; convert back to external URI
+                uri = convertToWGETURI(convertPath);
+              } else {
+                uri = convertToURI(documentIdentifier);
+              }
+
+              if (!activities.checkLengthIndexable(fileLength))
+              {
+                Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because length was excluded by output connector.");
+                activities.noDocument(documentIdentifier,versionString);
+                continue;
+              }
+              
+              if (!activities.checkURLIndexable(uri))
+              {
+                Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because URL was excluded by output connector.");
+                activities.noDocument(documentIdentifier,versionString);
+                continue;
+              }
+              
+              if (!activities.checkDateIndexable(modifiedDate))
+              {
+                Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because date ("+modifiedDate+") was excluded by output connector.");
+                activities.noDocument(documentIdentifier,versionString);
+                continue;
+              }
+              
+              if (!activities.checkMimeTypeIndexable(mimeType))
+              {
+                Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because mime type ('"+mimeType+"') was excluded by output connector.");
+                activities.noDocument(documentIdentifier,versionString);
+                continue;
+              }
+              
               long startTime = System.currentTimeMillis();
               String errorCode = "OK";
               String errorDesc = null;
@@ -345,17 +384,13 @@ public class FileConnector extends org.a
                   {
                     RepositoryDocument data = new RepositoryDocument();
                     data.setBinary(is,fileLength);
-                    String fileName = file.getName();
                     data.setFileName(fileName);
-                    data.setMimeType(mapExtensionToMimeType(fileName));
-                    data.setModifiedDate(new Date(file.lastModified()));
-                    String uri;
+                    data.setMimeType(mimeType);
+                    data.setModifiedDate(modifiedDate);
                     if (convertPath != null) {
                       // WGET-compatible input; convert back to external URI
-                      uri = convertToWGETURI(convertPath);
                       data.addField("uri",uri);
                     } else {
-                      uri = convertToURI(documentIdentifier);
                       data.addField("uri",file.toString());
                     }
                     // MHL for other metadata

Modified: manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java?rev=1630083&r1=1630082&r2=1630083&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-1067/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java Wed Oct  8 12:33:22 2014
@@ -1090,18 +1090,51 @@ public class GoogleDriveRepositoryConnec
               Logging.connectors.debug("GOOGLEDRIVE: its a file");
             }
 
-            // We always direct to the PDF except for Spreadsheets
-            String documentURI = null;
-            if (!googleFile.getMimeType().equals("application/vnd.google-apps.spreadsheet")) {
-              documentURI = getUrl(googleFile, "application/pdf");
-            } else {
-              documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-            }
-
             // Get the file length
-            Long fileLength = Objects.firstNonNull(googleFile.getFileSize(), 0L);
-            if (fileLength != null) {
+            Long fileLengthLong = Objects.firstNonNull(googleFile.getFileSize(), 0L);
+            if (fileLengthLong != null) {
 
+              // Now do standard stuff
+              long fileLength = fileLengthLong.longValue();
+              String mimeType = googleFile.getMimeType();
+              DateTime createdDateObject = googleFile.getCreatedDate();
+              DateTime modifiedDateObject = googleFile.getModifiedDate();
+              String extension = googleFile.getFileExtension();
+              String title = googleFile.getTitle();
+              Date createdDate = (createdDateObject==null)?null:new Date(createdDateObject.getValue());
+              Date modifiedDate = (modifiedDateObject==null)?null:new Date(modifiedDateObject.getValue());
+              // We always direct to the PDF except for Spreadsheets
+              String documentURI = null;
+              if (!mimeType.equals("application/vnd.google-apps.spreadsheet")) {
+                documentURI = getUrl(googleFile, "application/pdf");
+              } else {
+                documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+              }
+
+              if (!activities.checkLengthIndexable(fileLength))
+              {
+                activities.noDocument(nodeId,version);
+                continue;
+              }
+              
+              if (!activities.checkURLIndexable(documentURI))
+              {
+                activities.noDocument(nodeId,version);
+                continue;
+              }
+              
+              if (!activities.checkMimeTypeIndexable(mimeType))
+              {
+                activities.noDocument(nodeId,version);
+                continue;
+              }
+              
+              if (!activities.checkDateIndexable(modifiedDate))
+              {
+                activities.noDocument(nodeId,version);
+                continue;
+              }
+              
               RepositoryDocument rd = new RepositoryDocument();
 
               if (acls != null) {
@@ -1112,19 +1145,12 @@ public class GoogleDriveRepositoryConnec
                 }
               }
               
-              // Now do standard stuff
-              String mimeType = googleFile.getMimeType();
-              DateTime createdDate = googleFile.getCreatedDate();
-              DateTime modifiedDate = googleFile.getModifiedDate();
-              String extension = googleFile.getFileExtension();
-              String title = googleFile.getTitle();
-              
               if (mimeType != null)
                 rd.setMimeType(mimeType);
               if (createdDate != null)
-                rd.setCreatedDate(new Date(createdDate.getValue()));
+                rd.setCreatedDate(createdDate);
               if (modifiedDate != null)
-                rd.setModifiedDate(new Date(modifiedDate.getValue()));
+                rd.setModifiedDate(modifiedDate);
               if (extension != null)
               {
                 if (title == null)