You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/09/08 19:14:58 UTC

svn commit: r1701844 - in /manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3: AmazonS3Connector.java DocumentProcess.java GenericDocumentProcess.java

Author: kwright
Date: Tue Sep  8 17:14:57 2015
New Revision: 1701844

URL: http://svn.apache.org/r1701844
Log:
Commit latest patch, along with some rearrangement.

Modified:
    manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/AmazonS3Connector.java
    manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/DocumentProcess.java
    manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/GenericDocumentProcess.java

Modified: manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/AmazonS3Connector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/AmazonS3Connector.java?rev=1701844&r1=1701843&r2=1701844&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/AmazonS3Connector.java (original)
+++ manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/AmazonS3Connector.java Tue Sep  8 17:14:57 2015
@@ -730,7 +730,7 @@ public class AmazonS3Connector extends B
     if (amazons3Client == null)
       throw new ManifoldCFException(
           "Amazon client can not connect at the moment");
-    documentProcess.doPocessDocument(documentIdentifiers, statuses, spec,
+    documentProcess.doProcessDocument(documentIdentifiers, statuses, spec,
         activities, jobMode, usesDefaultAuthority, amazons3Client);
 
   }

Modified: manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/DocumentProcess.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/DocumentProcess.java?rev=1701844&r1=1701843&r2=1701844&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/DocumentProcess.java (original)
+++ manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/DocumentProcess.java Tue Sep  8 17:14:57 2015
@@ -20,16 +20,17 @@ import org.apache.manifoldcf.core.interf
 import org.apache.manifoldcf.core.interfaces.Specification;
 import org.apache.manifoldcf.crawler.interfaces.IExistingVersions;
 import org.apache.manifoldcf.crawler.interfaces.IProcessActivity;
+import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
 
 import com.amazonaws.services.s3.AmazonS3;
 
 public interface DocumentProcess {
   
 
-  void doPocessDocument(String[] documentIdentifiers,
+  void doProcessDocument(String[] documentIdentifiers,
       IExistingVersions statuses, Specification spec,
       IProcessActivity activities, int jobMode,
       boolean usesDefaultAuthority, AmazonS3 amazons3Client)
-      throws ManifoldCFException;
+      throws ManifoldCFException, ServiceInterruption;
 
 }
\ No newline at end of file

Modified: manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/GenericDocumentProcess.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/GenericDocumentProcess.java?rev=1701844&r1=1701843&r2=1701844&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/GenericDocumentProcess.java (original)
+++ manifoldcf/branches/CONNECTORS-1233/connectors/amazons3/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/amazons3/GenericDocumentProcess.java Tue Sep  8 17:14:57 2015
@@ -51,6 +51,8 @@ import com.amazonaws.services.s3.model.S
 public class GenericDocumentProcess extends AmazonS3DocumentProcessUtility
     implements DocumentProcess {
 
+  private static final String TEXT_PLAIN = "text/plain";
+
   /**
    * Process documents with out any tika extractor
    * @param documentIdentifiers
@@ -63,11 +65,11 @@ public class GenericDocumentProcess exte
    * @throws ManifoldCFException
    */
   @Override
-  public void doPocessDocument(String[] documentIdentifiers,
+  public void doProcessDocument(String[] documentIdentifiers,
       IExistingVersions statuses, Specification spec,
       IProcessActivity activities, int jobMode,
       boolean usesDefaultAuthority, AmazonS3 amazons3Client)
-      throws ManifoldCFException {
+      throws ManifoldCFException, ServiceInterruption {
     if (amazons3Client == null)
       throw new ManifoldCFException(
           "Amazon client can not connect at the moment");
@@ -122,8 +124,7 @@ public class GenericDocumentProcess exte
         String[] users = getUsers(grants);
 
         aclsToUse = users;
-
-        //
+        
         sb.append(lastModified.toString());
         versionString = sb.toString();
 
@@ -147,129 +148,122 @@ public class GenericDocumentProcess exte
         String errorDesc = null;
         Long fileSize = null;
 
-        String mimeType = "text/plain";// default
+        String mimeType = TEXT_PLAIN;//default        
+        long fileLength = s3Obj.getObjectMetadata()
+          .getContentLength();
+        
+        if (!activities.checkLengthIndexable(fileLength)) {
+          errorCode = activities.EXCLUDED_LENGTH;
+          errorDesc = "Excluded because of document length ("
+              + fileLength + ")";
+          activities.noDocument(documentIdentifier,
+              versionString);
+          continue;
+        }
 
-        // tika works starts
-        InputStream in = null;
-        ByteArrayOutputStream bao = new ByteArrayOutputStream();
+        String documentURI = getDocumentURI(s3Artifact);
+        Logging.connectors.debug("document : " + documentURI);
 
-        String document = null;
+        if (!activities.checkURLIndexable(documentURI)) {
+          errorCode = activities.EXCLUDED_URL;
+          errorDesc = "Excluded because of URL ('"
+              + documentURI + "')";
+          activities.noDocument(documentIdentifier,
+              versionString);
+          continue;
+        }
+
+        if (!activities.checkMimeTypeIndexable(mimeType)) {
+          errorCode = activities.EXCLUDED_MIMETYPE;
+          errorDesc = "Excluded because of mime type ('"
+              + mimeType + "')";
+          activities.noDocument(documentIdentifier,
+              versionString);
+          continue;
+        }
+        
+        if (!activities.checkDateIndexable(lastModified)) {
+          errorCode = activities.EXCLUDED_DATE;
+          errorDesc = "Excluded because of date ("
+              + lastModified + ")";
+          activities.noDocument(documentIdentifier,
+              versionString);
+          continue;
+        }
 
+        InputStream in = null;
         try {
+
           in = s3Obj.getObjectContent();
-          IOUtils.copy(in, bao);
-          long fileLength = bao.size();
-          if(fileLength < 1)
-          {
-            Logging.connectors.warn("File length 0");
-            continue;
-          }
-
-          String documentURI = getDocumentURI(s3Artifact);
-          Logging.connectors.debug("document : " + documentURI);
-
-          try {
-            if (!activities.checkURLIndexable(documentURI)) {
-              errorCode = activities.EXCLUDED_URL;
-              errorDesc = "Excluded because of URL ('"
-                  + documentURI + "')";
-              activities.noDocument(documentIdentifier,
-                  versionString);
-              continue;
-            }
-
-            if (!activities.checkMimeTypeIndexable(mimeType)) {
-              errorCode = activities.EXCLUDED_MIMETYPE;
-              errorDesc = "Excluded because of mime type ('"
-                  + mimeType + "')";
-              activities.noDocument(documentIdentifier,
-                  versionString);
-              continue;
-            }
-            if (!activities.checkDateIndexable(lastModified)) {
-              errorCode = activities.EXCLUDED_DATE;
-              errorDesc = "Excluded because of date ("
-                  + lastModified + ")";
-              activities.noDocument(documentIdentifier,
-                  versionString);
-              continue;
-            }
-
-            // otherwise process
-            RepositoryDocument rd = new RepositoryDocument();
-            addRawMetadata(rd, objectMetadata);
-            // Turn into acls and add into
-            // description
-            String[] denyAclsToUse;
-            if (aclsToUse.length > 0)
-              denyAclsToUse = new String[] { AmazonS3Connector.GLOBAL_DENY_TOKEN };
-            else
-              denyAclsToUse = new String[0];
-            rd.setSecurity(
-                RepositoryDocument.SECURITY_TYPE_DOCUMENT,
-                aclsToUse, denyAclsToUse);
-
-            rd.setMimeType(mimeType);
-
-            if (lastModified != null)
-              rd.setModifiedDate(lastModified);
-
-            
-              
-
-            if (!activities.checkLengthIndexable(fileLength)) {
-              errorCode = activities.EXCLUDED_LENGTH;
-              errorDesc = "Excluded because of document length ("
-                  + fileLength + ")";
-              activities.noDocument(documentIdentifier,
-                  versionString);
-              continue;
-            }
-
-            InputStream is = null;
-            try {
-              is = new ByteArrayInputStream(bao.toByteArray());
-              rd.setBinary(is, fileLength);
-              activities.ingestDocumentWithException(
-                  documentIdentifier, versionString,
-                  documentURI, rd);
-
-              errorCode = "OK";
-              fileSize = new Long(fileLength);
-            }
-            finally {
-              if (is != null)
-                IOUtils.closeQuietly(is);
-            }
-
-          }
-          catch (ServiceInterruption e) {
-            Logging.connectors
-                .error("Error while checking if document is indexable",
-                    e);
-          }
+
+          // otherwise process
+          RepositoryDocument rd = new RepositoryDocument();
+          addRawMetadata(rd, objectMetadata);
+          // Turn into acls and add into
+          // description
+          String[] denyAclsToUse;
+          if (aclsToUse.length > 0)
+            denyAclsToUse = new String[] { AmazonS3Connector.GLOBAL_DENY_TOKEN };
+          else
+            denyAclsToUse = new String[0];
+          rd.setSecurity(
+              RepositoryDocument.SECURITY_TYPE_DOCUMENT,
+              aclsToUse, denyAclsToUse);
+
+          rd.setMimeType(mimeType);
+
+          if (lastModified != null)
+            rd.setModifiedDate(lastModified);
+
+
+          //assign the stream
+          rd.setBinary(in, fileLength);
+          activities.ingestDocumentWithException(
+              documentIdentifier, versionString,
+              documentURI, rd);
+
+          errorCode = "OK";
+          fileSize = new Long(fileLength);
         }
         catch (IOException e1) {
-          Logging.connectors.error("Error while copying stream", e1);
+          handleIOException(e1);
         }
         finally {
-          //close output stream
-          IOUtils.closeQuietly(bao);
 
-          //close input stream
+          // close input stream
           if (in != null)
             IOUtils.closeQuietly(in);
         }
 
       }
       catch (AmazonServiceException e) {
-        Logging.connectors.error(e);
+        handleServiceException(e);
       }
       catch (AmazonClientException e) {
-        Logging.connectors.error(e);
+        handleClientException(e);
       }
     }
 
   }
 
+  protected static void handleIOException(final IOException e1)
+    throws ManifoldCFException, ServiceInterruption {
+    Logging.connectors.error("Error while copying stream", e1);
+    // Gotta handle this?? MHL
+    throw new ManifoldCFException("Error copying stream: "+e1.getMessage(),e1);
+  }
+
+  protected static void handleServiceException(final AmazonServiceException e1)
+    throws ManifoldCFException, ServiceInterruption {
+    // MHL to figure out what to throw?
+    Logging.connectors.error(e1);
+    throw new ManifoldCFException("Amazon service exception: "+e1.getMessage(),e1);
+  }
+  
+  protected static void handleClientException(final AmazonClientException e1)
+    throws ManifoldCFException, ServiceInterruption {
+    // MHL to figure out what to throw?
+    Logging.connectors.error(e1);
+    throw new ManifoldCFException("Amazon client exception: "+e1.getMessage(),e1);
+  }
 }