You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/05/28 14:15:36 UTC

svn commit: r1682222 - /manifoldcf/branches/CONNECTORS-1204/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java

Author: kwright
Date: Thu May 28 12:15:36 2015
New Revision: 1682222

URL: http://svn.apache.org/r1682222
Log:
Add support for original binary size to repository connector.  Also clean up flow and exceptions to reduce work.

Modified:
    manifoldcf/branches/CONNECTORS-1204/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java

Modified: manifoldcf/branches/CONNECTORS-1204/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1204/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1682222&r1=1682221&r2=1682222&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1204/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-1204/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java Thu May 28 12:15:36 2015
@@ -52,6 +52,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InterruptedIOException;
 import java.net.MalformedURLException;
+import java.net.UnknownHostException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Date;
@@ -472,15 +473,11 @@ public class SharedDriveConnector extend
         }
       }
     }
-    catch (java.net.SocketTimeoutException e)
+    catch (MalformedURLException e)
     {
-      throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
+      throw new ManifoldCFException("Could not get a canonical path: "+e.getMessage(),e);
     }
-    catch (InterruptedIOException e)
-    {
-      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-    }
-    catch (IOException e)
+    catch (UnknownHostException e)
     {
       throw new ManifoldCFException("Could not get a canonical path: "+e.getMessage(),e);
     }
@@ -571,116 +568,142 @@ public class SharedDriveConnector extend
       String[] documentDeny = null;
       boolean documentSecurityOn = false;
       
+      // Common info we really need to fetch only once
+      long fileLength = 0L;
+      long lastModified = 0L;
+      boolean fileExists = false;
+      boolean fileIsDirectory = false;
+      
       try
       {
         file = new SmbFile(documentIdentifier,pa);
+        fileExists = fileExists(file);
 
         // File has to exist AND have a non-null canonical path to be readable.  If the canonical path is
         // null, it means that the windows permissions are not right and directory/file is not readable!!!
         String newPath = getFileCanonicalPath(file);
         // We MUST check the specification here, otherwise a recrawl may not delete what it's supposed to!
-        if (fileExists(file) && newPath != null && checkInclude(file,newPath,spec,activities))
+        if (fileExists && newPath != null)
         {
-          if (fileIsDirectory(file))
+          fileIsDirectory = fileIsDirectory(file);
+          if (checkInclude(fileIsDirectory,newPath,spec))
           {
-            // Hmm, this is not correct; version string should be empty for windows directories, since
-            // they are not hierarchical in modified date propagation.
-            // It's a directory. The version ID will be the
-            // last modified date.
-            //long lastModified = fileLastModified(file);
-            //versionString = new Long(lastModified).toString();
-            versionString = "";
-
-          }
-          else
-          {
-            // It's a file of acceptable length.
-            // The ability to get ACLs, list files, and an inputstream under DFS all work now.
-            // The SmbFile for parentFolder acls.
-            SmbFile parentFolder = new SmbFile(file.getParent(),pa);
-
-            // Compute the security information
-            String[] modelArray = new String[0];
-            
-            List<String> allowList = new ArrayList<String>();
-            List<String> denyList = new ArrayList<String>();
-            shareSecurityOn = getFileShareSecuritySet(allowList, denyList, file, shareAcls);
-            shareAllow = allowList.toArray(modelArray);
-            shareDeny = denyList.toArray(modelArray);
-
-            allowList.clear();
-            denyList.clear();
-            parentSecurityOn = getFileSecuritySet(allowList, denyList, parentFolder, parentFolderAcls);
-            parentAllow = allowList.toArray(modelArray);
-            parentDeny = denyList.toArray(modelArray);
-
-            allowList.clear();
-            denyList.clear();
-            documentSecurityOn = getFileSecuritySet(allowList, denyList, file, acls);
-            documentAllow = allowList.toArray(modelArray);
-            documentDeny = denyList.toArray(modelArray);
-            
-            // The format of this string changed on 11/8/2006 to be comformant with the standard way
-            // acls and metadata descriptions are being stuffed into the version string across connectors.
-
-            // The format of this string changed again on 7/3/2009 to permit the ingestion uri/iri to be included.
-            // This was to support filename/uri mapping functionality.
-
-            StringBuilder sb = new StringBuilder();
-
-            addSecuritySet(sb,shareSecurityOn,shareAllow,shareDeny);
-            addSecuritySet(sb,parentSecurityOn,parentAllow,parentDeny);
-            addSecuritySet(sb,documentSecurityOn,documentAllow,documentDeny);
+            if (fileIsDirectory)
+            {
+              // Hmm, this is not correct; version string should be empty for windows directories, since
+              // they are not hierarchical in modified date propagation.
+              // It's a directory. The version ID will be the
+              // last modified date.
+              //long lastModified = fileLastModified(file);
+              //versionString = new Long(lastModified).toString();
+              versionString = "";
 
-            // Include the path attribute name and value in the parseable area.
-            if (pathAttributeName != null)
+            }
+            else
             {
-              sb.append('+');
-              pack(sb,pathAttributeName,'+');
-              // Calculate path string; we'll include that wholesale in the version
-              pathAttributeValue = documentIdentifier;
-              // 3/13/2008
-              // In looking at what comes into the path metadata attribute by default, and cogitating a bit, I've concluded that
-              // the smb:// and the server/domain name at the start of the path are just plain old noise, and should be stripped.
-              // This changes a behavior that has been around for a while, so there is a risk, but a quick back-and-forth with the
-              // SE's leads me to believe that this is safe.
+              fileLength = fileLength(file);
+              if (checkIncludeFile(fileLength,newPath,spec,activities))
+              {
+                // It's a file of acceptable length.
+                // The ability to get ACLs, list files, and an inputstream under DFS all work now.
+                // The SmbFile for parentFolder acls.
+                SmbFile parentFolder = new SmbFile(file.getParent(),pa);
+
+                // Compute the security information
+                String[] modelArray = new String[0];
+                
+                List<String> allowList = new ArrayList<String>();
+                List<String> denyList = new ArrayList<String>();
+                shareSecurityOn = getFileShareSecuritySet(allowList, denyList, file, shareAcls);
+                shareAllow = allowList.toArray(modelArray);
+                shareDeny = denyList.toArray(modelArray);
+
+                allowList.clear();
+                denyList.clear();
+                parentSecurityOn = getFileSecuritySet(allowList, denyList, parentFolder, parentFolderAcls);
+                parentAllow = allowList.toArray(modelArray);
+                parentDeny = denyList.toArray(modelArray);
+
+                allowList.clear();
+                denyList.clear();
+                documentSecurityOn = getFileSecuritySet(allowList, denyList, file, acls);
+                documentAllow = allowList.toArray(modelArray);
+                documentDeny = denyList.toArray(modelArray);
+                
+                // This is stuff we need for computing the version string AND for indexing
+                lastModified = fileLastModified(file);
+                
+                // The format of this string changed on 11/8/2006 to be conformant with the standard way
+                // acls and metadata descriptions are being stuffed into the version string across connectors.
+
+                // The format of this string changed again on 7/3/2009 to permit the ingestion uri/iri to be included.
+                // This was to support filename/uri mapping functionality.
+
+                StringBuilder sb = new StringBuilder();
+
+                addSecuritySet(sb,shareSecurityOn,shareAllow,shareDeny);
+                addSecuritySet(sb,parentSecurityOn,parentAllow,parentDeny);
+                addSecuritySet(sb,documentSecurityOn,documentAllow,documentDeny);
+
+                // Include the path attribute name and value in the parseable area.
+                if (pathAttributeName != null)
+                {
+                  sb.append('+');
+                  pack(sb,pathAttributeName,'+');
+                  // Calculate path string; we'll include that wholesale in the version
+                  pathAttributeValue = documentIdentifier;
+                  // 3/13/2008
+                  // In looking at what comes into the path metadata attribute by default, and cogitating a bit, I've concluded that
+                  // the smb:// and the server/domain name at the start of the path are just plain old noise, and should be stripped.
+                  // This changes a behavior that has been around for a while, so there is a risk, but a quick back-and-forth with the
+                  // SE's leads me to believe that this is safe.
 
-              if (pathAttributeValue.startsWith("smb://"))
+                  if (pathAttributeValue.startsWith("smb://"))
+                  {
+                    int index = pathAttributeValue.indexOf("/","smb://".length());
+                    if (index == -1)
+                      index = pathAttributeValue.length();
+                    pathAttributeValue = pathAttributeValue.substring(index);
+                  }
+                  // Now, translate
+                  pathAttributeValue = matchMap.translate(pathAttributeValue);
+                  pack(sb,pathAttributeValue,'+');
+                }
+                else
+                  sb.append('-');
+
+                // Calculate the ingestion IRI/URI, and include that in the parseable area.
+                ingestionURI = convertToURI(documentIdentifier,fileMap,uriMap);
+                pack(sb,ingestionURI,'+');
+
+                // The stuff from here on down is non-parseable.
+                sb.append(new Long(lastModified).toString()).append(":")
+                  .append(new Long(fileLength).toString());
+                // Also include the specification-based answer for the question of whether fingerprinting is
+                // going to be done.  Although we may not consider this to truly be "version" information, the
+                // specification does affect whether anything is ingested or not, so it really is.  The alternative
+                // is to fingerprint right here, in the version part of the world, but that's got a performance
+                // downside, because it means that we'd have to suck over pretty much everything just to determine
+                // what we wanted to ingest.
+                boolean ifIndexable = wouldFileBeIncluded(newPath,spec,true);
+                boolean ifNotIndexable = wouldFileBeIncluded(newPath,spec,false);
+                if (ifIndexable == ifNotIndexable)
+                  sb.append("I");
+                else
+                  sb.append(ifIndexable?"Y":"N");
+                versionString = sb.toString();
+              }
+              else
               {
-                int index = pathAttributeValue.indexOf("/","smb://".length());
-                if (index == -1)
-                  index = pathAttributeValue.length();
-                pathAttributeValue = pathAttributeValue.substring(index);
+                activities.deleteDocument(documentIdentifier);
+                continue;
               }
-              // Now, translate
-              pathAttributeValue = matchMap.translate(pathAttributeValue);
-              pack(sb,pathAttributeValue,'+');
             }
-            else
-              sb.append('-');
-
-            // Calculate the ingestion IRI/URI, and include that in the parseable area.
-            ingestionURI = convertToURI(documentIdentifier,fileMap,uriMap);
-            pack(sb,ingestionURI,'+');
-
-            // The stuff from here on down is non-parseable.
-            // Get the file's modified date.
-            long lastModified = fileLastModified(file);
-            sb.append(new Long(lastModified).toString()).append(":")
-              .append(new Long(fileLength(file)).toString());
-            // Also include the specification-based answer for the question of whether fingerprinting is
-            // going to be done.  Although we may not consider this to truly be "version" information, the
-            // specification does affect whether anything is ingested or not, so it really is.  The alternative
-            // is to fingerprint right here, in the version part of the world, but that's got a performance
-            // downside, because it means that we'd have to suck over pretty much everything just to determine
-            // what we wanted to ingest.
-            boolean ifIndexable = wouldFileBeIncluded(newPath,spec,true);
-            boolean ifNotIndexable = wouldFileBeIncluded(newPath,spec,false);
-            if (ifIndexable == ifNotIndexable)
-              sb.append("I");
-            else
-              sb.append(ifIndexable?"Y":"N");
-            versionString = sb.toString();
+          }
+          else
+          {
+            activities.deleteDocument(documentIdentifier);
+            continue;
           }
         }
         else
@@ -742,9 +765,9 @@ public class SharedDriveConnector extend
           try
           {
 
-            if (fileExists(file))
+            if (fileExists)
             {
-              if (fileIsDirectory(file))
+              if (fileIsDirectory)
               {
                 if (Logging.connectors.isDebugEnabled())
                   Logging.connectors.debug("JCIFS: '"+documentIdentifier+"' is a directory");
@@ -781,8 +804,9 @@ public class SharedDriveConnector extend
                 {
                   String uri = ingestionURI;
                   String fileNameString = file.getName();
-                  Date lastModifiedDate = new Date(file.lastModified());
+                  Date lastModifiedDate = new Date(lastModified);
                   Date creationDate = new Date(file.createTime());
+                  Long originalLength = new Long(fileLength);
                   String contentType = mapExtensionToMimeType(fileNameString);
 
                   if (!activities.checkURLIndexable(uri))
@@ -821,6 +845,8 @@ public class SharedDriveConnector extend
                   String shareName = file.getShare();
 
                   rd.setFileName(fileNameString);
+                  rd.setOriginalSize(originalLength);
+                  
                   if (contentType != null)
                     rd.setMimeType(contentType);
                   rd.addField("lastModified", lastModifiedDate.toString());
@@ -882,7 +908,7 @@ public class SharedDriveConnector extend
 
                       if (checkIngest(tempFile, newPath, spec, activities))
                       {
-                        long fileLength = tempFile.length();
+                        // Not needed; fetched earlier: long fileLength = tempFile.length();
                         if (!activities.checkLengthIndexable(fileLength))
                         {
                           Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept length ("+fileLength+")");
@@ -902,7 +928,7 @@ public class SharedDriveConnector extend
                             
                           activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
                           errorCode = "OK";
-                          fileLengthLong = new Long(tempFile.length());
+                          fileLengthLong = new Long(fileLength);
                         }
                         finally
                         {
@@ -931,7 +957,7 @@ public class SharedDriveConnector extend
                     if (Logging.connectors.isDebugEnabled())
                       Logging.connectors.debug("JCIFS: Local file data not needed for '"+documentIdentifier+"'");
 
-                    long fileLength = fileLength(file);
+                    // Not needed; fetched earlier: long fileLength = fileLength(file);
                     if (!activities.checkLengthIndexable(fileLength))
                     {
                       Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept length ("+fileLength+")");
@@ -952,7 +978,7 @@ public class SharedDriveConnector extend
                         
                       activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
                       errorCode = "OK";
-                      fileLengthLong = new Long(fileLength(file));
+                      fileLengthLong = new Long(fileLength);
                     }
                     finally
                     {
@@ -1380,14 +1406,47 @@ public class SharedDriveConnector extend
 
   // Protected methods
 
+  /** Check if a file's stats are OK for inclusion: the length and mime type must pass the activities'
+  * indexability checks, and the length must not exceed any NODE_MAXLENGTH value found in the specification.
+  *@return true if the file should be included.
+  */
+  protected static boolean checkIncludeFile(long fileLength, String fileName, Specification documentSpecification, IFingerprintActivity activities)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // Reject early if the length, or the mime type mapped from the file name, is not indexable
+    if (!activities.checkLengthIndexable(fileLength) ||
+      !activities.checkMimeTypeIndexable(mapExtensionToMimeType(fileName)))
+      return false;
+    long maxFileLength = Long.MAX_VALUE;
+    for (int i = 0; i < documentSpecification.getChildCount(); i++)
+    {
+      SpecificationNode sn = documentSpecification.getChild(i);  // the for header advances i; a second i++ here would skip every other node
+      if (sn.getType().equals(NODE_MAXLENGTH))
+      {
+        try
+        {
+          String value = sn.getAttributeValue(ATTRIBUTE_VALUE);
+          if (value != null && value.length() > 0)
+            maxFileLength = Long.parseLong(value);
+        }
+        catch (NumberFormatException e)
+        {
+          throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
+        }
+      }
+    }
+    return fileLength <= maxFileLength;
+  }
+
+
   /** Check if a file or directory should be included, given a document specification.
-  *@param file is the file object.
+  *@param isDirectory is true if the file is a directory.
   *@param fileName is the canonical file name.
   *@param documentSpecification is the specification.
   *@return true if it should be included.
   */
-  protected boolean checkInclude(SmbFile file, String fileName, Specification documentSpecification, IFingerprintActivity activities)
-    throws ManifoldCFException, ServiceInterruption
+  protected boolean checkInclude(boolean isDirectory, String fileName, Specification documentSpecification)
+    throws ManifoldCFException
   {
     if (Logging.connectors.isDebugEnabled())
       Logging.connectors.debug("JCIFS: In checkInclude for '"+fileName+"'");
@@ -1400,7 +1459,6 @@ public class SharedDriveConnector extend
     {
       String pathPart;
       String filePart;
-      boolean isDirectory = fileIsDirectory(file);
       if (isDirectory)
       {
 
@@ -1423,36 +1481,7 @@ public class SharedDriveConnector extend
         }
       }
 
-      // If it's a file, make sure the maximum length is not exceeded
       int i;
-      if (!isDirectory)
-      {
-        long fileLength = fileLength(file);
-        if (!activities.checkLengthIndexable(fileLength) ||
-          !activities.checkMimeTypeIndexable(mapExtensionToMimeType(fileName)))
-          return false;
-        long maxFileLength = Long.MAX_VALUE;
-        i = 0;
-        while (i < documentSpecification.getChildCount())
-        {
-          SpecificationNode sn = documentSpecification.getChild(i++);
-          if (sn.getType().equals(NODE_MAXLENGTH))
-          {
-            try
-            {
-              String value = sn.getAttributeValue(ATTRIBUTE_VALUE);
-              if (value != null && value.length() > 0)
-                maxFileLength = new Long(value).longValue();
-            }
-            catch (NumberFormatException e)
-            {
-              throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
-            }
-          }
-        }
-        if (fileLength > maxFileLength)
-          return false;
-      }
 
       // Scan until we match a startpoint
       i = 0;
@@ -1564,25 +1593,11 @@ public class SharedDriveConnector extend
       }
       return false;
     }
-    catch (jcifs.smb.SmbAuthException e)
-    {
-      Logging.connectors.warn("JCIFS: Authorization exception checking inclusion for "+fileName+" - skipping");
-      return false;
-    }
-    catch (SmbException se)
-    {
-      processSMBException(se, fileName, "checking inclusion", "canonical path mapping");
-      return false;
-    }
-    catch (java.net.SocketTimeoutException e)
+    catch (MalformedURLException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
-    catch (InterruptedIOException e)
-    {
-      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-    }
-    catch (IOException e)
+    catch (UnknownHostException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
@@ -1712,15 +1727,11 @@ public class SharedDriveConnector extend
       }
       return false;
     }
-    catch (java.net.SocketTimeoutException e)
+    catch (MalformedURLException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
-    catch (InterruptedIOException e)
-    {
-      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-    }
-    catch (IOException e)
+    catch (UnknownHostException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
@@ -1860,25 +1871,11 @@ public class SharedDriveConnector extend
       }
       return false;
     }
-    catch (jcifs.smb.SmbAuthException e)
-    {
-      Logging.connectors.warn("JCIFS: Authorization exception checking ingestion for "+fileName+" - skipping");
-      return false;
-    }
-    catch (SmbException se)
-    {
-      processSMBException(se, fileName, "checking ingestion", "reading document");
-      return false;
-    }
-    catch (java.net.SocketTimeoutException e)
+    catch (MalformedURLException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
-    catch (InterruptedIOException e)
-    {
-      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-    }
-    catch (IOException e)
+    catch (UnknownHostException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
@@ -2107,7 +2104,7 @@ public class SharedDriveConnector extend
   /** Map a "path" specification to a full identifier.
   */
   protected String mapToIdentifier(String path)
-    throws IOException
+    throws MalformedURLException, UnknownHostException
   {
     String smburi = smbconnectionPath;
     String uri = smburi + path + "/";
@@ -4823,11 +4820,30 @@ public class SharedDriveConnector extend
           // documents that we will immediately turn around and remove.  However, if this
           // check was not here, everything should still function, provided the getDocumentVersions()
           // method does the right thing.
-          if (checkInclude(f, newPath, spec, activities))
+          boolean fileIsDirectory = fileIsDirectory(f);
+          if (checkInclude(fileIsDirectory, newPath, spec))
           {
-            if (Logging.connectors.isDebugEnabled())
-              Logging.connectors.debug("JCIFS: Recorded path is '" + newPath + "' and is included.");
-            activities.addDocumentReference(newPath);
+            if (fileIsDirectory)
+            {
+              if (Logging.connectors.isDebugEnabled())
+                Logging.connectors.debug("JCIFS: Recorded path is '" + newPath + "' and is included.");
+              activities.addDocumentReference(newPath);
+            }
+            else
+            {
+              long fileLength = fileLength(f);
+              if (checkIncludeFile(fileLength, newPath, spec, activities))
+              {
+                if (Logging.connectors.isDebugEnabled())
+                  Logging.connectors.debug("JCIFS: Recorded path is '" + newPath + "' and is included.");
+                activities.addDocumentReference(newPath);
+              }
+              else
+              {
+                if (Logging.connectors.isDebugEnabled())
+                  Logging.connectors.debug("JCIFS: Recorded path '"+newPath+"' is excluded!");
+              }
+            }
           }
           else
           {