You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/05/29 12:03:38 UTC

svn commit: r1682410 - in /manifoldcf/trunk: ./ connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/ connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/ connectors/solr/connector/sr...

Author: kwright
Date: Fri May 29 10:03:38 2015
New Revision: 1682410

URL: http://svn.apache.org/r1682410
Log:
Fix for CONNECTORS-1204.

Modified:
    manifoldcf/trunk/   (props changed)
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
    manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
    manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java
    manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
    manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties
    manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_ja_JP.properties
    manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_zh_CN.properties
    manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/RepositoryDocument.java

Propchange: manifoldcf/trunk/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Fri May 29 10:03:38 2015
@@ -53,6 +53,7 @@
 /manifoldcf/branches/CONNECTORS-1177:1670213-1670613
 /manifoldcf/branches/CONNECTORS-120:1406712-1407974,1407982-1411043,1411049-1416451
 /manifoldcf/branches/CONNECTORS-120-1:1416450-1417056
+/manifoldcf/branches/CONNECTORS-1204:1682204-1682409
 /manifoldcf/branches/CONNECTORS-13:1525862-1527182,1539324-1541634
 /manifoldcf/branches/CONNECTORS-470:1349741-1360750,1360808
 /manifoldcf/branches/CONNECTORS-474:1349741-1353803

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1682410&r1=1682409&r2=1682410&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri May 29 10:03:38 2015
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 2.2-dev =====================
 
+CONNECTORS-1204: Add ability to post original binary length of document
+to Solr from JCIFS connector.  Also improve efficiency of JCIFS connector.
+(Karl Wright)
+
 CONNECTORS-1193: Add ability to discard web content based on a 
 set of regular expressions.
 (Arcadius Ahouansou)

Modified: manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1682410&r1=1682409&r2=1682410&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java Fri May 29 10:03:38 2015
@@ -52,6 +52,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InterruptedIOException;
 import java.net.MalformedURLException;
+import java.net.UnknownHostException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Date;
@@ -472,15 +473,11 @@ public class SharedDriveConnector extend
         }
       }
     }
-    catch (java.net.SocketTimeoutException e)
+    catch (MalformedURLException e)
     {
-      throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
+      throw new ManifoldCFException("Could not get a canonical path: "+e.getMessage(),e);
     }
-    catch (InterruptedIOException e)
-    {
-      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-    }
-    catch (IOException e)
+    catch (UnknownHostException e)
     {
       throw new ManifoldCFException("Could not get a canonical path: "+e.getMessage(),e);
     }
@@ -571,116 +568,142 @@ public class SharedDriveConnector extend
       String[] documentDeny = null;
       boolean documentSecurityOn = false;
       
+      // Common info we really need to fetch only once
+      long fileLength = 0L;
+      long lastModified = 0L;
+      boolean fileExists = false;
+      boolean fileIsDirectory = false;
+      
       try
       {
         file = new SmbFile(documentIdentifier,pa);
+        fileExists = fileExists(file);
 
         // File has to exist AND have a non-null canonical path to be readable.  If the canonical path is
         // null, it means that the windows permissions are not right and directory/file is not readable!!!
         String newPath = getFileCanonicalPath(file);
         // We MUST check the specification here, otherwise a recrawl may not delete what it's supposed to!
-        if (fileExists(file) && newPath != null && checkInclude(file,newPath,spec,activities))
+        if (fileExists && newPath != null)
         {
-          if (fileIsDirectory(file))
+          fileIsDirectory = fileIsDirectory(file);
+          if (checkInclude(fileIsDirectory,newPath,spec))
           {
-            // Hmm, this is not correct; version string should be empty for windows directories, since
-            // they are not hierarchical in modified date propagation.
-            // It's a directory. The version ID will be the
-            // last modified date.
-            //long lastModified = fileLastModified(file);
-            //versionString = new Long(lastModified).toString();
-            versionString = "";
-
-          }
-          else
-          {
-            // It's a file of acceptable length.
-            // The ability to get ACLs, list files, and an inputstream under DFS all work now.
-            // The SmbFile for parentFolder acls.
-            SmbFile parentFolder = new SmbFile(file.getParent(),pa);
-
-            // Compute the security information
-            String[] modelArray = new String[0];
-            
-            List<String> allowList = new ArrayList<String>();
-            List<String> denyList = new ArrayList<String>();
-            shareSecurityOn = getFileShareSecuritySet(allowList, denyList, file, shareAcls);
-            shareAllow = allowList.toArray(modelArray);
-            shareDeny = denyList.toArray(modelArray);
-
-            allowList.clear();
-            denyList.clear();
-            parentSecurityOn = getFileSecuritySet(allowList, denyList, parentFolder, parentFolderAcls);
-            parentAllow = allowList.toArray(modelArray);
-            parentDeny = denyList.toArray(modelArray);
-
-            allowList.clear();
-            denyList.clear();
-            documentSecurityOn = getFileSecuritySet(allowList, denyList, file, acls);
-            documentAllow = allowList.toArray(modelArray);
-            documentDeny = denyList.toArray(modelArray);
-            
-            // The format of this string changed on 11/8/2006 to be comformant with the standard way
-            // acls and metadata descriptions are being stuffed into the version string across connectors.
-
-            // The format of this string changed again on 7/3/2009 to permit the ingestion uri/iri to be included.
-            // This was to support filename/uri mapping functionality.
-
-            StringBuilder sb = new StringBuilder();
-
-            addSecuritySet(sb,shareSecurityOn,shareAllow,shareDeny);
-            addSecuritySet(sb,parentSecurityOn,parentAllow,parentDeny);
-            addSecuritySet(sb,documentSecurityOn,documentAllow,documentDeny);
+            if (fileIsDirectory)
+            {
+              // Hmm, this is not correct; version string should be empty for windows directories, since
+              // they are not hierarchical in modified date propagation.
+              // It's a directory. The version ID will be the
+              // last modified date.
+              //long lastModified = fileLastModified(file);
+              //versionString = new Long(lastModified).toString();
+              versionString = "";
 
-            // Include the path attribute name and value in the parseable area.
-            if (pathAttributeName != null)
+            }
+            else
             {
-              sb.append('+');
-              pack(sb,pathAttributeName,'+');
-              // Calculate path string; we'll include that wholesale in the version
-              pathAttributeValue = documentIdentifier;
-              // 3/13/2008
-              // In looking at what comes into the path metadata attribute by default, and cogitating a bit, I've concluded that
-              // the smb:// and the server/domain name at the start of the path are just plain old noise, and should be stripped.
-              // This changes a behavior that has been around for a while, so there is a risk, but a quick back-and-forth with the
-              // SE's leads me to believe that this is safe.
+              fileLength = fileLength(file);
+              if (checkIncludeFile(fileLength,newPath,spec,activities))
+              {
+                // It's a file of acceptable length.
+                // The ability to get ACLs, list files, and an inputstream under DFS all work now.
+                // The SmbFile for parentFolder acls.
+                SmbFile parentFolder = new SmbFile(file.getParent(),pa);
+
+                // Compute the security information
+                String[] modelArray = new String[0];
+                
+                List<String> allowList = new ArrayList<String>();
+                List<String> denyList = new ArrayList<String>();
+                shareSecurityOn = getFileShareSecuritySet(allowList, denyList, file, shareAcls);
+                shareAllow = allowList.toArray(modelArray);
+                shareDeny = denyList.toArray(modelArray);
+
+                allowList.clear();
+                denyList.clear();
+                parentSecurityOn = getFileSecuritySet(allowList, denyList, parentFolder, parentFolderAcls);
+                parentAllow = allowList.toArray(modelArray);
+                parentDeny = denyList.toArray(modelArray);
+
+                allowList.clear();
+                denyList.clear();
+                documentSecurityOn = getFileSecuritySet(allowList, denyList, file, acls);
+                documentAllow = allowList.toArray(modelArray);
+                documentDeny = denyList.toArray(modelArray);
+                
+                // This is stuff we need for computing the version string AND for indexing
+                lastModified = fileLastModified(file);
+                
+                // The format of this string changed on 11/8/2006 to be conformant with the standard way
+                // acls and metadata descriptions are being stuffed into the version string across connectors.
+
+                // The format of this string changed again on 7/3/2009 to permit the ingestion uri/iri to be included.
+                // This was to support filename/uri mapping functionality.
+
+                StringBuilder sb = new StringBuilder();
+
+                addSecuritySet(sb,shareSecurityOn,shareAllow,shareDeny);
+                addSecuritySet(sb,parentSecurityOn,parentAllow,parentDeny);
+                addSecuritySet(sb,documentSecurityOn,documentAllow,documentDeny);
+
+                // Include the path attribute name and value in the parseable area.
+                if (pathAttributeName != null)
+                {
+                  sb.append('+');
+                  pack(sb,pathAttributeName,'+');
+                  // Calculate path string; we'll include that wholesale in the version
+                  pathAttributeValue = documentIdentifier;
+                  // 3/13/2008
+                  // In looking at what comes into the path metadata attribute by default, and cogitating a bit, I've concluded that
+                  // the smb:// and the server/domain name at the start of the path are just plain old noise, and should be stripped.
+                  // This changes a behavior that has been around for a while, so there is a risk, but a quick back-and-forth with the
+                  // SE's leads me to believe that this is safe.
 
-              if (pathAttributeValue.startsWith("smb://"))
+                  if (pathAttributeValue.startsWith("smb://"))
+                  {
+                    int index = pathAttributeValue.indexOf("/","smb://".length());
+                    if (index == -1)
+                      index = pathAttributeValue.length();
+                    pathAttributeValue = pathAttributeValue.substring(index);
+                  }
+                  // Now, translate
+                  pathAttributeValue = matchMap.translate(pathAttributeValue);
+                  pack(sb,pathAttributeValue,'+');
+                }
+                else
+                  sb.append('-');
+
+                // Calculate the ingestion IRI/URI, and include that in the parseable area.
+                ingestionURI = convertToURI(documentIdentifier,fileMap,uriMap);
+                pack(sb,ingestionURI,'+');
+
+                // The stuff from here on down is non-parseable.
+                sb.append(new Long(lastModified).toString()).append(":")
+                  .append(new Long(fileLength).toString());
+                // Also include the specification-based answer for the question of whether fingerprinting is
+                // going to be done.  Although we may not consider this to truly be "version" information, the
+                // specification does affect whether anything is ingested or not, so it really is.  The alternative
+                // is to fingerprint right here, in the version part of the world, but that's got a performance
+                // downside, because it means that we'd have to suck over pretty much everything just to determine
+                // what we wanted to ingest.
+                boolean ifIndexable = wouldFileBeIncluded(newPath,spec,true);
+                boolean ifNotIndexable = wouldFileBeIncluded(newPath,spec,false);
+                if (ifIndexable == ifNotIndexable)
+                  sb.append("I");
+                else
+                  sb.append(ifIndexable?"Y":"N");
+                versionString = sb.toString();
+              }
+              else
               {
-                int index = pathAttributeValue.indexOf("/","smb://".length());
-                if (index == -1)
-                  index = pathAttributeValue.length();
-                pathAttributeValue = pathAttributeValue.substring(index);
+                activities.deleteDocument(documentIdentifier);
+                continue;
               }
-              // Now, translate
-              pathAttributeValue = matchMap.translate(pathAttributeValue);
-              pack(sb,pathAttributeValue,'+');
             }
-            else
-              sb.append('-');
-
-            // Calculate the ingestion IRI/URI, and include that in the parseable area.
-            ingestionURI = convertToURI(documentIdentifier,fileMap,uriMap);
-            pack(sb,ingestionURI,'+');
-
-            // The stuff from here on down is non-parseable.
-            // Get the file's modified date.
-            long lastModified = fileLastModified(file);
-            sb.append(new Long(lastModified).toString()).append(":")
-              .append(new Long(fileLength(file)).toString());
-            // Also include the specification-based answer for the question of whether fingerprinting is
-            // going to be done.  Although we may not consider this to truly be "version" information, the
-            // specification does affect whether anything is ingested or not, so it really is.  The alternative
-            // is to fingerprint right here, in the version part of the world, but that's got a performance
-            // downside, because it means that we'd have to suck over pretty much everything just to determine
-            // what we wanted to ingest.
-            boolean ifIndexable = wouldFileBeIncluded(newPath,spec,true);
-            boolean ifNotIndexable = wouldFileBeIncluded(newPath,spec,false);
-            if (ifIndexable == ifNotIndexable)
-              sb.append("I");
-            else
-              sb.append(ifIndexable?"Y":"N");
-            versionString = sb.toString();
+          }
+          else
+          {
+            activities.deleteDocument(documentIdentifier);
+            continue;
           }
         }
         else
@@ -742,9 +765,9 @@ public class SharedDriveConnector extend
           try
           {
 
-            if (fileExists(file))
+            if (fileExists)
             {
-              if (fileIsDirectory(file))
+              if (fileIsDirectory)
               {
                 if (Logging.connectors.isDebugEnabled())
                   Logging.connectors.debug("JCIFS: '"+documentIdentifier+"' is a directory");
@@ -781,8 +804,9 @@ public class SharedDriveConnector extend
                 {
                   String uri = ingestionURI;
                   String fileNameString = file.getName();
-                  Date lastModifiedDate = new Date(file.lastModified());
+                  Date lastModifiedDate = new Date(lastModified);
                   Date creationDate = new Date(file.createTime());
+                  Long originalLength = new Long(fileLength);
                   String contentType = mapExtensionToMimeType(fileNameString);
 
                   if (!activities.checkURLIndexable(uri))
@@ -821,6 +845,8 @@ public class SharedDriveConnector extend
                   String shareName = file.getShare();
 
                   rd.setFileName(fileNameString);
+                  rd.setOriginalSize(originalLength);
+                  
                   if (contentType != null)
                     rd.setMimeType(contentType);
                   rd.addField("lastModified", lastModifiedDate.toString());
@@ -882,7 +908,7 @@ public class SharedDriveConnector extend
 
                       if (checkIngest(tempFile, newPath, spec, activities))
                       {
-                        long fileLength = tempFile.length();
+                        // Not needed; fetched earlier: long fileLength = tempFile.length();
                         if (!activities.checkLengthIndexable(fileLength))
                         {
                           Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept length ("+fileLength+")");
@@ -902,7 +928,7 @@ public class SharedDriveConnector extend
                             
                           activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
                           errorCode = "OK";
-                          fileLengthLong = new Long(tempFile.length());
+                          fileLengthLong = new Long(fileLength);
                         }
                         finally
                         {
@@ -931,7 +957,7 @@ public class SharedDriveConnector extend
                     if (Logging.connectors.isDebugEnabled())
                       Logging.connectors.debug("JCIFS: Local file data not needed for '"+documentIdentifier+"'");
 
-                    long fileLength = fileLength(file);
+                    // Not needed; fetched earlier: long fileLength = fileLength(file);
                     if (!activities.checkLengthIndexable(fileLength))
                     {
                       Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept length ("+fileLength+")");
@@ -952,7 +978,7 @@ public class SharedDriveConnector extend
                         
                       activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
                       errorCode = "OK";
-                      fileLengthLong = new Long(fileLength(file));
+                      fileLengthLong = new Long(fileLength);
                     }
                     finally
                     {
@@ -1380,14 +1406,47 @@ public class SharedDriveConnector extend
 
   // Protected methods
 
+  /** Check if a file's stats are OK for inclusion.
+  */
+  protected static boolean checkIncludeFile(long fileLength, String fileName, Specification documentSpecification, IFingerprintActivity activities)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // If it's a file, make sure the maximum length is not exceeded
+    if (!activities.checkLengthIndexable(fileLength) ||
+      !activities.checkMimeTypeIndexable(mapExtensionToMimeType(fileName)))
+      return false;
+    long maxFileLength = Long.MAX_VALUE;
+    for (int i = 0; i < documentSpecification.getChildCount(); i++)
+    {
+      SpecificationNode sn = documentSpecification.getChild(i);
+      if (sn.getType().equals(NODE_MAXLENGTH))
+      {
+        try
+        {
+          String value = sn.getAttributeValue(ATTRIBUTE_VALUE);
+          if (value != null && value.length() > 0)
+            maxFileLength = new Long(value).longValue();
+        }
+        catch (NumberFormatException e)
+        {
+          throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
+        }
+      }
+    }
+    if (fileLength > maxFileLength)
+      return false;
+    return true;
+  }
+
+
   /** Check if a file or directory should be included, given a document specification.
-  *@param file is the file object.
+  *@param isDirectory is true if the file is a directory.
   *@param fileName is the canonical file name.
   *@param documentSpecification is the specification.
   *@return true if it should be included.
   */
-  protected boolean checkInclude(SmbFile file, String fileName, Specification documentSpecification, IFingerprintActivity activities)
-    throws ManifoldCFException, ServiceInterruption
+  protected boolean checkInclude(boolean isDirectory, String fileName, Specification documentSpecification)
+    throws ManifoldCFException
   {
     if (Logging.connectors.isDebugEnabled())
       Logging.connectors.debug("JCIFS: In checkInclude for '"+fileName+"'");
@@ -1400,7 +1459,6 @@ public class SharedDriveConnector extend
     {
       String pathPart;
       String filePart;
-      boolean isDirectory = fileIsDirectory(file);
       if (isDirectory)
       {
 
@@ -1423,36 +1481,7 @@ public class SharedDriveConnector extend
         }
       }
 
-      // If it's a file, make sure the maximum length is not exceeded
       int i;
-      if (!isDirectory)
-      {
-        long fileLength = fileLength(file);
-        if (!activities.checkLengthIndexable(fileLength) ||
-          !activities.checkMimeTypeIndexable(mapExtensionToMimeType(fileName)))
-          return false;
-        long maxFileLength = Long.MAX_VALUE;
-        i = 0;
-        while (i < documentSpecification.getChildCount())
-        {
-          SpecificationNode sn = documentSpecification.getChild(i++);
-          if (sn.getType().equals(NODE_MAXLENGTH))
-          {
-            try
-            {
-              String value = sn.getAttributeValue(ATTRIBUTE_VALUE);
-              if (value != null && value.length() > 0)
-                maxFileLength = new Long(value).longValue();
-            }
-            catch (NumberFormatException e)
-            {
-              throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
-            }
-          }
-        }
-        if (fileLength > maxFileLength)
-          return false;
-      }
 
       // Scan until we match a startpoint
       i = 0;
@@ -1564,25 +1593,11 @@ public class SharedDriveConnector extend
       }
       return false;
     }
-    catch (jcifs.smb.SmbAuthException e)
-    {
-      Logging.connectors.warn("JCIFS: Authorization exception checking inclusion for "+fileName+" - skipping");
-      return false;
-    }
-    catch (SmbException se)
-    {
-      processSMBException(se, fileName, "checking inclusion", "canonical path mapping");
-      return false;
-    }
-    catch (java.net.SocketTimeoutException e)
+    catch (MalformedURLException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
-    catch (InterruptedIOException e)
-    {
-      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-    }
-    catch (IOException e)
+    catch (UnknownHostException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
@@ -1712,15 +1727,11 @@ public class SharedDriveConnector extend
       }
       return false;
     }
-    catch (java.net.SocketTimeoutException e)
+    catch (MalformedURLException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
-    catch (InterruptedIOException e)
-    {
-      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-    }
-    catch (IOException e)
+    catch (UnknownHostException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
@@ -1860,25 +1871,11 @@ public class SharedDriveConnector extend
       }
       return false;
     }
-    catch (jcifs.smb.SmbAuthException e)
-    {
-      Logging.connectors.warn("JCIFS: Authorization exception checking ingestion for "+fileName+" - skipping");
-      return false;
-    }
-    catch (SmbException se)
-    {
-      processSMBException(se, fileName, "checking ingestion", "reading document");
-      return false;
-    }
-    catch (java.net.SocketTimeoutException e)
+    catch (MalformedURLException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
-    catch (InterruptedIOException e)
-    {
-      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-    }
-    catch (IOException e)
+    catch (UnknownHostException e)
     {
       throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
     }
@@ -2107,7 +2104,7 @@ public class SharedDriveConnector extend
   /** Map a "path" specification to a full identifier.
   */
   protected String mapToIdentifier(String path)
-    throws IOException
+    throws MalformedURLException, UnknownHostException
   {
     String smburi = smbconnectionPath;
     String uri = smburi + path + "/";
@@ -4823,11 +4820,30 @@ public class SharedDriveConnector extend
           // documents that we will immediately turn around and remove.  However, if this
           // check was not here, everything should still function, provided the getDocumentVersions()
           // method does the right thing.
-          if (checkInclude(f, newPath, spec, activities))
+          boolean fileIsDirectory = fileIsDirectory(f);
+          if (checkInclude(fileIsDirectory, newPath, spec))
           {
-            if (Logging.connectors.isDebugEnabled())
-              Logging.connectors.debug("JCIFS: Recorded path is '" + newPath + "' and is included.");
-            activities.addDocumentReference(newPath);
+            if (fileIsDirectory)
+            {
+              if (Logging.connectors.isDebugEnabled())
+                Logging.connectors.debug("JCIFS: Recorded path is '" + newPath + "' and is included.");
+              activities.addDocumentReference(newPath);
+            }
+            else
+            {
+              long fileLength = fileLength(f);
+              if (checkIncludeFile(fileLength, newPath, spec, activities))
+              {
+                if (Logging.connectors.isDebugEnabled())
+                  Logging.connectors.debug("JCIFS: Recorded path is '" + newPath + "' and is included.");
+                activities.addDocumentReference(newPath);
+              }
+              else
+              {
+                if (Logging.connectors.isDebugEnabled())
+                  Logging.connectors.debug("JCIFS: Recorded path '"+newPath+"' is excluded!");
+              }
+            }
           }
           else
           {

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1682410&r1=1682409&r2=1682410&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java Fri May 29 10:03:38 2015
@@ -100,6 +100,7 @@ public class HttpPoster
   private final String allowAttributeName;
   private final String denyAttributeName;
   private final String idAttributeName;
+  private final String originalSizeAttributeName;
   private final String modifiedDateAttributeName;
   private final String createdDateAttributeName;
   private final String indexedDateAttributeName;
@@ -131,7 +132,7 @@ public class HttpPoster
     int zkClientTimeout, int zkConnectTimeout,
     String updatePath, String removePath, String statusPath,
     String allowAttributeName, String denyAttributeName, String idAttributeName,
-    String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
+    String originalSizeAttributeName, String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
     String fileNameAttributeName, String mimeTypeAttributeName, String contentAttributeName,
     Long maxDocumentLength,
     String commitWithin, boolean useExtractUpdateHandler)
@@ -147,6 +148,7 @@ public class HttpPoster
     this.allowAttributeName = allowAttributeName;
     this.denyAttributeName = denyAttributeName;
     this.idAttributeName = idAttributeName;
+    this.originalSizeAttributeName = originalSizeAttributeName;
     this.modifiedDateAttributeName = modifiedDateAttributeName;
     this.createdDateAttributeName = createdDateAttributeName;
     this.indexedDateAttributeName = indexedDateAttributeName;
@@ -179,7 +181,7 @@ public class HttpPoster
     String updatePath, String removePath, String statusPath,
     String realm, String userID, String password,
     String allowAttributeName, String denyAttributeName, String idAttributeName,
-    String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
+    String originalSizeAttributeName, String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
     String fileNameAttributeName, String mimeTypeAttributeName, String contentAttributeName,
     IKeystoreManager keystoreManager, Long maxDocumentLength,
     String commitWithin, boolean useExtractUpdateHandler)
@@ -195,6 +197,7 @@ public class HttpPoster
     this.allowAttributeName = allowAttributeName;
     this.denyAttributeName = denyAttributeName;
     this.idAttributeName = idAttributeName;
+    this.originalSizeAttributeName = originalSizeAttributeName;
     this.modifiedDateAttributeName = modifiedDateAttributeName;
     this.createdDateAttributeName = createdDateAttributeName;
     this.indexedDateAttributeName = indexedDateAttributeName;
@@ -995,6 +998,14 @@ public class HttpPoster
       }
       
       // Write the rest of the attributes
+      if ( originalSizeAttributeName != null )
+      {
+        Long size = document.getOriginalSize();
+        if ( size != null )
+        {
+          outputDoc.addField( originalSizeAttributeName, size.toString() );
+        }
+      }
       if ( modifiedDateAttributeName != null )
       {
         Date date = document.getModifiedDate();
@@ -1067,6 +1078,13 @@ public class HttpPoster
       // Write the id field
       writeField(out,LITERAL+idAttributeName,documentURI);
       // Write the rest of the attributes
+      if (originalSizeAttributeName != null)
+      {
+        Long size = document.getOriginalSize();
+        if (size != null)
+          // Write the size under its own configured field, not the modified-date field
+          writeField(out,LITERAL+originalSizeAttributeName,size.toString());
+      }
       if (modifiedDateAttributeName != null)
       {
         Date date = document.getModifiedDate();

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java?rev=1682410&r1=1682409&r2=1682410&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java Fri May 29 10:03:38 2015
@@ -83,6 +83,8 @@ public class SolrConfig
   public static final String PARAM_STATUSPATH = "Server status handler";
   /** Id field */
   public static final String PARAM_IDFIELD = "Solr id field name";
+  /** Optional original size field */
+  public static final String PARAM_ORIGINALSIZEFIELD = "Solr original size field name";
   /** Optional modified date field */
   public static final String PARAM_MODIFIEDDATEFIELD = "Solr modified date field name";
   /** Optional created date field */

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java?rev=1682410&r1=1682409&r2=1682410&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java Fri May 29 10:03:38 2015
@@ -85,6 +85,7 @@ public class SolrConnector extends org.a
   
   // Attributes going into Solr
   protected String idAttributeName = null;
+  protected String originalSizeAttributeName = null;
   protected String modifiedDateAttributeName = null;
   protected String createdDateAttributeName = null;
   protected String indexedDateAttributeName = null;
@@ -181,6 +182,7 @@ public class SolrConnector extends org.a
     excludedMimeTypesString = null;
     excludedMimeTypes = null;
     idAttributeName = null;
+    originalSizeAttributeName = null;
     modifiedDateAttributeName = null;
     createdDateAttributeName = null;
     indexedDateAttributeName = null;
@@ -214,6 +216,10 @@ public class SolrConnector extends org.a
       if (idAttributeName == null || idAttributeName.length() == 0)
         idAttributeName = "id";
 
+      originalSizeAttributeName = params.getParameter(SolrConfig.PARAM_ORIGINALSIZEFIELD);
+      if (originalSizeAttributeName == null || originalSizeAttributeName.length() == 0)
+        originalSizeAttributeName = null;
+      
       modifiedDateAttributeName = params.getParameter(SolrConfig.PARAM_MODIFIEDDATEFIELD);
       if (modifiedDateAttributeName == null || modifiedDateAttributeName.length() == 0)
         modifiedDateAttributeName = null;
@@ -355,7 +361,7 @@ public class SolrConnector extends org.a
             connectTimeout,socketTimeout,
             updatePath,removePath,statusPath,realm,userID,password,
             allowAttributeName,denyAttributeName,idAttributeName,
-            modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
+            originalSizeAttributeName,modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
             fileNameAttributeName,mimeTypeAttributeName,contentAttributeName,
             keystoreManager,maxDocumentLength,commitWithin,useExtractUpdateHandler);
           
@@ -411,7 +417,7 @@ public class SolrConnector extends org.a
             zkClientTimeout,zkConnectTimeout,
             updatePath,removePath,statusPath,
             allowAttributeName,denyAttributeName,idAttributeName,
-            modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
+            originalSizeAttributeName,modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
             fileNameAttributeName,mimeTypeAttributeName,contentAttributeName,
             maxDocumentLength,commitWithin,useExtractUpdateHandler);
           
@@ -1009,6 +1015,10 @@ public class SolrConnector extends org.a
     String idField = parameters.getParameter(SolrConfig.PARAM_IDFIELD);
     if (idField == null)
       idField = "id";
+
+    String originalSizeField = parameters.getParameter(SolrConfig.PARAM_ORIGINALSIZEFIELD);
+    if (originalSizeField == null)
+      originalSizeField = "";
     
     String modifiedDateField = parameters.getParameter(SolrConfig.PARAM_MODIFIEDDATEFIELD);
     if (modifiedDateField == null)
@@ -1463,6 +1473,12 @@ public class SolrConnector extends org.a
 "    </td>\n"+
 "  </tr>\n"+
 "  <tr>\n"+
+"    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"SolrConnector.OriginalSizeFieldName") + "</nobr></td>\n"+
+"    <td class=\"value\">\n"+
+"      <input name=\"originalsizefield\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(originalSizeField)+"\"/>\n"+
+"    </td>\n"+
+"  </tr>\n"+
+"  <tr>\n"+
 "    <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"SolrConnector.ModifiedDateFieldName") + "</nobr></td>\n"+
 "    <td class=\"value\">\n"+
 "      <input name=\"modifieddatefield\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(modifiedDateField)+"\"/>\n"+
@@ -1526,6 +1542,7 @@ public class SolrConnector extends org.a
     {
       out.print(
 "<input type=\"hidden\" name=\"idfield\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(idField)+"\"/>\n"+
+"<input type=\"hidden\" name=\"originalsizefield\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(originalSizeField)+"\"/>\n"+
 "<input type=\"hidden\" name=\"modifieddatefield\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(modifiedDateField)+"\"/>\n"+
 "<input type=\"hidden\" name=\"createddatefield\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(createdDateField)+"\"/>\n"+
 "<input type=\"hidden\" name=\"indexeddatefield\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(indexedDateField)+"\"/>\n"+
@@ -1814,6 +1831,10 @@ public class SolrConnector extends org.a
     if (idField != null)
       parameters.setParameter(SolrConfig.PARAM_IDFIELD,idField);
 
+    String originalSizeField = variableContext.getParameter("originalsizefield");
+    if (originalSizeField != null)
+      parameters.setParameter(SolrConfig.PARAM_ORIGINALSIZEFIELD,originalSizeField);
+
     String modifiedDateField = variableContext.getParameter("modifieddatefield");
     if (modifiedDateField != null)
       parameters.setParameter(SolrConfig.PARAM_MODIFIEDDATEFIELD,modifiedDateField);
@@ -2254,6 +2275,14 @@ public class SolrConnector extends org.a
       }
       else
         sb.append('-');
+
+      if (originalSizeAttributeName != null)
+      {
+          sb.append('+');
+          pack(sb,originalSizeAttributeName,'+');
+      }
+      else
+        sb.append('-');
 
       if (modifiedDateAttributeName != null)
       {

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties?rev=1682410&r1=1682409&r2=1682410&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties Fri May 29 10:03:38 2015
@@ -53,6 +53,7 @@ SolrConnector.Add=Add
 SolrConnector.AddZookeeperHost=Add ZooKeeper host
 SolrConnector.Certificate=Certificate:
 SolrConnector.IDFieldName=ID field name:
+SolrConnector.OriginalSizeFieldName=Original size field name:
 SolrConnector.ModifiedDateFieldName=Modified date field name:
 SolrConnector.CreatedDateFieldName=Created date field name:
 SolrConnector.IndexedDateFieldName=Indexed date field name:

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_ja_JP.properties?rev=1682410&r1=1682409&r2=1682410&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_ja_JP.properties Fri May 29 10:03:38 2015
@@ -53,6 +53,7 @@ SolrConnector.Add=追加
 SolrConnector.AddZookeeperHost=ZooKeeperホストを追加
 SolrConnector.Certificate=証明証:
 SolrConnector.IDFieldName=IDフィールド名:
+SolrConnector.OriginalSizeFieldName=Original size field name:
 SolrConnector.ModifiedDateFieldName=更新日付フィールド名:
 SolrConnector.CreatedDateFieldName=作成日付フィールド名:
 SolrConnector.IndexedDateFieldName=Indexed date field name:

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_zh_CN.properties?rev=1682410&r1=1682409&r2=1682410&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_zh_CN.properties Fri May 29 10:03:38 2015
@@ -53,6 +53,7 @@ SolrConnector.Add=添加
 SolrConnector.AddZookeeperHost=添加ZooKeeper主机
 SolrConnector.Certificate=证书: 
 SolrConnector.IDFieldName=ID字段名: 
+SolrConnector.OriginalSizeFieldName=Original size field name:
 SolrConnector.ModifiedDateFieldName=更新日期字段名: 
 SolrConnector.CreatedDateFieldName=生成日期字段名: 
 SolrConnector.IndexedDateFieldName=索引化的日期字段名:

Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/RepositoryDocument.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/RepositoryDocument.java?rev=1682410&r1=1682409&r2=1682410&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/RepositoryDocument.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/RepositoryDocument.java Fri May 29 10:03:38 2015
@@ -56,6 +56,7 @@ public class RepositoryDocument
   protected Date createdDate = null;
   protected Date modifiedDate = null;
   protected Date indexingDate = null;
+  protected Long originalSize = null;
   
   /** Constructor.
   */
@@ -79,6 +80,7 @@ public class RepositoryDocument
     rval.createdDate = createdDate;
     rval.modifiedDate = modifiedDate;
     rval.indexingDate = indexingDate;
+    rval.originalSize = originalSize;
     for (String key : fields.keySet())
     {
       rval.fields.put(key,fields.get(key));
@@ -112,6 +114,23 @@ public class RepositoryDocument
     readerFields.clear();
   }
   
+  /** Set the document's original (repository) size.  Use null to indicate that the size is
+  * unknown.
+  *@param size is the size.
+  */
+  public void setOriginalSize(Long size)
+  {
+    originalSize = size;
+  }
+  
+  /** Get the document's original size.
+  *@return the original repository document size, or null if unknown.
+  */
+  public Long getOriginalSize()
+  {
+    return originalSize;
+  }
+  
   /** Set the document's created date.  Use null to indicate that the date is unknown.
   *@param date is the date.
   */