You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/04/26 19:54:12 UTC

svn commit: r1476320 - in /manifoldcf/trunk: ./ connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/ connectors...

Author: kwright
Date: Fri Apr 26 17:54:12 2013
New Revision: 1476320

URL: http://svn.apache.org/r1476320
Log:
Fix for CONNECTORS-681. Check the URL, not the temp file's filename.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java
    manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
    manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1476320&r1=1476319&r2=1476320&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Apr 26 17:54:12 2013
@@ -3,6 +3,13 @@ $Id$
 
 ======================= 1.2-dev =====================
 
+CONNECTORS-681: ElasticSearch and OpenSearchServer connectors
+both misused the File object passed to them in checkDocumentIndexable()
+in order to see if the extension was a supported one.  Instead they
+should have been checking the URL.  Added that code, and also changed
+the JCIFS connector to check indexability by means of the URL.
+(konrad, Karl Wright)
+
 CONNECTORS-679: Web connector hangs during throttling.  Reason
 appears to be that it is possible to interrupt the beginRead() method
 after it goes into "obtain estimate" mode.  Added code to make it clean

Modified: manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java?rev=1476320&r1=1476319&r2=1476320&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java (original)
+++ manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java Fri Apr 26 17:54:12 2013
@@ -342,9 +342,23 @@ public class ElasticSearchConnector exte
   public boolean checkDocumentIndexable(String outputDescription, File localFile)
       throws ManifoldCFException, ServiceInterruption
   {
+    // No filtering here; we don't look inside the file and don't know its extension.  That's done via the url
+    // filter
+    return true;
+  }
+  
+  /** Pre-determine whether a document's URL is indexable by this connector.  This method is used by participating repository connectors
+  * to help filter out documents that are not worth indexing.
+  *@param outputDescription is the document's output version.
+  *@param url is the URL of the document.
+  *@return true if the URL is indexable.
+  */
+  @Override
+  public boolean checkURLIndexable(String outputDescription, String url)
+    throws ManifoldCFException, ServiceInterruption
+  {
     ElasticSearchSpecs specs = getSpecsCache(outputDescription);
-    return specs
-        .checkExtension(FilenameUtils.getExtension(localFile.getName()));
+    return specs.checkExtension(FilenameUtils.getExtension(url));
   }
 
   @Override

Modified: manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1476320&r1=1476319&r2=1476320&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java Fri Apr 26 17:54:12 2013
@@ -594,8 +594,8 @@ public class SharedDriveConnector extend
             // is to fingerprint right here, in the version part of the world, but that's got a performance
             // downside, because it means that we'd have to suck over pretty much everything just to determine
             // what we wanted to ingest.
-            boolean ifIndexable = wouldFileBeIncluded(newPath,spec,true);
-            boolean ifNotIndexable = wouldFileBeIncluded(newPath,spec,false);
+            boolean ifIndexable = wouldFileBeIncluded(activities,newPath,ingestionURI,spec,true);
+            boolean ifNotIndexable = wouldFileBeIncluded(activities,newPath,ingestionURI,spec,false);
             if (ifIndexable == ifNotIndexable)
               sb.append("I");
             else
@@ -726,115 +726,127 @@ public class SharedDriveConnector extend
               String fileName = getFileCanonicalPath(file);
               if (fileName != null && !file.isHidden())
               {
-                // manipulate path to include the DFS alias, not the literal path
-                // String newPath = matchPrefix + fileName.substring(matchReplace.length());
-                String newPath = fileName;
-                if (checkNeedFileData(newPath, spec))
+                // Initialize repository document with common stuff, and find the URI
+                RepositoryDocument rd = new RepositoryDocument();
+                String uri = prepareForIndexing(rd,file,version);
+
+                if (activities.checkMimeTypeIndexable(mapExtensionToMimeType(fileName)) &&
+                  activities.checkURLIndexable(uri))
                 {
-                  if (Logging.connectors.isDebugEnabled())
-                    Logging.connectors.debug("JCIFS: Local file data needed for '"+documentIdentifier+"'");
 
-                  // Create a temporary file, and use that for the check and then the ingest
-                  File tempFile = File.createTempFile("_sdc_",null);
-                  try
+                  // manipulate path to include the DFS alias, not the literal path
+                  // String newPath = matchPrefix + fileName.substring(matchReplace.length());
+                  String newPath = fileName;
+                  if (checkNeedFileData(activities, newPath, uri, spec))
                   {
-                    FileOutputStream os = new FileOutputStream(tempFile);
+                    if (Logging.connectors.isDebugEnabled())
+                      Logging.connectors.debug("JCIFS: Local file data needed for '"+documentIdentifier+"'");
+
+                    // Create a temporary file, and use that for the check and then the ingest
+                    File tempFile = File.createTempFile("_sdc_",null);
                     try
                     {
-
-                      // Now, make a local copy so we can fingerprint
-                      InputStream inputStream = getFileInputStream(file);
+                      FileOutputStream os = new FileOutputStream(tempFile);
                       try
                       {
-                        // Copy!
-                        if (transferBuffer == null)
-                          transferBuffer = new byte[65536];
-                        while (true)
+
+                        // Now, make a local copy so we can fingerprint
+                        InputStream inputStream = getFileInputStream(file);
+                        try
+                        {
+                          // Copy!
+                          if (transferBuffer == null)
+                            transferBuffer = new byte[65536];
+                          while (true)
+                          {
+                            int amt = inputStream.read(transferBuffer,0,transferBuffer.length);
+                            if (amt == -1)
+                              break;
+                            os.write(transferBuffer,0,amt);
+                          }
+                        }
+                        finally
                         {
-                          int amt = inputStream.read(transferBuffer,0,transferBuffer.length);
-                          if (amt == -1)
-                            break;
-                          os.write(transferBuffer,0,amt);
+                          inputStream.close();
                         }
                       }
                       finally
                       {
-                        inputStream.close();
+                        os.close();
                       }
-                    }
-                    finally
-                    {
-                      os.close();
-                    }
 
-
-                    if (checkIngest(tempFile, newPath, spec, activities))
-                    {
-                      if (Logging.connectors.isDebugEnabled())
-                        Logging.connectors.debug("JCIFS: Decided to ingest '"+documentIdentifier+"'");
-                      // OK, do ingestion itself!
-                      InputStream inputStream = new FileInputStream(tempFile);
-                      try
+                      if (checkIngest(tempFile, newPath, uri, spec, activities))
                       {
-                        RepositoryDocument rd = new RepositoryDocument();
-                        rd.setBinary(inputStream, tempFile.length());
-                        
-                        indexDocument(activities,rd,file,documentIdentifier,version);
+                        if (Logging.connectors.isDebugEnabled())
+                          Logging.connectors.debug("JCIFS: Decided to ingest '"+documentIdentifier+"'");
+                        // OK, do ingestion itself!
+                        InputStream inputStream = new FileInputStream(tempFile);
+                        try
+                        {
+                          rd.setBinary(inputStream, tempFile.length());
+                          
+                          activities.ingestDocument(documentIdentifier, version, uri, rd);
+                        }
+                        finally
+                        {
+                          inputStream.close();
+                        }
+
+                        // I put this record here deliberately for two reasons:
+                        // (1) the other path includes ingestion time, and
+                        // (2) if anything fails up to and during ingestion, I want THAT failure record to be written, not this one.
+                        // So, really, ACTIVITY_ACCESS is a bit more than just fetch for JCIFS...
+                        activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
+                          new Long(tempFile.length()),documentIdentifier,"Success",null,null);
+
                       }
-                      finally
+                      else
                       {
-                        inputStream.close();
+                        // We must actively remove the document here, because the getDocumentVersions()
+                        // method has no way of signalling this, since it does not do the fingerprinting.
+                        if (Logging.connectors.isDebugEnabled())
+                          Logging.connectors.debug("JCIFS: Decided to remove '"+documentIdentifier+"'");
+                        activities.deleteDocument(documentIdentifier, version);
+                        // We should record the access here as well, since this is a non-exception way through the code path.
+                        // (I noticed that this was not being recorded in the history while fixing 25477.)
+                        activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
+                          new Long(tempFile.length()),documentIdentifier,"Success",null,null);
                       }
-
-                      // I put this record here deliberately for two reasons:
-                      // (1) the other path includes ingestion time, and
-                      // (2) if anything fails up to and during ingestion, I want THAT failure record to be written, not this one.
-                      // So, really, ACTIVITY_ACCESS is a bit more than just fetch for JCIFS...
-                      activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
-                        new Long(tempFile.length()),documentIdentifier,"Success",null,null);
-
                     }
-                    else
+                    finally
                     {
-                      // We must actively remove the document here, because the getDocumentVersions()
-                      // method has no way of signalling this, since it does not do the fingerprinting.
-                      if (Logging.connectors.isDebugEnabled())
-                        Logging.connectors.debug("JCIFS: Decided to remove '"+documentIdentifier+"'");
-                      activities.deleteDocument(documentIdentifier, version);
-                      // We should record the access here as well, since this is a non-exception way through the code path.
-                      // (I noticed that this was not being recorded in the history while fixing 25477.)
-                      activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
-                        new Long(tempFile.length()),documentIdentifier,"Success",null,null);
+                      tempFile.delete();
                     }
                   }
-                  finally
+                  else
                   {
-                    tempFile.delete();
+                    if (Logging.connectors.isDebugEnabled())
+                      Logging.connectors.debug("JCIFS: Local file data not needed for '"+documentIdentifier+"'");
+
+                    // Presume that since the file was queued that it fulfilled the needed criteria.
+                    // Go off and ingest the fast way.
+
+                    // Ingest the document.
+                    InputStream inputStream = getFileInputStream(file);
+                    try
+                    {
+                      rd.setBinary(inputStream, fileLength(file));
+                      
+                      activities.ingestDocument(documentIdentifier, version, uri, rd);
+                    }
+                    finally
+                    {
+                      inputStream.close();
+                    }
+                    activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
+                      new Long(fileLength(file)),documentIdentifier,"Success",null,null);
                   }
                 }
                 else
                 {
-                  if (Logging.connectors.isDebugEnabled())
-                    Logging.connectors.debug("JCIFS: Local file data not needed for '"+documentIdentifier+"'");
-
-                  // Presume that since the file was queued that it fulfilled the needed criteria.
-                  // Go off and ingest the fast way.
-
-                  // Ingest the document.
-                  InputStream inputStream = getFileInputStream(file);
-                  try
-                  {
-                    RepositoryDocument rd = new RepositoryDocument();
-                    rd.setBinary(inputStream, fileLength(file));
-                    
-                    indexDocument(activities,rd,file,documentIdentifier,version);
-                  }
-                  finally
-                  {
-                    inputStream.close();
-                  }
-                  activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
-                    new Long(fileLength(file)),documentIdentifier,"Success",null,null);
+                  Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept it");
+                  activities.recordActivity(null,ACTIVITY_ACCESS,
+                    null,documentIdentifier,"Skip","Output connector refused",null);
                 }
               }
               else
@@ -965,8 +977,8 @@ public class SharedDriveConnector extend
 
   }
 
-  protected static void indexDocument(IProcessActivity activities, RepositoryDocument rd, SmbFile file, String documentIdentifier, String version)
-    throws ManifoldCFException, ServiceInterruption, SmbException
+  protected static String prepareForIndexing(RepositoryDocument rd, SmbFile file, String version)
+    throws ManifoldCFException, SmbException
   {
     String fileNameString = file.getName();
     Date lastModifiedDate = new Date(file.lastModified());
@@ -983,9 +995,9 @@ public class SharedDriveConnector extend
     index = setPathMetadata(rd,version,index);
     StringBuilder ingestURI = new StringBuilder();
     index = unpack(ingestURI,version,index,'+');
-    activities.ingestDocument(documentIdentifier, version, ingestURI.toString(), rd);
+    return ingestURI.toString();
   }
-
+  
   /** Map an extension to a mime type */
   protected static String mapExtensionToMimeType(String fileName)
   {
@@ -1601,12 +1613,13 @@ public class SharedDriveConnector extend
   /** Pretend that a file is either indexable or not, and return whether or not it would be ingested.
   * This is only ever called for files.
   *@param fileName is the canonical file name.
+  *@param url is the file's url.
   *@param documentSpecification is the specification.
   *@param pretendIndexable should be set to true if the document's contents would be fingerprinted as "indexable",
   *       or false otherwise.
   *@return true if the file would be ingested given the parameters.
   */
-  protected boolean wouldFileBeIncluded(String fileName, DocumentSpecification documentSpecification,
+  protected boolean wouldFileBeIncluded(IFingerprintActivity activities, String fileName, String url, DocumentSpecification documentSpecification,
     boolean pretendIndexable)
     throws ManifoldCFException
   {
@@ -1691,6 +1704,7 @@ public class SharedDriveConnector extend
                     isIndexable = false;
                   else
                   {
+                    // Evaluate the parts of being indexable that are based on the filename, mime type, and url
                     isIndexable = pretendIndexable;
                   }
 
@@ -1740,10 +1754,10 @@ public class SharedDriveConnector extend
   *@param documentSpecification is the document specification.
   *@return true if the file needs to be fingerprinted.
   */
-  protected boolean checkNeedFileData(String fileName, DocumentSpecification documentSpecification)
+  protected boolean checkNeedFileData(IFingerprintActivity activities, String fileName, String url, DocumentSpecification documentSpecification)
     throws ManifoldCFException
   {
-    return wouldFileBeIncluded(fileName,documentSpecification,true) != wouldFileBeIncluded(fileName,documentSpecification,false);
+    return wouldFileBeIncluded(activities,fileName,url,documentSpecification,true) != wouldFileBeIncluded(activities,fileName,url,documentSpecification,false);
   }
 
   /** Check if a file should be ingested, given a document specification and a local copy of the
@@ -1751,11 +1765,12 @@ public class SharedDriveConnector extend
   * file data by checkNeedFileData() will be checked by this method.
   *@param localFile is the file.
   *@param fileName is the JCIFS file name.
+  *@param url is the file's url.
   *@param documentSpecification is the specification.
   *@param activities are the activities available to determine indexability.
   *@return true if the file should be ingested.
   */
-  protected boolean checkIngest(File localFile, String fileName, DocumentSpecification documentSpecification, IFingerprintActivity activities)
+  protected boolean checkIngest(File localFile, String fileName, String url, DocumentSpecification documentSpecification, IFingerprintActivity activities)
     throws ManifoldCFException, ServiceInterruption
   {
     if (Logging.connectors.isDebugEnabled())

Modified: manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java?rev=1476320&r1=1476319&r2=1476320&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java (original)
+++ manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java Fri Apr 26 17:54:12 2013
@@ -340,9 +340,7 @@ public class OpenSearchServerConnector e
   @Override
   public boolean checkDocumentIndexable(String outputDescription, File localFile)
       throws ManifoldCFException, ServiceInterruption {
-    OpenSearchServerSpecs specs = getSpecsCache(outputDescription);
-    return specs
-        .checkExtension(FilenameUtils.getExtension(localFile.getName()));
+    return true;
   }
 
   @Override
@@ -352,6 +350,19 @@ public class OpenSearchServerConnector e
     return specs.checkMimeType(mimeType);
   }
 
+  /** Pre-determine whether a document's URL is indexable by this connector.  This method is used by participating repository connectors
+  * to help filter out documents that are not worth indexing.
+  *@param outputDescription is the document's output version.
+  *@param url is the URL of the document.
+  *@return true if the URL is indexable.
+  */
+  @Override
+  public boolean checkURLIndexable(String outputDescription, String url)
+    throws ManifoldCFException, ServiceInterruption {
+    OpenSearchServerSpecs specs = getSpecsCache(outputDescription);
+    return specs.checkExtension(FilenameUtils.getExtension(url));
+  }
+    
   @Override
   public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out,
       Locale locale, ConfigParams parameters) throws ManifoldCFException, IOException {