You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/07/24 11:52:54 UTC

svn commit: r1613055 - /manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java

Author: kwright
Date: Thu Jul 24 09:52:54 2014
New Revision: 1613055

URL: http://svn.apache.org/r1613055
Log:
Add some protection against poorly coded transformation connectors.  addOrReplaceDocument signals now must make it through the pipeline, or the framework will issue a noDocument() invocation itself.

Modified:
    manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java

Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java?rev=1613055&r1=1613054&r2=1613055&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java Thu Jul 24 09:52:54 2014
@@ -3189,9 +3189,17 @@ public class IncrementalIngester extends
     public int addOrReplaceDocumentWithException(String documentURI, RepositoryDocument document)
       throws ManifoldCFException, ServiceInterruption, IOException
     {
-      return pipelineConnector.addOrReplaceDocumentWithException(
+      // If the transformation connector doesn't do what it should, compensate!
+      MonitoredAddActivityWrapper wrapper = new MonitoredAddActivityWrapper(addActivity);
+      int rval = pipelineConnector.addOrReplaceDocumentWithException(
         documentURI,pipelineDescriptionString,
-        document,authorityNameString,addActivity);
+        document,authorityNameString,wrapper);
+      // The wrapper detects activity by the connector, so if we don't see either sendDocument() or
+      // noDocument(), we issue noDocument() ourselves.  If the connector was an output connector,
+      // this will wind up being a no-op, but otherwise it will guarantee that recording takes place.
+      if (!wrapper.wasDocumentActedUpon())
+        addActivity.noDocument();
+      return rval;
     }
     
     public void noDocument()
@@ -3729,4 +3737,137 @@ public class IncrementalIngester extends
     }
     
   }
+  
+  /** This class passes everything through, and monitors what happens so that the
+  * framework can compensate for any transformation connector coding errors.
+  */
+  protected static class MonitoredAddActivityWrapper implements IOutputAddActivity
+  {
+    protected final IOutputAddActivity activities;
+    
+    protected boolean documentProcessed = false;
+    
+    public MonitoredAddActivityWrapper(IOutputAddActivity activities)
+    {
+      this.activities = activities;
+    }
+    
+    public boolean wasDocumentActedUpon()
+    {
+      return documentProcessed;
+    }
+
+    /** Send a document via the pipeline to the next output connection.
+    *@param documentURI is the document's URI.
+    *@param document is the document data to be processed (handed to the output data store).
+    *@return the document status (accepted or permanently rejected); return codes are listed in IPipelineConnector.
+    *@throws IOException only if there's an IO error reading the data from the document.
+    */
+    @Override
+    public int sendDocument(String documentURI, RepositoryDocument document)
+      throws ManifoldCFException, ServiceInterruption, IOException
+    {
+      int rval = activities.sendDocument(documentURI,document);
+      documentProcessed = true;
+      return rval;
+    }
+
+    /** Send NO document via the pipeline to the next output connection.  This is equivalent
+    * to sending an empty document placeholder.
+    */
+    @Override
+    public void noDocument()
+      throws ManifoldCFException, ServiceInterruption
+    {
+      activities.noDocument();
+      documentProcessed = true;
+    }
+
+    /** Qualify an access token appropriately, to match access tokens as returned by mod_aa.  This method
+    * includes the authority name with the access token, if any, so that each authority may establish its own token space.
+    *@param authorityNameString is the name of the authority to use to qualify the access token.
+    *@param accessToken is the raw, repository access token.
+    *@return the properly qualified access token.
+    */
+    @Override
+    public String qualifyAccessToken(String authorityNameString, String accessToken)
+      throws ManifoldCFException
+    {
+      return activities.qualifyAccessToken(authorityNameString,accessToken);
+    }
+
+    /** Record time-stamped information about the activity of the output connector.
+    *@param startTime is either null or the time since the start of epoch in milliseconds (Jan 1, 1970).  Every
+    *       activity has an associated time; the startTime field records when the activity began.  A null value
+    *       indicates that the start time and the finishing time are the same.
+    *@param activityType is a string which is fully interpretable only in the context of the connector involved, which is
+    *       used to categorize what kind of activity is being recorded.  For example, a web connector might record a
+    *       "fetch document" activity.  Cannot be null.
+    *@param dataSize is the number of bytes of data involved in the activity, or null if not applicable.
+    *@param entityURI is a (possibly long) string which identifies the object involved in the history record.
+    *       The interpretation of this field will differ from connector to connector.  May be null.
+    *@param resultCode contains a terse description of the result of the activity.  The description is limited in
+    *       size to 255 characters, and can be interpreted only in the context of the current connector.  May be null.
+    *@param resultDescription is a (possibly long) human-readable string which adds detail, if required, to the result
+    *       described in the resultCode field.  This field is not meant to be queried on.  May be null.
+    */
+    @Override
+    public void recordActivity(Long startTime, String activityType, Long dataSize,
+      String entityURI, String resultCode, String resultDescription)
+      throws ManifoldCFException
+    {
+      activities.recordActivity(startTime,activityType,dataSize,entityURI,resultCode,resultDescription);
+    }
+
+    /** Detect if a mime type is acceptable downstream or not.  This method is used to determine whether it makes sense to fetch a document
+    * in the first place.
+    *@param mimeType is the mime type of the document.
+    *@return true if the mime type can be accepted by the downstream connection.
+    */
+    @Override
+    public boolean checkMimeTypeIndexable(String mimeType)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      return activities.checkMimeTypeIndexable(mimeType);
+    }
+
+    /** Pre-determine whether a document (passed here as a File object) is acceptable downstream.  This method is
+    * used to determine whether a document needs to be actually transferred.  This hook is provided mainly to support
+    * search engines that only handle a small set of accepted file types.
+    *@param localFile is the local file to check.
+    *@return true if the file is acceptable by the downstream connection.
+    */
+    @Override
+    public boolean checkDocumentIndexable(File localFile)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      return activities.checkDocumentIndexable(localFile);
+    }
+
+    /** Pre-determine whether a document's length is acceptable downstream.  This method is used
+    * to determine whether to fetch a document in the first place.
+    *@param length is the length of the document.
+    *@return true if the file is acceptable by the downstream connection.
+    */
+    @Override
+    public boolean checkLengthIndexable(long length)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      return activities.checkLengthIndexable(length);
+    }
+
+    /** Pre-determine whether a document's URL is acceptable downstream.  This method is used
+    * to help filter out documents that cannot be indexed in advance.
+    *@param url is the URL of the document.
+    *@return true if the file is acceptable by the downstream connection.
+    */
+    @Override
+    public boolean checkURLIndexable(String url)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      return activities.checkURLIndexable(url);
+    }
+
+  }
+  
 }