You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/07/18 16:23:11 UTC

svn commit: r1611659 - in /manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler: interfaces/IProcessActivity.java system/WorkerThread.java

Author: kwright
Date: Fri Jul 18 14:23:11 2014
New Revision: 1611659

URL: http://svn.apache.org/r1611659
Log:
Define new process activity methods to support component IDs

Modified:
    manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
    manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java

Modified: manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java?rev=1611659&r1=1611658&r2=1611659&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java (original)
+++ manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java Fri Jul 18 14:23:11 2014
@@ -56,6 +56,19 @@ public interface IProcessActivity extend
     String newVersionString)
     throws ManifoldCFException;
 
+  /** Check if a document needs to be reindexed, based on a computed version string.
+  * Call this method to determine whether reindexing is necessary.  Pass in a newly-computed version
+  * string.  This method will return "true" if the document needs to be re-indexed.
+  *@param documentIdentifier is the document identifier.
+  *@param componentIdentifier is the component document identifier, if any.
+  *@param newVersionString is the newly-computed version string.
+  *@return true if the document needs to be reindexed.
+  */
+  public boolean checkDocumentNeedsReindexing(String documentIdentifier,
+    String componentIdentifier,
+    String newVersionString)
+    throws ManifoldCFException;
+
   /** Add a document description to the current job's queue.
   *@param documentIdentifier is the local document identifier to add (for the connector that
   * fetched the document).
@@ -139,7 +152,23 @@ public interface IProcessActivity extend
   *@param data is the document data.  The data is closed after ingestion is complete.
   *@throws IOException only when data stream reading fails.
   */
-  public void ingestDocumentWithException(String documentIdentifier, String version, String documentURI, RepositoryDocument data)
+  public void ingestDocumentWithException(String documentIdentifier,
+    String version, String documentURI, RepositoryDocument data)
+    throws ManifoldCFException, ServiceInterruption, IOException;
+
+  /** Ingest the current document.
+  *@param documentIdentifier is the document's identifier.
+  *@param componentIdentifier is the component document identifier, if any.
+  *@param version is the version of the document, as reported by the getDocumentVersions() method of the
+  *       corresponding repository connector.
+  *@param documentURI is the URI to use to retrieve this document from the search interface (and is
+  *       also the unique key in the index).
+  *@param data is the document data.  The data is closed after ingestion is complete.
+  *@throws IOException only when data stream reading fails.
+  */
+  public void ingestDocumentWithException(String documentIdentifier,
+    String componentIdentifier,
+    String version, String documentURI, RepositoryDocument data)
     throws ManifoldCFException, ServiceInterruption, IOException;
 
   /** Ingest the current document.
@@ -161,17 +190,53 @@ public interface IProcessActivity extend
   *@param documentIdentifier is the document's local identifier.
   *@param version is the version string to be recorded for the document.
   */
-  public void noDocument(String documentIdentifier, String version)
+  public void noDocument(String documentIdentifier,
+    String version)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Delete the specified document permanently from the search engine index, and from the status table.
+  /** Remove the specified document from the search engine index, and update the
+  * recorded version information for the document.
+  *@param documentIdentifier is the document's local identifier.
+  *@param componentIdentifier is the component document identifier, if any.
+  *@param version is the version string to be recorded for the document.
+  */
+  public void noDocument(String documentIdentifier,
+    String componentIdentifier,
+    String version)
+    throws ManifoldCFException, ServiceInterruption;
+
+  /** Remove the specified document component permanently from the search engine index, and from the status table.
   * This method does NOT keep track of any document version information for the document and thus can
   * lead to "churn", whereby the same document is queued, processed,
   * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
   * in any case where the same decision will need to be made over and over.
   *@param documentIdentifier is the document's identifier.
+  * (This overload takes no component identifier.)
   */
-  public void deleteDocument(String documentIdentifier)
+  public void removeDocument(String documentIdentifier)
+    throws ManifoldCFException, ServiceInterruption;
+
+  /** Remove the specified document component permanently from the search engine index, and from the status table.
+  * This method does NOT keep track of any document version information for the document and thus can
+  * lead to "churn", whereby the same document is queued, processed,
+  * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
+  * in any case where the same decision will need to be made over and over.
+  *@param documentIdentifier is the document's identifier.
+  *@param componentIdentifier is the component document identifier, if any.
+  */
+  public void removeDocument(String documentIdentifier,
+    String componentIdentifier)
+    throws ManifoldCFException, ServiceInterruption;
+
+  /** Record a document version, WITHOUT reindexing it, or removing it.  (Other
+  * documents with the same URL, however, will still be removed.)  This is
+  * useful if the version string changes but the document contents are known not
+  * to have changed.
+  *@param documentIdentifier is the document identifier.
+  *@param version is the document version.
+  */
+  public void recordDocument(String documentIdentifier,
+    String version)
     throws ManifoldCFException;
 
   /** Record a document version, WITHOUT reindexing it, or removing it.  (Other
@@ -179,9 +244,23 @@ public interface IProcessActivity extend
   * useful if the version string changes but the document contents are known not
   * to have changed.
   *@param documentIdentifier is the document identifier.
+  *@param componentIdentifier is the component document identifier, if any.
   *@param version is the document version.
   */
-  public void recordDocument(String documentIdentifier, String version)
+  public void recordDocument(String documentIdentifier,
+    String componentIdentifier,
+    String version)
+    throws ManifoldCFException;
+
+  /** Delete the specified document permanently from the search engine index, and from the status table,
+  * along with all its components.
+  * This method does NOT keep track of any document version information for the document and thus can
+  * lead to "churn", whereby the same document is queued, processed,
+  * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
+  * in any case where the same decision will need to be made over and over.
+  *@param documentIdentifier is the document's identifier.
+  */
+  public void deleteDocument(String documentIdentifier)
     throws ManifoldCFException;
 
   /** Delete the current document from the search engine index, while keeping track of the version information
@@ -194,7 +273,6 @@ public interface IProcessActivity extend
   public void deleteDocument(String documentIdentifier, String version)
     throws ManifoldCFException, ServiceInterruption;
 
-
   /** Override the schedule for the next time a document is crawled.
   * Calling this method allows you to set an upper recrawl bound, lower recrawl bound, upper expire bound, lower expire bound,
   * or a combination of these, on a specific document.  This method is only effective if the job is a continuous one, and if the

Modified: manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1611659&r1=1611658&r2=1611659&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java (original)
+++ manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java Fri Jul 18 14:23:11 2014
@@ -1316,6 +1316,24 @@ public class WorkerThread extends Thread
       return ingester.checkFetchDocument(spec,newVersionString,parameterVersion,connection.getACLAuthority());
     }
 
+    /** Check if a document needs to be reindexed, based on a computed version string.
+    * Call this method to determine whether reindexing is necessary.  Pass in a newly-computed version
+    * string.  This method will return "true" if the document needs to be re-indexed.
+    *@param documentIdentifier is the document identifier.
+    *@param componentIdentifier is the component document identifier, if any.
+    *@param newVersionString is the newly-computed version string.
+    *@return true if the document needs to be reindexed.
+    */
+    @Override
+    public boolean checkDocumentNeedsReindexing(String documentIdentifier,
+      String componentIdentifier,
+      String newVersionString)
+      throws ManifoldCFException
+    {
+      // MHL
+      return false;
+    }
+
     /** Add a document description to the current job's queue.
     *@param localIdentifier is the local document identifier to add (for the connector that
     * fetched the document).
@@ -1423,6 +1441,23 @@ public class WorkerThread extends Thread
       touchedSet.add(documentIdentifier);
     }
 
+    /** Record a document version, WITHOUT reindexing it, or removing it.  (Other
+    * documents with the same URL, however, will still be removed.)  This is
+    * useful if the version string changes but the document contents are known not
+    * to have changed.
+    *@param documentIdentifier is the document identifier.
+    *@param componentIdentifier is the component document identifier, if any.
+    *@param version is the document version.
+    */
+    @Override
+    public void recordDocument(String documentIdentifier,
+      String componentIdentifier,
+      String version)
+      throws ManifoldCFException
+    {
+      // MHL
+    }
+
     /** Ingest the current document.
     *@param localIdentifier is the document's local identifier.
     *@param version is the version of the document, as reported by the getDocumentVersions() method of the
@@ -1499,6 +1534,25 @@ public class WorkerThread extends Thread
       touchedSet.add(documentIdentifier);
     }
 
+    /** Ingest the current document.
+    *@param documentIdentifier is the document's identifier.
+    *@param componentIdentifier is the component document identifier, if any.
+    *@param version is the version of the document, as reported by the getDocumentVersions() method of the
+    *       corresponding repository connector.
+    *@param documentURI is the URI to use to retrieve this document from the search interface (and is
+    *       also the unique key in the index).
+    *@param data is the document data.  The data is closed after ingestion is complete.
+    *@throws IOException only when data stream reading fails.
+    */
+    @Override
+    public void ingestDocumentWithException(String documentIdentifier,
+      String componentIdentifier,
+      String version, String documentURI, RepositoryDocument data)
+      throws ManifoldCFException, ServiceInterruption, IOException
+    {
+      // MHL
+    }
+
     /** Remove the specified document from the search engine index, while keeping track of the version information
     * for it (to reduce churn).
     *@param documentIdentifier is the document's local identifier.
@@ -1521,6 +1575,61 @@ public class WorkerThread extends Thread
       touchedSet.add(documentIdentifier);
     }
 
+    /** Remove the specified document from the search engine index, and update the
+    * recorded version information for the document.
+    *@param documentIdentifier is the document's local identifier.
+    *@param componentIdentifier is the component document identifier, if any.
+    *@param version is the version string to be recorded for the document.
+    */
+    @Override
+    public void noDocument(String documentIdentifier,
+      String componentIdentifier,
+      String version)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      // MHL
+    }
+
+    /** Remove the specified document component permanently from the search engine index, and from the status table.
+    * This method does NOT keep track of any document version information for the document and thus can
+    * lead to "churn", whereby the same document is queued, processed,
+    * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
+    * in any case where the same decision will need to be made over and over.
+    *@param documentIdentifier is the document's identifier.
+    * (This overload takes no component identifier.)
+    */
+    @Override
+    public void removeDocument(String documentIdentifier)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      // Remove from incremental ingester ONLY.
+      String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
+      ingester.documentDelete(
+        pipelineSpecification.getBasicPipelineSpecification(),
+        connectionName,documentIdentifierHash,
+        ingestLogger);
+      
+      // Note that we touched it, so it won't get checked
+      touchedSet.add(documentIdentifier);
+    }
+
+    /** Remove the specified document component permanently from the search engine index, and from the status table.
+    * This method does NOT keep track of any document version information for the document and thus can
+    * lead to "churn", whereby the same document is queued, processed,
+    * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
+    * in any case where the same decision will need to be made over and over.
+    *@param documentIdentifier is the document's identifier.
+    *@param componentIdentifier is the component document identifier, if any.
+    */
+    @Override
+    public void removeDocument(String documentIdentifier,
+      String componentIdentifier)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      // MHL
+    }
+
+
     /** Delete the current document from the search engine index, while keeping track of the version information
     * for it (to reduce churn).
     * Use noDocument() above instead.