You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/07/18 11:33:15 UTC

svn commit: r1611573 - in /manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler: connectors/BaseRepositoryConnector.java interfaces/IProcessActivity.java system/WorkerThread.java

Author: kwright
Date: Fri Jul 18 09:33:14 2014
New Revision: 1611573

URL: http://svn.apache.org/r1611573
Log:
Simplify the IProcessActivity interface, so that connectors do not need to doubly inform the framework about their activities (or lack thereof)

Modified:
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java?rev=1611573&r1=1611572&r2=1611573&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java Fri Jul 18 09:33:14 2014
@@ -374,8 +374,7 @@ public abstract class BaseRepositoryConn
           }
           else
           {
-            // These documents have been checked and found NOT to need reprocessing
-            activities.noteUnchangedDocument(documentIdentifier);
+            // Document is unchanged.  We leave it up to the framework to decide what that means.
           }
           scanDocuments.add(documentIdentifier);
         }

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java?rev=1611573&r1=1611572&r2=1611573&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java Fri Jul 18 09:33:14 2014
@@ -31,15 +31,13 @@ import org.apache.manifoldcf.agents.inte
 * (3) For each document processed, there can be one of several dispositions:
 *   (a) There is no such document (anymore): deleteDocument() called for the document.
 *   (b) The document is (re)indexed: ingestDocumentWithException() is called for the document.
-*   (c) The document is determined to be unchanged and no updates are needed: noteUnchangedDocument() is called
+*   (c) The document is determined to be unchanged and no updates are needed: nothing needs to be called
 *     for the document.
 *   (d) The document is determined to be unchanged BUT the version string needs to be updated: recordDocument()
 *     is called for the document.
 *   (e) The document is determined to be unindexable BUT it still exists in the repository: noDocument()
 *    is called for the document.
 *   (f) There was a service interruption: ServiceInterruption is thrown.
-*   (g) Nothing is called describing the document's disposition.  In that case, for backwards compatibility,
-*    the framework marks the document as having been processed.
 * (4) In order to determine whether a document needs to be reindexed, the method checkDocumentNeedsReindexing()
 *    is available to return an opinion on that matter.
 */
@@ -158,15 +156,6 @@ public interface IProcessActivity extend
   public void ingestDocument(String documentIdentifier, String version, String documentURI, RepositoryDocument data)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Note the fact that a document exists but is unchanged, and nothing further
-  * needs to be done to it.
-  * Call this method if it is determined that the document in question is identical to
-  * the formerly indexed document, AND when the version string for the document
-  * has not changed either.
-  */
-  public void noteUnchangedDocument(String documentIdentifier)
-    throws ManifoldCFException;
-
   /** Remove the specified document from the search engine index, and update the
   * recorded version information for the document.
   *@param documentIdentifier is the document's local identifier.

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1611573&r1=1611572&r2=1611573&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java Fri Jul 18 09:33:14 2014
@@ -448,16 +448,13 @@ public class WorkerThread extends Thread
                             requeueList.add(qd);
                           }
                         }
-                        else if (activity.wasDocumentUnchanged(qd.getDocumentDescription().getDocumentIdentifier()))
-                        {
-
-                          finishList.add(qd);
-                          ingesterCheckList.add(qd.getDocumentDescription().getDocumentIdentifierHash());
-                        }
                         else
-                        {
-                          // All documents not specifically called out above are simply finished, since we know they haven't been deleted.
                           finishList.add(qd);
+                        
+                        // If the connector did not touch the document, add it to the ingester check list so documentCheckMultiple() can update it
+                        if (!activity.wasDocumentTouched(qd.getDocumentDescription().getDocumentIdentifier()))
+                        {
+                          ingesterCheckList.add(qd.getDocumentDescription().getDocumentIdentifierHash());
                         }
                       }
 
@@ -479,6 +476,8 @@ public class WorkerThread extends Thread
                           checkClasses[i] = connectionName;
                           checkIDs[i] = ingesterCheckList.get(i);
                         }
+                        // This method should exercise reasonable intelligence.  If the document has never been indexed, it should detect that
+                        // and stop.  Otherwise, it should update the statistics accordingly.
                         ingester.documentCheckMultiple(pipelineSpecificationBasic,checkClasses,checkIDs,currentTime);
                       }
 
@@ -1117,8 +1116,8 @@ public class WorkerThread extends Thread
     // Whether the document was aborted or not
     protected final Set<String> abortSet = new HashSet<String>();
 
-    // Whether the document was checked or not
-    protected final Set<String> documentCheckedSet = new HashSet<String>();
+    // Identifiers of documents that were touched (ingested, recorded, or removed from the index)
+    protected final Set<String> touchedSet = new HashSet<String>();
     
     // Whether document was deleted
     protected final Set<String> documentDeletedSet = new HashSet<String>();
@@ -1178,11 +1177,11 @@ public class WorkerThread extends Thread
       referenceList.clear();
     }
 
-    /** Check whether a document (and its version string) was unchanged or not.
+    /** Check whether a document (and its version string) was touched or not.
     */
-    public boolean wasDocumentUnchanged(String documentIdentifier)
+    public boolean wasDocumentTouched(String documentIdentifier)
     {
-      return documentCheckedSet.contains(documentIdentifier);
+      return touchedSet.contains(documentIdentifier);
     }
     
     /** Check whether document was deleted or not.
@@ -1408,19 +1407,6 @@ public class WorkerThread extends Thread
       return jobManager.retrieveParentDataAsFiles(jobID,ManifoldCF.hash(localIdentifier),dataName);
     }
 
-    /** Note the fact that a document exists but is unchanged, and nothing further
-    * needs to be done to it.
-    * Call this method if it is determined that the document in question is identical to
-    * the formerly indexed document, AND when the version string for the document
-    * has not changed either.
-    */
-    @Override
-    public void noteUnchangedDocument(String documentIdentifier)
-      throws ManifoldCFException
-    {
-      documentCheckedSet.add(documentIdentifier);
-    }
-
     /** Record a document version, but don't ingest it.
     *@param documentIdentifier is the document identifier.
     *@param version is the document version.
@@ -1434,6 +1420,7 @@ public class WorkerThread extends Thread
         pipelineSpecification.getBasicPipelineSpecification(),
         connectionName,documentIdentifierHash,
         version,currentTime);
+      touchedSet.add(documentIdentifier);
     }
 
     /** Ingest the current document.
@@ -1509,6 +1496,7 @@ public class WorkerThread extends Thread
         documentURI,
         ingestLogger);
       
+      touchedSet.add(documentIdentifier);
     }
 
     /** Remove the specified document from the search engine index, while keeping track of the version information
@@ -1529,6 +1517,8 @@ public class WorkerThread extends Thread
         connection.getACLAuthority(),
         currentTime,
         ingestLogger);
+      
+      touchedSet.add(documentIdentifier);
     }
 
     /** Delete the current document from the search engine index, while keeping track of the version information