You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/05/10 14:28:08 UTC

svn commit: r1593693 - in /manifoldcf/trunk: connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/ framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/ framework/pull-agent/src/main/java/...

Author: kwright
Date: Sat May 10 12:28:08 2014
New Revision: 1593693

URL: http://svn.apache.org/r1593693
Log:
Update Livelink connector to use chained model

Modified:
    manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java

Modified: manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java?rev=1593693&r1=1593692&r2=1593693&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java (original)
+++ manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java Sat May 10 12:28:08 2014
@@ -209,6 +209,17 @@ public class LivelinkConnector extends o
   {
   }
 
+  /** Tell the world what model this connector uses for getDocumentIdentifiers().
+  * This must return one of the MODEL_* values defined in IRepositoryConnector.
+  *@return the model type value.
+  */
+  @Override
+  public int getConnectorModel()
+  {
+    // Livelink is a chained hierarchy model
+    return MODEL_CHAINED_ADD_CHANGE;
+  }
+
   /** Connect.  The configuration parameters are included.
   *@param configParams are the configuration parameters for this connection.
   */

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java?rev=1593693&r1=1593692&r2=1593693&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java Sat May 10 12:28:08 2014
@@ -76,8 +76,9 @@ public interface IRepositoryConnector ex
   // MODEL_CHAINED_ADD_CHANGE_DELETE would be appropriate.  But, if a changed node can only discover child
   // additions and changes, then MODEL_CHAINED_ADD_CHANGE would be the right choice.
 
-  /** Supply all seeds every time.  The connector does not pay any attention to the start time or end time
-  * of the request, and simply returns a complete list of seeds. */
+  /** This is the legacy ManifoldCF catch-all crawling model.  All existing documents will be rechecked every time
+  * a crawl is done.  This model was typically used for connectors where seeds were essentially fixed and all
+  * real documents were discovered during crawling. */
   public static final int MODEL_ALL = 0;
   /** This indicates that the seeds are never complete; the previous seeds are lost and cannot be retrieved. */
   public static final int MODEL_PARTIAL = 4;

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java?rev=1593693&r1=1593692&r2=1593693&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java Sat May 10 12:28:08 2014
@@ -6125,12 +6125,15 @@ public class JobManager implements IJobM
         jobQueue.preparePartialScan(jobID);
       return;
     }
-    
-    // Similarly, minimal crawl attempts no delete phase unless the connector explicitly forbids it, or unless
-    // the job criteria have changed.
+
+    // Check whether a minimum crawl was requested.
+    // In general, a minimum crawl processes only what is seeded, and is always a partial scan.  MODEL_ALL disables
+    // this functionality, as does a scan from the beginning of time (after the job spec has been changed).
     if (requestMinimum && connectorModel != IRepositoryConnector.MODEL_ALL && !fromBeginningOfTime)
     {
-      // If it is a chained model, do the partial prep.
+      // Minimum crawl requested.
+      // If it is a chained model, do the partial prep.  If it's a non-chained model, do nothing for prep; the seeding
+      // will flag the documents we want to look at.
       if (connectorModel == IRepositoryConnector.MODEL_CHAINED_ADD ||
         connectorModel == IRepositoryConnector.MODEL_CHAINED_ADD_CHANGE)
         jobQueue.preparePartialScan(jobID);
@@ -6139,9 +6142,23 @@ public class JobManager implements IJobM
     
     if (!continuousJob && connectorModel != IRepositoryConnector.MODEL_PARTIAL &&
       (connectorModel == IRepositoryConnector.MODEL_ALL || fromBeginningOfTime))
+    {
+      // Prepare for a full scan if:
+      // (a) not a continuous job, and
+      // (b) not a partial model (which always disables full scans), and
+      // (c) either MODEL_ALL or from the beginning of time (which are essentially equivalent)
       prepareFullScan(jobID,legalLinkTypes,hopcountMethod);
+    }
     else
+    {
+      // Map COMPLETE and UNCHANGED to PENDINGPURGATORY, if:
+      // (a) job is continuous, OR
+      // (b) MODEL_PARTIAL, OR
+      // (c) not MODEL_ALL AND not from beginning of time
+      // This causes all existing documents to be rechecked!  This is needed because the model is not
+      // complete at this point; we have ADD but we don't have either CHANGE or DELETE.
       jobQueue.prepareIncrementalScan(jobID);
+    }
   }
 
   /** Queue all existing.