You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/05/10 14:28:08 UTC
svn commit: r1593693 - in /manifoldcf/trunk:
connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/
framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/
framework/pull-agent/src/main/java/...
Author: kwright
Date: Sat May 10 12:28:08 2014
New Revision: 1593693
URL: http://svn.apache.org/r1593693
Log:
Update Livelink connector to use chained model
Modified:
manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java
Modified: manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java?rev=1593693&r1=1593692&r2=1593693&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java (original)
+++ manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java Sat May 10 12:28:08 2014
@@ -209,6 +209,17 @@ public class LivelinkConnector extends o
{
}
+ /** Tell the world what model this connector uses for getDocumentIdentifiers().
+ * This must return a model value as specified above.
+ *@return the model type value.
+ */
+ @Override
+ public int getConnectorModel()
+ {
+ // Livelink is a chained hierarchy model
+ return MODEL_CHAINED_ADD_CHANGE;
+ }
+
/** Connect. The configuration parameters are included.
*@param configParams are the configuration parameters for this connection.
*/
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java?rev=1593693&r1=1593692&r2=1593693&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java Sat May 10 12:28:08 2014
@@ -76,8 +76,9 @@ public interface IRepositoryConnector ex
// MODEL_CHAINED_ADD_CHANGE_DELETE would be appropriate. But, if a changed node can only discover child
// additions and changes, then MODEL_CHAINED_ADD_CHANGE would be the right choice.
- /** Supply all seeds every time. The connector does not pay any attention to the start time or end time
- * of the request, and simply returns a complete list of seeds. */
+ /** This is the legacy ManifoldCF catch-all crawling model. All existing documents will be rechecked every time
+ * a crawl is done. This model was typically used for connectors where seeds were essentially fixed and all
+ * real documents were discovered during crawling. */
public static final int MODEL_ALL = 0;
/** This indicates that the seeds are never complete; the previous seeds are lost and cannot be retrieved. */
public static final int MODEL_PARTIAL = 4;
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java?rev=1593693&r1=1593692&r2=1593693&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java Sat May 10 12:28:08 2014
@@ -6125,12 +6125,15 @@ public class JobManager implements IJobM
jobQueue.preparePartialScan(jobID);
return;
}
-
- // Similarly, minimal crawl attempts no delete phase unless the connector explicitly forbids it, or unless
- // the job criteria have changed.
+
+ // Look for a minimum crawl.
+ // Minimum crawls do only what is seeded, in general. These are partial scans, always. MODEL_ALL disables this
+ // functionality, as does a scan from the beginning of time (after the job spec has been changed).
if (requestMinimum && connectorModel != IRepositoryConnector.MODEL_ALL && !fromBeginningOfTime)
{
- // If it is a chained model, do the partial prep.
+ // Minimum crawl requested.
+ // If it is a chained model, do the partial prep. If it's a non-chained model, do nothing for prep; the seeding
+ // will flag the documents we want to look at.
if (connectorModel == IRepositoryConnector.MODEL_CHAINED_ADD ||
connectorModel == IRepositoryConnector.MODEL_CHAINED_ADD_CHANGE)
jobQueue.preparePartialScan(jobID);
@@ -6139,9 +6142,23 @@ public class JobManager implements IJobM
if (!continuousJob && connectorModel != IRepositoryConnector.MODEL_PARTIAL &&
(connectorModel == IRepositoryConnector.MODEL_ALL || fromBeginningOfTime))
+ {
+ // Prepare for a full scan if:
+ // (a) not a continuous job, and
+ // (b) not a partial model (which always disables full scans), and
+ // (c) either MODEL_ALL or from the beginning of time (which are essentially equivalent)
prepareFullScan(jobID,legalLinkTypes,hopcountMethod);
+ }
else
+ {
+ // Map COMPLETE and UNCHANGED to PENDINGPURGATORY, if:
+ // (a) job is continuous, OR
+ // (b) MODEL_PARTIAL, OR
+ // (c) not MODEL_ALL AND not from beginning of time
+ // This causes all existing documents to be rechecked! This is needed because the model is not
+ // complete at this point; we have ADD but we don't have either CHANGE or DELETE.
jobQueue.prepareIncrementalScan(jobID);
+ }
}
/** Queue all existing.