You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/07/18 11:33:15 UTC
svn commit: r1611573 - in
/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler:
connectors/BaseRepositoryConnector.java interfaces/IProcessActivity.java
system/WorkerThread.java
Author: kwright
Date: Fri Jul 18 09:33:14 2014
New Revision: 1611573
URL: http://svn.apache.org/r1611573
Log:
Simplify the IProcessActivity interface so that connectors no longer need to inform the framework twice about their activities (or lack thereof).
Modified:
manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java?rev=1611573&r1=1611572&r2=1611573&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java Fri Jul 18 09:33:14 2014
@@ -374,8 +374,7 @@ public abstract class BaseRepositoryConn
}
else
{
- // These documents have been checked and found NOT to need reprocessing
- activities.noteUnchangedDocument(documentIdentifier);
+ // Document is unchanged. We leave it up to the framework to decide what that means.
}
scanDocuments.add(documentIdentifier);
}
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java?rev=1611573&r1=1611572&r2=1611573&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java Fri Jul 18 09:33:14 2014
@@ -31,15 +31,13 @@ import org.apache.manifoldcf.agents.inte
* (3) For each document processed, there can be one of several dispositions:
* (a) There is no such document (anymore): deleteDocument() called for the document.
* (b) The document is (re)indexed: ingestDocumentWithException() is called for the document.
-* (c) The document is determined to be unchanged and no updates are needed: noteUnchangedDocument() is called
+* (c) The document is determined to be unchanged and no updates are needed: nothing needs to be called
* for the document.
* (d) The document is determined to be unchanged BUT the version string needs to be updated: recordDocument()
* is called for the document.
* (e) The document is determined to be unindexable BUT it still exists in the repository: noDocument()
* is called for the document.
* (f) There was a service interruption: ServiceInterruption is thrown.
-* (g) Nothing is called describing the document's disposition. In that case, for backwards compatibility,
-* the framework marks the document as having been processed.
* (4) In order to determine whether a document needs to be reindexed, the method checkDocumentNeedsReindexing()
* is available to return an opinion on that matter.
*/
@@ -158,15 +156,6 @@ public interface IProcessActivity extend
public void ingestDocument(String documentIdentifier, String version, String documentURI, RepositoryDocument data)
throws ManifoldCFException, ServiceInterruption;
- /** Note the fact that a document exists but is unchanged, and nothing further
- * needs to be done to it.
- * Call this method if it is determined that the document in question is identical to
- * the formerly indexed document, AND when the version string for the document
- * has not changed either.
- */
- public void noteUnchangedDocument(String documentIdentifier)
- throws ManifoldCFException;
-
/** Remove the specified document from the search engine index, and update the
* recorded version information for the document.
*@param documentIdentifier is the document's local identifier.
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1611573&r1=1611572&r2=1611573&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java Fri Jul 18 09:33:14 2014
@@ -448,16 +448,13 @@ public class WorkerThread extends Thread
requeueList.add(qd);
}
}
- else if (activity.wasDocumentUnchanged(qd.getDocumentDescription().getDocumentIdentifier()))
- {
-
- finishList.add(qd);
- ingesterCheckList.add(qd.getDocumentDescription().getDocumentIdentifierHash());
- }
else
- {
- // All documents not specifically called out above are simply finished, since we know they haven't been deleted.
finishList.add(qd);
+
+ // Note whether the document was untouched; if so, update it
+ if (!activity.wasDocumentTouched(qd.getDocumentDescription().getDocumentIdentifier()))
+ {
+ ingesterCheckList.add(qd.getDocumentDescription().getDocumentIdentifierHash());
}
}
@@ -479,6 +476,8 @@ public class WorkerThread extends Thread
checkClasses[i] = connectionName;
checkIDs[i] = ingesterCheckList.get(i);
}
+ // This method should exercise reasonable intelligence. If the document has never been indexed, it should detect that
+ // and stop. Otherwise, it should update the statistics accordingly.
ingester.documentCheckMultiple(pipelineSpecificationBasic,checkClasses,checkIDs,currentTime);
}
@@ -1117,8 +1116,8 @@ public class WorkerThread extends Thread
// Whether the document was aborted or not
protected final Set<String> abortSet = new HashSet<String>();
- // Whether the document was checked or not
- protected final Set<String> documentCheckedSet = new HashSet<String>();
+ // Whether the document was touched or not
+ protected final Set<String> touchedSet = new HashSet<String>();
// Whether document was deleted
protected final Set<String> documentDeletedSet = new HashSet<String>();
@@ -1178,11 +1177,11 @@ public class WorkerThread extends Thread
referenceList.clear();
}
- /** Check whether a document (and its version string) was unchanged or not.
+ /** Check whether a document (and its version string) was touched or not.
*/
- public boolean wasDocumentUnchanged(String documentIdentifier)
+ public boolean wasDocumentTouched(String documentIdentifier)
{
- return documentCheckedSet.contains(documentIdentifier);
+ return touchedSet.contains(documentIdentifier);
}
/** Check whether document was deleted or not.
@@ -1408,19 +1407,6 @@ public class WorkerThread extends Thread
return jobManager.retrieveParentDataAsFiles(jobID,ManifoldCF.hash(localIdentifier),dataName);
}
- /** Note the fact that a document exists but is unchanged, and nothing further
- * needs to be done to it.
- * Call this method if it is determined that the document in question is identical to
- * the formerly indexed document, AND when the version string for the document
- * has not changed either.
- */
- @Override
- public void noteUnchangedDocument(String documentIdentifier)
- throws ManifoldCFException
- {
- documentCheckedSet.add(documentIdentifier);
- }
-
/** Record a document version, but don't ingest it.
*@param documentIdentifier is the document identifier.
*@param version is the document version.
@@ -1434,6 +1420,7 @@ public class WorkerThread extends Thread
pipelineSpecification.getBasicPipelineSpecification(),
connectionName,documentIdentifierHash,
version,currentTime);
+ touchedSet.add(documentIdentifier);
}
/** Ingest the current document.
@@ -1509,6 +1496,7 @@ public class WorkerThread extends Thread
documentURI,
ingestLogger);
+ touchedSet.add(documentIdentifier);
}
/** Remove the specified document from the search engine index, while keeping track of the version information
@@ -1529,6 +1517,8 @@ public class WorkerThread extends Thread
connection.getACLAuthority(),
currentTime,
ingestLogger);
+
+ touchedSet.add(documentIdentifier);
}
/** Delete the current document from the search engine index, while keeping track of the version information