You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/07/18 16:23:11 UTC
svn commit: r1611659 - in
/manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler:
interfaces/IProcessActivity.java system/WorkerThread.java
Author: kwright
Date: Fri Jul 18 14:23:11 2014
New Revision: 1611659
URL: http://svn.apache.org/r1611659
Log:
Define new process activity methods to support component IDs
Modified:
manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Modified: manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java?rev=1611659&r1=1611658&r2=1611659&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java (original)
+++ manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java Fri Jul 18 14:23:11 2014
@@ -56,6 +56,19 @@ public interface IProcessActivity extend
String newVersionString)
throws ManifoldCFException;
+ /** Check if a document needs to be reindexed, based on a computed version string.
+ * Call this method to determine whether reindexing is necessary. Pass in a newly-computed version
+ * string. This method will return "true" if the document needs to be re-indexed.
+ *@param documentIdentifier is the document identifier.
+ *@param componentIdentifier is the component document identifier, if any.
+ *@param newVersionString is the newly-computed version string.
+ *@return true if the document needs to be reindexed.
+ */
+ public boolean checkDocumentNeedsReindexing(String documentIdentifier,
+ String componentIdentifier,
+ String newVersionString)
+ throws ManifoldCFException;
+
/** Add a document description to the current job's queue.
*@param documentIdentifier is the local document identifier to add (for the connector that
* fetched the document).
@@ -139,7 +152,23 @@ public interface IProcessActivity extend
*@param data is the document data. The data is closed after ingestion is complete.
*@throws IOException only when data stream reading fails.
*/
- public void ingestDocumentWithException(String documentIdentifier, String version, String documentURI, RepositoryDocument data)
+ public void ingestDocumentWithException(String documentIdentifier,
+ String version, String documentURI, RepositoryDocument data)
+ throws ManifoldCFException, ServiceInterruption, IOException;
+
+ /** Ingest the current document.
+ *@param documentIdentifier is the document's identifier.
+ *@param componentIdentifier is the component document identifier, if any.
+ *@param version is the version of the document, as reported by the getDocumentVersions() method of the
+ * corresponding repository connector.
+ *@param documentURI is the URI to use to retrieve this document from the search interface (and is
+ * also the unique key in the index).
+ *@param data is the document data. The data is closed after ingestion is complete.
+ *@throws IOException only when data stream reading fails.
+ */
+ public void ingestDocumentWithException(String documentIdentifier,
+ String componentIdentifier,
+ String version, String documentURI, RepositoryDocument data)
throws ManifoldCFException, ServiceInterruption, IOException;
/** Ingest the current document.
@@ -161,17 +190,53 @@ public interface IProcessActivity extend
*@param documentIdentifier is the document's local identifier.
*@param version is the version string to be recorded for the document.
*/
- public void noDocument(String documentIdentifier, String version)
+ public void noDocument(String documentIdentifier,
+ String version)
throws ManifoldCFException, ServiceInterruption;
- /** Delete the specified document permanently from the search engine index, and from the status table.
+ /** Remove the specified document from the search engine index, and update the
+ * recorded version information for the document.
+ *@param documentIdentifier is the document's local identifier.
+ *@param componentIdentifier is the component document identifier, if any.
+ *@param version is the version string to be recorded for the document.
+ */
+ public void noDocument(String documentIdentifier,
+ String componentIdentifier,
+ String version)
+ throws ManifoldCFException, ServiceInterruption;
+
+ /** Remove the specified document component permanently from the search engine index, and from the status table.
* This method does NOT keep track of any document version information for the document and thus can
* lead to "churn", whereby the same document is queued, processed,
* and removed on subsequent crawls. It is therefore preferable to use noDocument() instead,
* in any case where the same decision will need to be made over and over.
*@param documentIdentifier is the document's identifier.
+ * This overload takes no component identifier; use removeDocument(documentIdentifier, componentIdentifier) to name one.
*/
- public void deleteDocument(String documentIdentifier)
+ public void removeDocument(String documentIdentifier)
+ throws ManifoldCFException, ServiceInterruption;
+
+ /** Remove the specified document component permanently from the search engine index, and from the status table.
+ * This method does NOT keep track of any document version information for the document and thus can
+ * lead to "churn", whereby the same document is queued, processed,
+ * and removed on subsequent crawls. It is therefore preferable to use noDocument() instead,
+ * in any case where the same decision will need to be made over and over.
+ *@param documentIdentifier is the document's identifier.
+ *@param componentIdentifier is the component document identifier, if any.
+ */
+ public void removeDocument(String documentIdentifier,
+ String componentIdentifier)
+ throws ManifoldCFException, ServiceInterruption;
+
+ /** Record a document version, WITHOUT reindexing it, or removing it. (Other
+ * documents with the same URL, however, will still be removed.) This is
+ * useful if the version string changes but the document contents are known not
+ * to have changed.
+ *@param documentIdentifier is the document identifier.
+ *@param version is the document version.
+ */
+ public void recordDocument(String documentIdentifier,
+ String version)
throws ManifoldCFException;
/** Record a document version, WITHOUT reindexing it, or removing it. (Other
@@ -179,9 +244,23 @@ public interface IProcessActivity extend
* useful if the version string changes but the document contents are known not
* to have changed.
*@param documentIdentifier is the document identifier.
+ *@param componentIdentifier is the component document identifier, if any.
*@param version is the document version.
*/
- public void recordDocument(String documentIdentifier, String version)
+ public void recordDocument(String documentIdentifier,
+ String componentIdentifier,
+ String version)
+ throws ManifoldCFException;
+
+ /** Delete the specified document permanently from the search engine index, and from the status table,
+ * along with all its components.
+ * This method does NOT keep track of any document version information for the document and thus can
+ * lead to "churn", whereby the same document is queued, processed,
+ * and removed on subsequent crawls. It is therefore preferable to use noDocument() instead,
+ * in any case where the same decision will need to be made over and over.
+ *@param documentIdentifier is the document's identifier.
+ */
+ public void deleteDocument(String documentIdentifier)
throws ManifoldCFException;
/** Delete the current document from the search engine index, while keeping track of the version information
@@ -194,7 +273,6 @@ public interface IProcessActivity extend
public void deleteDocument(String documentIdentifier, String version)
throws ManifoldCFException, ServiceInterruption;
-
/** Override the schedule for the next time a document is crawled.
* Calling this method allows you to set an upper recrawl bound, lower recrawl bound, upper expire bound, lower expire bound,
* or a combination of these, on a specific document. This method is only effective if the job is a continuous one, and if the
Modified: manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1611659&r1=1611658&r2=1611659&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java (original)
+++ manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java Fri Jul 18 14:23:11 2014
@@ -1316,6 +1316,24 @@ public class WorkerThread extends Thread
return ingester.checkFetchDocument(spec,newVersionString,parameterVersion,connection.getACLAuthority());
}
+ /** Check if a document needs to be reindexed, based on a computed version string.
+ * Call this method to determine whether reindexing is necessary. Pass in a newly-computed version
+ * string. This method will return "true" if the document needs to be re-indexed.
+ *@param documentIdentifier is the document identifier.
+ *@param componentIdentifier is the component document identifier, if any.
+ *@param newVersionString is the newly-computed version string.
+ *@return true if the document needs to be reindexed.
+ */
+ @Override
+ public boolean checkDocumentNeedsReindexing(String documentIdentifier,
+ String componentIdentifier,
+ String newVersionString)
+ throws ManifoldCFException
+ {
+ // MHL
+ return false;
+ }
+
/** Add a document description to the current job's queue.
*@param localIdentifier is the local document identifier to add (for the connector that
* fetched the document).
@@ -1423,6 +1441,23 @@ public class WorkerThread extends Thread
touchedSet.add(documentIdentifier);
}
+ /** Record a document version, WITHOUT reindexing it, or removing it. (Other
+ * documents with the same URL, however, will still be removed.) This is
+ * useful if the version string changes but the document contents are known not
+ * to have changed.
+ *@param documentIdentifier is the document identifier.
+ *@param componentIdentifier is the component document identifier, if any.
+ *@param version is the document version.
+ */
+ @Override
+ public void recordDocument(String documentIdentifier,
+ String componentIdentifier,
+ String version)
+ throws ManifoldCFException
+ {
+ // MHL
+ }
+
/** Ingest the current document.
*@param localIdentifier is the document's local identifier.
*@param version is the version of the document, as reported by the getDocumentVersions() method of the
@@ -1499,6 +1534,25 @@ public class WorkerThread extends Thread
touchedSet.add(documentIdentifier);
}
+ /** Ingest the current document.
+ *@param documentIdentifier is the document's identifier.
+ *@param componentIdentifier is the component document identifier, if any.
+ *@param version is the version of the document, as reported by the getDocumentVersions() method of the
+ * corresponding repository connector.
+ *@param documentURI is the URI to use to retrieve this document from the search interface (and is
+ * also the unique key in the index).
+ *@param data is the document data. The data is closed after ingestion is complete.
+ *@throws IOException only when data stream reading fails.
+ */
+ @Override
+ public void ingestDocumentWithException(String documentIdentifier,
+ String componentIdentifier,
+ String version, String documentURI, RepositoryDocument data)
+ throws ManifoldCFException, ServiceInterruption, IOException
+ {
+ // MHL
+ }
+
/** Remove the specified document from the search engine index, while keeping track of the version information
* for it (to reduce churn).
*@param documentIdentifier is the document's local identifier.
@@ -1521,6 +1575,61 @@ public class WorkerThread extends Thread
touchedSet.add(documentIdentifier);
}
+ /** Remove the specified document from the search engine index, and update the
+ * recorded version information for the document.
+ *@param documentIdentifier is the document's local identifier.
+ *@param componentIdentifier is the component document identifier, if any.
+ *@param version is the version string to be recorded for the document.
+ */
+ @Override
+ public void noDocument(String documentIdentifier,
+ String componentIdentifier,
+ String version)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // MHL
+ }
+
+ /** Remove the specified document component permanently from the search engine index, and from the status table.
+ * This method does NOT keep track of any document version information for the document and thus can
+ * lead to "churn", whereby the same document is queued, processed,
+ * and removed on subsequent crawls. It is therefore preferable to use noDocument() instead,
+ * in any case where the same decision will need to be made over and over.
+ *@param documentIdentifier is the document's identifier.
+ * This overload takes no component identifier; use removeDocument(documentIdentifier, componentIdentifier) to name one.
+ */
+ @Override
+ public void removeDocument(String documentIdentifier)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // Remove from incremental ingester ONLY.
+ String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
+ ingester.documentDelete(
+ pipelineSpecification.getBasicPipelineSpecification(),
+ connectionName,documentIdentifierHash,
+ ingestLogger);
+
+ // Note that we touched it, so it won't get checked
+ touchedSet.add(documentIdentifier);
+ }
+
+ /** Remove the specified document component permanently from the search engine index, and from the status table.
+ * This method does NOT keep track of any document version information for the document and thus can
+ * lead to "churn", whereby the same document is queued, processed,
+ * and removed on subsequent crawls. It is therefore preferable to use noDocument() instead,
+ * in any case where the same decision will need to be made over and over.
+ *@param documentIdentifier is the document's identifier.
+ *@param componentIdentifier is the component document identifier, if any.
+ */
+ @Override
+ public void removeDocument(String documentIdentifier,
+ String componentIdentifier)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // MHL
+ }
+
+
/** Delete the current document from the search engine index, while keeping track of the version information
* for it (to reduce churn).
* Use noDocument() above instead.