You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/11/26 01:01:26 UTC
svn commit: r1641727 - in /manifoldcf/branches/dev_1x: ./ framework/
framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/
framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/
Author: kwright
Date: Wed Nov 26 00:01:26 2014
New Revision: 1641727
URL: http://svn.apache.org/r1641727
Log:
Pull up fix for CONNECTORS-1115 from trunk.
Modified:
manifoldcf/branches/dev_1x/ (props changed)
manifoldcf/branches/dev_1x/CHANGES.txt
manifoldcf/branches/dev_1x/framework/ (props changed)
manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
Merged /manifoldcf/trunk:r1641724
Modified: manifoldcf/branches/dev_1x/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/CHANGES.txt?rev=1641727&r1=1641726&r2=1641727&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/CHANGES.txt (original)
+++ manifoldcf/branches/dev_1x/CHANGES.txt Wed Nov 26 00:01:26 2014
@@ -3,6 +3,10 @@ $Id$
======================= 1.8-dev =====================
+CONNECTORS-1115: Add ability to retain all components of a document,
+so that individual ones do not need to be specified.
+(Markus Schuch, Karl Wright)
+
CONNECTORS-1114: SQL exception calling removeDocument().
(Markus Schuch, Karl Wright)
Propchange: manifoldcf/branches/dev_1x/framework/
------------------------------------------------------------------------------
Merged /manifoldcf/trunk/framework:r1641724
Modified: manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java?rev=1641727&r1=1641726&r2=1641727&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java (original)
+++ manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java Wed Nov 26 00:01:26 2014
@@ -224,6 +224,14 @@ public interface IProcessActivity extend
String componentIdentifier)
throws ManifoldCFException;
+ /** Retain all existing document components of a primary document. Use this method to signal that
+ * no document components need to be reindexed. The default behavior is to remove
+ * components that are not mentioned during processing.
+ *@param documentIdentifier is the document's identifier.
+ */
+ public void retainAllComponentDocument(String documentIdentifier)
+ throws ManifoldCFException;
+
/** Record a document version, WITHOUT reindexing it, or removing it. (Other
* documents with the same URL, however, will still be removed.) This is
* useful if the version string changes but the document contents are known not
Modified: manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1641727&r1=1641726&r2=1641727&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java (original)
+++ manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java Wed Nov 26 00:01:26 2014
@@ -1157,6 +1157,8 @@ public class WorkerThread extends Thread
protected final Set<String> documentDeletedSet = new HashSet<String>();
// Whether a component was touched or not, keyed by document identifier.
+ // Documents for which *all* components are to be retained; each entry is a document identifier.
+ protected final Set<String> allComponentsSet = new HashSet<String>();
// This does not include primary document. The set is keyed by component id hash.
protected final Map<String,Set<String>> touchedComponentSet = new HashMap<String,Set<String>>();
// This represents primary documents.
@@ -1229,6 +1231,8 @@ public class WorkerThread extends Thread
public boolean wasDocumentComponentTouched(String documentIdentifier,
String componentIdentifierHash)
{
+ if (allComponentsSet.contains(documentIdentifier))
+ return true;
Set<String> components = touchedComponentSet.get(documentIdentifier);
if (components == null)
return false;
@@ -1711,6 +1715,19 @@ public class WorkerThread extends Thread
noDocument(documentIdentifier,version);
}
+ /** Retain all existing document components of a primary document. Use this method to signal that
+ * no document components need to be reindexed. The default behavior is to remove
+ * components that are not mentioned during processing.
+ * This disposition is first checked against those already recorded for the document: an
+ * IllegalStateException is thrown if the document has been aborted or deleted, or if an
+ * individual component of it has already been given a disposition.
+ *@param documentIdentifier is the document's identifier.
+ */
+ @Override
+ public void retainAllComponentDocument(String documentIdentifier)
+ throws ManifoldCFException
+ {
+ checkAllComponentsMultipleDispositions(documentIdentifier);
+ touchAllComponentsSet(documentIdentifier);
+ }
+
/** Delete the specified document from the search engine index, and from the status table. This
* method does NOT keep track of version
* information for the document and thus can lead to "churn", whereby the same document is queued, processed,
@@ -2124,6 +2141,17 @@ public class WorkerThread extends Thread
return ManifoldCF.createJobSpecificString(jobID,simpleString);
}
+ protected void checkAllComponentsMultipleDispositions(String documentIdentifier)
+ {
+ if (abortSet.contains(documentIdentifier))
+ throw new IllegalStateException("Multiple document dispositions not allowed: Abort cannot be combined with component disposition; document '"+documentIdentifier+"'");
+ if (documentDeletedSet.contains(documentIdentifier))
+ throw new IllegalStateException("Multiple document dispositions not allowed: Document delete cannot be combined with component disposition; document '"+documentIdentifier+"'");
+ Set<String> components = touchedComponentSet.get(documentIdentifier);
+ if (components != null && components.size() > 0)
+ throw new IllegalStateException("Multiple document dispositions not allowed: Retain all components cannot be combined with individual component disposition; document '"+documentIdentifier+"'");
+ }
+
protected void checkMultipleDispositions(String documentIdentifier, String componentIdentifier, String componentIdentifierHash)
{
if (abortSet.contains(documentIdentifier))
@@ -2138,12 +2166,19 @@ public class WorkerThread extends Thread
}
else
{
+ if (allComponentsSet.contains(documentIdentifier))
+ throw new IllegalStateException("Multiple document component dispositions not allowed: document '"+documentIdentifier+"', component '"+componentIdentifier+"'");
Set<String> components = touchedComponentSet.get(documentIdentifier);
if (components != null && components.contains(componentIdentifierHash))
throw new IllegalStateException("Multiple document component dispositions not allowed: document '"+documentIdentifier+"', component '"+componentIdentifier+"'");
}
}
+ protected void touchAllComponentsSet(String documentIdentifier)
+ {
+ allComponentsSet.add(documentIdentifier);
+ }
+
protected void touchComponentSet(String documentIdentifier, String componentIdentifierHash)
{
if (componentIdentifierHash == null)