You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/07/05 20:53:02 UTC
svn commit: r1608109 [2/2] - in /manifoldcf/trunk:
connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/
connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation...
Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java Sat Jul 5 18:53:01 2014
@@ -72,16 +72,7 @@ public interface IIncrementalIngester
*@param spec is the output specification.
*@return the description string.
*/
- @Deprecated
- public String getOutputDescription(String outputConnectionName, OutputSpecification spec)
- throws ManifoldCFException, ServiceInterruption;
-
- /** Get an output version string for a document.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param spec is the output specification.
- *@return the description string.
- */
- public String getOutputDescription(String outputConnectionName, Specification spec)
+ public VersionContext getOutputDescription(String outputConnectionName, Specification spec)
throws ManifoldCFException, ServiceInterruption;
/** Get transformation version string for a document.
@@ -89,7 +80,7 @@ public interface IIncrementalIngester
*@param spec is the transformation specification.
*@return the description string.
*/
- public String getTransformationDescription(String transformationConnectionName, Specification spec)
+ public VersionContext getTransformationDescription(String transformationConnectionName, Specification spec)
throws ManifoldCFException, ServiceInterruption;
/** Check if a mime type is indexable.
@@ -104,16 +95,6 @@ public interface IIncrementalIngester
IOutputCheckActivity activity)
throws ManifoldCFException, ServiceInterruption;
- /** Check if a mime type is indexable.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param outputDescription is the output description string.
- *@param mimeType is the mime type to check.
- *@return true if the mimeType is indexable.
- */
- @Deprecated
- public boolean checkMimeTypeIndexable(String outputConnectionName, String outputDescription, String mimeType)
- throws ManifoldCFException, ServiceInterruption;
-
/** Check if a file is indexable.
*@param pipelineSpecification is the pipeline specification.
*@param localFile is the local file to check.
@@ -126,16 +107,6 @@ public interface IIncrementalIngester
IOutputCheckActivity activity)
throws ManifoldCFException, ServiceInterruption;
- /** Check if a file is indexable.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param outputDescription is the output description string.
- *@param localFile is the local file to check.
- *@return true if the local file is indexable.
- */
- @Deprecated
- public boolean checkDocumentIndexable(String outputConnectionName, String outputDescription, File localFile)
- throws ManifoldCFException, ServiceInterruption;
-
/** Pre-determine whether a document's length is indexable by this connector. This method is used by participating repository connectors
* to help filter out documents that are too long to be indexable.
*@param pipelineSpecification is the pipeline specification.
@@ -149,17 +120,6 @@ public interface IIncrementalIngester
IOutputCheckActivity activity)
throws ManifoldCFException, ServiceInterruption;
- /** Pre-determine whether a document's length is indexable by this connector. This method is used by participating repository connectors
- * to help filter out documents that are too long to be indexable.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param outputDescription is the output description string.
- *@param length is the length of the document.
- *@return true if the file is indexable.
- */
- @Deprecated
- public boolean checkLengthIndexable(String outputConnectionName, String outputDescription, long length)
- throws ManifoldCFException, ServiceInterruption;
-
/** Pre-determine whether a document's URL is indexable by this connector. This method is used by participating repository connectors
* to help filter out documents that are not indexable.
*@param pipelineSpecification is the pipeline specification.
@@ -173,17 +133,6 @@ public interface IIncrementalIngester
IOutputCheckActivity activity)
throws ManifoldCFException, ServiceInterruption;
- /** Pre-determine whether a document's URL is indexable by this connector. This method is used by participating repository connectors
- * to help filter out documents that not indexable.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param outputDescription is the output description string.
- *@param url is the url of the document.
- *@return true if the file is indexable.
- */
- @Deprecated
- public boolean checkURLIndexable(String outputConnectionName, String outputDescription, String url)
- throws ManifoldCFException, ServiceInterruption;
-
/** Determine whether we need to fetch or refetch a document.
* Pass in information including the pipeline specification with existing version info, plus new document and parameter version strings.
* If no outputs need to be updated, then this method will return false. If any outputs need updating, then true is returned.
@@ -203,23 +152,6 @@ public interface IIncrementalIngester
/** Record a document version, but don't ingest it.
* The purpose of this method is to keep track of the frequency at which ingestion "attempts" take place.
* ServiceInterruption is thrown if this action must be rescheduled.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClass is the name of the space in which the identifier hash should be interpreted.
- *@param identifierHash is the hashed document identifier.
- *@param documentVersion is the document version.
- *@param recordTime is the time at which the recording took place, in milliseconds since epoch.
- *@param activities is the object used in case a document needs to be removed from the output index as the result of this operation.
- */
- @Deprecated
- public void documentRecord(String outputConnectionName,
- String identifierClass, String identifierHash,
- String documentVersion, long recordTime,
- IOutputActivity activities)
- throws ManifoldCFException, ServiceInterruption;
-
- /** Record a document version, but don't ingest it.
- * The purpose of this method is to keep track of the frequency at which ingestion "attempts" take place.
- * ServiceInterruption is thrown if this action must be rescheduled.
*@param pipelineSpecificationBasic is the basic pipeline specification needed.
*@param identifierClass is the name of the space in which the identifier hash should be interpreted.
*@param identifierHash is the hashed document identifier.
@@ -239,64 +171,6 @@ public interface IIncrementalIngester
* method also REMOVES ALL OLD METADATA. When complete, the index will contain only the metadata
* described by the RepositoryDocument object passed to this method.
* ServiceInterruption is thrown if the document ingestion must be rescheduled.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClass is the name of the space in which the identifier hash should be interpreted.
- *@param identifierHash is the hashed document identifier.
- *@param documentVersion is the document version.
- *@param outputVersion is the output version string constructed from the output specification by the output connector.
- *@param authorityName is the name of the authority associated with the document, if any.
- *@param data is the document data. The data is closed after ingestion is complete.
- *@param ingestTime is the time at which the ingestion took place, in milliseconds since epoch.
- *@param documentURI is the URI of the document, which will be used as the key of the document in the index.
- *@param activities is an object providing a set of methods that the implementer can use to perform the operation.
- *@return true if the ingest was ok, false if the ingest is illegal (and should not be repeated).
- */
- @Deprecated
- public boolean documentIngest(String outputConnectionName,
- String identifierClass, String identifierHash,
- String documentVersion,
- String outputVersion,
- String authorityName,
- RepositoryDocument data,
- long ingestTime, String documentURI,
- IOutputActivity activities)
- throws ManifoldCFException, ServiceInterruption;
-
- /** Ingest a document.
- * This ingests the document, and notes it. If this is a repeat ingestion of the document, this
- * method also REMOVES ALL OLD METADATA. When complete, the index will contain only the metadata
- * described by the RepositoryDocument object passed to this method.
- * ServiceInterruption is thrown if the document ingestion must be rescheduled.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClass is the name of the space in which the identifier hash should be interpreted.
- *@param identifierHash is the hashed document identifier.
- *@param documentVersion is the document version.
- *@param parameterVersion is the forced parameter version.
- *@param outputVersion is the output version string constructed from the output specification by the output connector.
- *@param authorityName is the name of the authority associated with the document, if any.
- *@param data is the document data. The data is closed after ingestion is complete.
- *@param ingestTime is the time at which the ingestion took place, in milliseconds since epoch.
- *@param documentURI is the URI of the document, which will be used as the key of the document in the index.
- *@param activities is an object providing a set of methods that the implementer can use to perform the operation.
- *@return true if the ingest was ok, false if the ingest is illegal (and should not be repeated).
- */
- @Deprecated
- public boolean documentIngest(String outputConnectionName,
- String identifierClass, String identifierHash,
- String documentVersion,
- String outputVersion,
- String parameterVersion,
- String authorityName,
- RepositoryDocument data,
- long ingestTime, String documentURI,
- IOutputActivity activities)
- throws ManifoldCFException, ServiceInterruption;
-
- /** Ingest a document.
- * This ingests the document, and notes it. If this is a repeat ingestion of the document, this
- * method also REMOVES ALL OLD METADATA. When complete, the index will contain only the metadata
- * described by the RepositoryDocument object passed to this method.
- * ServiceInterruption is thrown if the document ingestion must be rescheduled.
*@param pipelineSpecificationWithVersions is the pipeline specification with already-fetched output versioning information.
*@param identifierClass is the name of the space in which the identifier hash should be interpreted.
*@param identifierHash is the hashed document identifier.
@@ -323,32 +197,6 @@ public interface IIncrementalIngester
/** Note the fact that we checked a document (and found that it did not need to be ingested, because the
* versions agreed).
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted.
- *@param identifierHashes are the set of document identifier hashes.
- *@param checkTime is the time at which the check took place, in milliseconds since epoch.
- */
- @Deprecated
- public void documentCheckMultiple(String outputConnectionName,
- String[] identifierClasses, String[] identifierHashes,
- long checkTime)
- throws ManifoldCFException;
-
- /** Note the fact that we checked a document (and found that it did not need to be ingested, because the
- * versions agreed).
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClass is the name of the space in which the identifier hash should be interpreted.
- *@param identifierHash is the hashed document identifier.
- *@param checkTime is the time at which the check took place, in milliseconds since epoch.
- */
- @Deprecated
- public void documentCheck(String outputConnectionName,
- String identifierClass, String identifierHash,
- long checkTime)
- throws ManifoldCFException;
-
- /** Note the fact that we checked a document (and found that it did not need to be ingested, because the
- * versions agreed).
*@param pipelineSpecificationBasic is a pipeline specification.
*@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted.
*@param identifierHashes are the set of document identifier hashes.
@@ -374,42 +222,6 @@ public interface IIncrementalIngester
throws ManifoldCFException;
/** Delete multiple documents from the search engine index.
- *@param outputConnectionNames are the names of the output connections associated with this action.
- *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted.
- *@param identifierHashes is tha array of document identifier hashes if the documents.
- *@param activities is the object to use to log the details of the ingestion attempt. May be null.
- */
- @Deprecated
- public void documentDeleteMultiple(String[] outputConnectionNames,
- String[] identifierClasses, String[] identifierHashes,
- IOutputRemoveActivity activities)
- throws ManifoldCFException, ServiceInterruption;
-
- /** Delete multiple documents from the search engine index.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted.
- *@param identifierHashes is tha array of document identifier hashes if the documents.
- *@param activities is the object to use to log the details of the ingestion attempt. May be null.
- */
- @Deprecated
- public void documentDeleteMultiple(String outputConnectionName,
- String[] identifierClasses, String[] identifierHashes,
- IOutputRemoveActivity activities)
- throws ManifoldCFException, ServiceInterruption;
-
- /** Delete a document from the search engine index.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClass is the name of the space in which the identifier hash should be interpreted.
- *@param identifierHash is the hash of the id of the document.
- *@param activities is the object to use to log the details of the ingestion attempt. May be null.
- */
- @Deprecated
- public void documentDelete(String outputConnectionName,
- String identifierClass, String identifierHash,
- IOutputRemoveActivity activities)
- throws ManifoldCFException, ServiceInterruption;
-
- /** Delete multiple documents from the search engine index.
*@param pipelineSpecificationBasics are the pipeline specifications associated with the documents.
*@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted.
*@param identifierHashes is the array of document identifier hashes of the documents.
@@ -445,41 +257,6 @@ public interface IIncrementalIngester
IOutputRemoveActivity activities)
throws ManifoldCFException, ServiceInterruption;
- /** Look up ingestion data for a SET of documents.
- *@param outputConnectionNames are the names of the output connections associated with this action.
- *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted.
- *@param identifierHashes is the array of document identifier hashes to look up.
- *@return the array of document data. Null will come back for any identifier that doesn't
- * exist in the index.
- */
- @Deprecated
- public DocumentIngestStatus[] getDocumentIngestDataMultiple(String[] outputConnectionNames,
- String[] identifierClasses, String[] identifierHashes)
- throws ManifoldCFException;
-
- /** Look up ingestion data for a SET of documents.
- *@param outputConnectionName is the names of the output connection associated with this action.
- *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted.
- *@param identifierHashes is the array of document identifier hashes to look up.
- *@return the array of document data. Null will come back for any identifier that doesn't
- * exist in the index.
- */
- @Deprecated
- public DocumentIngestStatus[] getDocumentIngestDataMultiple(String outputConnectionName,
- String[] identifierClasses, String[] identifierHashes)
- throws ManifoldCFException;
-
- /** Look up ingestion data for a documents.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClass is the name of the space in which the identifier hash should be interpreted.
- *@param identifierHash is the hash of the id of the document.
- *@return the current document's ingestion data, or null if the document is not currently ingested.
- */
- @Deprecated
- public DocumentIngestStatus getDocumentIngestData(String outputConnectionName,
- String identifierClass, String identifierHash)
- throws ManifoldCFException;
-
/** Look up ingestion data for a set of documents.
*@param rval is a map of output key to document data, in no particular order, which will be loaded with all matching results.
*@param pipelineSpecificationBasics are the pipeline specifications corresponding to the identifier classes and hashes.
@@ -518,30 +295,6 @@ public interface IIncrementalIngester
/** Calculate the average time interval between changes for a document.
* This is based on the data gathered for the document.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted.
- *@param identifierHashes is the hashes of the ids of the documents.
- *@return the number of milliseconds between changes, or 0 if this cannot be calculated.
- */
- @Deprecated
- public long[] getDocumentUpdateIntervalMultiple(String outputConnectionName,
- String[] identifierClasses, String[] identifierHashes)
- throws ManifoldCFException;
-
- /** Calculate the average time interval between changes for a document.
- * This is based on the data gathered for the document.
- *@param outputConnectionName is the name of the output connection associated with this action.
- *@param identifierClass is the name of the space in which the identifier hash should be interpreted.
- *@param identifierHash is the hash of the id of the document.
- *@return the number of milliseconds between changes, or 0 if this cannot be calculated.
- */
- @Deprecated
- public long getDocumentUpdateInterval(String outputConnectionName,
- String identifierClass, String identifierHash)
- throws ManifoldCFException;
-
- /** Calculate the average time interval between changes for a document.
- * This is based on the data gathered for the document.
*@param pipelineSpecificationBasic is the basic pipeline specification.
*@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted.
*@param identifierHashes is the hashes of the ids of the documents.
Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineConnector.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineConnector.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineConnector.java Sat Jul 5 18:53:01 2014
@@ -43,7 +43,7 @@ public interface IPipelineConnector exte
/** Document permanently rejected */
public final static int DOCUMENTSTATUS_REJECTED = 1;
- /** Get a pipeline version string, given a pipeline specification object. The version string is used to
+ /** Get a pipeline version object, given a pipeline specification object. The version string is used to
* uniquely describe the pertinent details of the specification and the configuration, to allow the Connector
* Framework to determine whether a document will need to be processed again.
* Note that the contents of any document cannot be considered by this method; only configuration and specification information
@@ -51,10 +51,10 @@ public interface IPipelineConnector exte
*
* This method presumes that the underlying connector object has been configured.
*@param spec is the current pipeline specification object for this connection for the job that is doing the crawling.
- *@return a string, of unlimited length, which uniquely describes configuration and specification in such a way that
+ *@return a version object, including a string of unlimited length, which uniquely describes configuration and specification in such a way that
* if two such strings are equal, nothing that affects how or whether the document is indexed will be different.
*/
- public String getPipelineDescription(Specification spec)
+ public VersionContext getPipelineDescription(Specification spec)
throws ManifoldCFException, ServiceInterruption;
/** Detect if a mime type is acceptable or not. This method is used to determine whether it makes sense to fetch a document
@@ -64,7 +64,7 @@ public interface IPipelineConnector exte
*@param checkActivity is an object including the activities that can be performed by this method.
*@return true if the mime type can be accepted by this connector.
*/
- public boolean checkMimeTypeIndexable(String pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
+ public boolean checkMimeTypeIndexable(VersionContext pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption;
/** Pre-determine whether a document (passed here as a File object) is acceptable or not. This method is
@@ -75,7 +75,7 @@ public interface IPipelineConnector exte
*@param checkActivity is an object including the activities that can be done by this method.
*@return true if the file is acceptable, false if not.
*/
- public boolean checkDocumentIndexable(String pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
+ public boolean checkDocumentIndexable(VersionContext pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption;
/** Pre-determine whether a document's length is acceptable. This method is used
@@ -85,7 +85,7 @@ public interface IPipelineConnector exte
*@param checkActivity is an object including the activities that can be done by this method.
*@return true if the file is acceptable, false if not.
*/
- public boolean checkLengthIndexable(String pipelineDescription, long length, IOutputCheckActivity checkActivity)
+ public boolean checkLengthIndexable(VersionContext pipelineDescription, long length, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption;
/** Pre-determine whether a document's URL is acceptable. This method is used
@@ -95,7 +95,7 @@ public interface IPipelineConnector exte
*@param checkActivity is an object including the activities that can be done by this method.
*@return true if the file is acceptable, false if not.
*/
- public boolean checkURLIndexable(String pipelineDescription, String url, IOutputCheckActivity checkActivity)
+ public boolean checkURLIndexable(VersionContext pipelineDescription, String url, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption;
/** Add (or replace) a document in the output data store using the connector.
@@ -114,7 +114,7 @@ public interface IPipelineConnector exte
*@return the document status (accepted or permanently rejected).
*@throws IOException only if there's a stream error reading the document data.
*/
- public int addOrReplaceDocumentWithException(String documentURI, String pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
+ public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption, IOException;
// UI support methods.
Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineSpecification.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineSpecification.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineSpecification.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineSpecification.java Sat Jul 5 18:53:01 2014
@@ -36,6 +36,6 @@ public interface IPipelineSpecification
*@param stage is the stage to get the connection name for.
*@return the description string for that stage.
*/
- public String getStageDescriptionString(int stage);
+ public VersionContext getStageDescriptionString(int stage);
}
Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/output/BaseOutputConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/output/BaseOutputConnector.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/output/BaseOutputConnector.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/output/BaseOutputConnector.java Sat Jul 5 18:53:01 2014
@@ -89,10 +89,10 @@ public abstract class BaseOutputConnecto
*@return true if the mime type can be accepted by this connector.
*/
@Override
- public boolean checkMimeTypeIndexable(String pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
+ public boolean checkMimeTypeIndexable(VersionContext pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption
{
- return checkMimeTypeIndexable(pipelineDescription, mimeType);
+ return checkMimeTypeIndexable(pipelineDescription.getVersionString(), mimeType);
}
/** Detect if a mime type is indexable or not. This method is used by participating repository connectors to pre-filter the number of
@@ -127,10 +127,10 @@ public abstract class BaseOutputConnecto
*@return true if the file is acceptable, false if not.
*/
@Override
- public boolean checkDocumentIndexable(String pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
+ public boolean checkDocumentIndexable(VersionContext pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption
{
- return checkDocumentIndexable(pipelineDescription, localFile);
+ return checkDocumentIndexable(pipelineDescription.getVersionString(), localFile);
}
/** Pre-determine whether a document (passed here as a File object) is indexable by this connector. This method is used by participating
@@ -166,10 +166,10 @@ public abstract class BaseOutputConnecto
*@return true if the file is acceptable, false if not.
*/
@Override
- public boolean checkLengthIndexable(String pipelineDescription, long length, IOutputCheckActivity checkActivity)
+ public boolean checkLengthIndexable(VersionContext pipelineDescription, long length, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption
{
- return checkLengthIndexable(pipelineDescription, length);
+ return checkLengthIndexable(pipelineDescription.getVersionString(), length);
}
/** Pre-determine whether a document's length is indexable by this connector. This method is used by participating repository connectors
@@ -192,10 +192,10 @@ public abstract class BaseOutputConnecto
*@return true if the file is acceptable, false if not.
*/
@Override
- public boolean checkURLIndexable(String pipelineDescription, String url, IOutputCheckActivity checkActivity)
+ public boolean checkURLIndexable(VersionContext pipelineDescription, String url, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption
{
- return checkURLIndexable(pipelineDescription, url);
+ return checkURLIndexable(pipelineDescription.getVersionString(), url);
}
/** Pre-determine whether a document's URL is indexable by this connector. This method is used by participating repository connectors
@@ -222,10 +222,10 @@ public abstract class BaseOutputConnecto
* if two such strings are equal, nothing that affects how or whether the document is indexed will be different.
*/
@Override
- public String getPipelineDescription(Specification spec)
+ public VersionContext getPipelineDescription(Specification spec)
throws ManifoldCFException, ServiceInterruption
{
- return getOutputDescription((OutputSpecification)spec);
+ return new VersionContext(getOutputDescription((OutputSpecification)spec),params,spec);
}
/** Get an output version string, given an output specification. The output version string is used to uniquely describe the pertinent details of
@@ -263,10 +263,10 @@ public abstract class BaseOutputConnecto
*@throws IOException only if there's a stream error reading the document data.
*/
@Override
- public int addOrReplaceDocumentWithException(String documentURI, String pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
+ public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption, IOException
{
- return addOrReplaceDocument(documentURI, pipelineDescription, document, authorityNameString, activities);
+ return addOrReplaceDocument(documentURI, pipelineDescription.getVersionString(), document, authorityNameString, activities);
}
/** Add (or replace) a document in the output data store using the connector.
Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/transformation/BaseTransformationConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/transformation/BaseTransformationConnector.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/transformation/BaseTransformationConnector.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/transformation/BaseTransformationConnector.java Sat Jul 5 18:53:01 2014
@@ -78,7 +78,7 @@ public abstract class BaseTransformation
*@return true if the mime type can be accepted by this connector.
*/
@Override
- public boolean checkMimeTypeIndexable(String pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
+ public boolean checkMimeTypeIndexable(VersionContext pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption
{
return checkActivity.checkMimeTypeIndexable(mimeType);
@@ -93,7 +93,7 @@ public abstract class BaseTransformation
*@return true if the file is acceptable, false if not.
*/
@Override
- public boolean checkDocumentIndexable(String pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
+ public boolean checkDocumentIndexable(VersionContext pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption
{
return checkActivity.checkDocumentIndexable(localFile);
@@ -107,7 +107,7 @@ public abstract class BaseTransformation
*@return true if the file is acceptable, false if not.
*/
@Override
- public boolean checkLengthIndexable(String pipelineDescription, long length, IOutputCheckActivity checkActivity)
+ public boolean checkLengthIndexable(VersionContext pipelineDescription, long length, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption
{
return checkActivity.checkLengthIndexable(length);
@@ -121,7 +121,7 @@ public abstract class BaseTransformation
*@return true if the file is acceptable, false if not.
*/
@Override
- public boolean checkURLIndexable(String pipelineDescription, String url, IOutputCheckActivity checkActivity)
+ public boolean checkURLIndexable(VersionContext pipelineDescription, String url, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption
{
return checkActivity.checkURLIndexable(url);
@@ -139,10 +139,10 @@ public abstract class BaseTransformation
* if two such strings are equal, nothing that affects how or whether the document is indexed will be different.
*/
@Override
- public String getPipelineDescription(Specification spec)
+ public VersionContext getPipelineDescription(Specification spec)
throws ManifoldCFException, ServiceInterruption
{
- return "";
+ return new VersionContext("",params,spec);
}
/** Add (or replace) a document in the output data store using the connector.
@@ -162,7 +162,7 @@ public abstract class BaseTransformation
*@throws IOException only if there's a stream error reading the document data.
*/
@Override
- public int addOrReplaceDocumentWithException(String documentURI, String pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
+ public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption, IOException
{
return DOCUMENTSTATUS_REJECTED;
Added: manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/VersionContext.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/VersionContext.java?rev=1608109&view=auto
==============================================================================
--- manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/VersionContext.java (added)
+++ manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/VersionContext.java Sat Jul 5 18:53:01 2014
@@ -0,0 +1,67 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.interfaces;
+
+/** An instance of this class represents a version string, in combination with the configuration parameters
+* and specification that produced it. Some clients will use the version string (e.g. the database), while others
+* may find it more convenient to use the parameters or the specification. However:
+* (1) It is ALWAYS wrong to use data from configuration or specification that is NOT represented in some
+* way in the version string, either by exact representation, or by some proxy value;
+* (2) Configuration and Specification are guaranteed to be the identical ones which were used during creation
+* of the version string;
+* (3) Configuration and Specification are provided as CONVENIENCES; they are not to be considered primary
+* data for these objects.
+*/
+public class VersionContext
+{
+ public static final String _rcsid = "@(#)$Id$";
+
+ // Member variables
+ protected final String versionString;
+ protected final ConfigParams params;
+ protected final Specification specification;
+
+ /** Constructor.
+ */
+ public VersionContext(String versionString, ConfigParams params, Specification specification)
+ {
+ this.versionString = versionString;
+ this.params = params;
+ this.specification = specification;
+ }
+
+ /** Retrieve the version String */
+ public String getVersionString()
+ {
+ return versionString;
+ }
+
+ /** Retrieve the configuration parameters */
+ public ConfigParams getParams()
+ {
+ return params;
+ }
+
+ /** Retrieve the specification */
+ public Specification getSpecification()
+ {
+ return specification;
+ }
+
+}
Propchange: manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/VersionContext.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/interfaces/VersionContext.java
------------------------------------------------------------------------------
svn:keywords = Id
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java Sat Jul 5 18:53:01 2014
@@ -150,7 +150,7 @@ public abstract class BaseRepositoryConn
*@return an updated seeding version string, to be stored with the job.
*/
@Override
- public String addSeedDocumentsWithVersion(ISeedingActivity activities, Specification spec,
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
@@ -325,6 +325,43 @@ public abstract class BaseRepositoryConn
}
/** Get document versions given an array of document identifiers.
+ * This method is called for EVERY document that is considered. It is therefore important to perform
+ * as little work as possible here.
+ * The connector will be connected before this method can be called.
+ *@param documentVersions is the versions object, to be filled in by this method.
+ *@param documentIdentifiers is the array of local document identifiers, as understood by this connector.
+ *@param oldVersions is the corresponding array of version strings that have been saved for the document identifiers.
+ * A null value indicates that this is a first-time fetch, while an empty string indicates that the previous document
+ * had an empty version string.
+ *@param activities is the interface this method should use to perform whatever framework actions are desired.
+ *@param spec is the current document specification for the current job. If there is a dependency on this
+ * specification, then the version string should include the pertinent data, so that reingestion will occur
+ * when the specification changes. This is primarily useful for metadata.
+ *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
+ */
+ @Override
+ public void getDocumentVersions(
+ DocumentVersions documentVersions,
+ String[] documentIdentifiers, String[] oldVersions,
+ IVersionActivity activities,
+ Specification spec, int jobMode, boolean usesDefaultAuthority)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ String[] rval = getDocumentVersions(documentIdentifiers,oldVersions,activities,
+ spec,jobMode,usesDefaultAuthority);
+ for (int i = 0; i < rval.length; i++)
+ {
+ if (rval[i] != null)
+ {
+ documentVersions.setDocumentVersion(documentIdentifiers[i],new VersionContext(rval[i],params,spec));
+ if (rval[i].length() == 0)
+ documentVersions.alwaysRefetch(documentIdentifiers[i]);
+ }
+ }
+ }
+
+ /** Get document versions given an array of document identifiers.
* This method is called for EVERY document that is considered. It is
* therefore important to perform as little work as possible here.
*@param documentIdentifiers is the array of local document identifiers, as understood by this connector.
@@ -341,7 +378,6 @@ public abstract class BaseRepositoryConn
* Empty version strings indicate that there is no versioning ability for the corresponding document, and the document
* will always be processed.
*/
- @Override
public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities,
Specification spec, int jobMode, boolean usesDefaultAuthority)
throws ManifoldCFException, ServiceInterruption
@@ -448,10 +484,36 @@ public abstract class BaseRepositoryConn
* the getDocumentVersions() method, including those that returned null versions. It may be used to free resources
* committed during the getDocumentVersions() method. It is guaranteed to be called AFTER any calls to
* processDocuments() for the documents in question.
+ * The connector will be connected before this method can be called.
*@param documentIdentifiers is the set of document identifiers.
- *@param versions is the corresponding set of version identifiers (individual identifiers may be null).
+ *@param versions is the corresponding set of version strings (individual identifiers may have no version).
*/
@Override
+ public void releaseDocumentVersions(String[] documentIdentifiers, DocumentVersions versions)
+ throws ManifoldCFException
+ {
+ String[] versionStrings = new String[documentIdentifiers.length];
+ for (int i = 0; i < versionStrings.length; i++)
+ {
+ VersionContext vc = versions.getDocumentVersion(documentIdentifiers[i]);
+ boolean alwaysFetch = versions.isAlwaysRefetch(documentIdentifiers[i]);
+ if (alwaysFetch)
+ versionStrings[i] = "";
+ else if (vc == null)
+ versionStrings[i] = null;
+ else
+ versionStrings[i] = vc.getVersionString();
+ }
+ releaseDocumentVersions(documentIdentifiers,versionStrings);
+ }
+
+ /** Free a set of documents. This method is called for all documents whose versions have been fetched using
+ * the getDocumentVersions() method, including those that returned null versions. It may be used to free resources
+ * committed during the getDocumentVersions() method. It is guaranteed to be called AFTER any calls to
+ * processDocuments() for the documents in question.
+ *@param documentIdentifiers is the set of document identifiers.
+ *@param versions is the corresponding set of version identifiers (individual identifiers may be null).
+ */
public void releaseDocumentVersions(String[] documentIdentifiers, String[] versions)
throws ManifoldCFException
{
@@ -472,6 +534,42 @@ public abstract class BaseRepositoryConn
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
* The document specification allows this class to filter what is done based on the job.
+ * The connector will be connected before this method can be called.
+ *@param documentIdentifiers is the set of document identifiers to process.
+ *@param versions is the DocumentVersions object that was filled in by getDocumentVersions() above.
+ *@param activities is the interface this method should use to queue up new document references
+ * and ingest documents.
+ *@param scanOnly is an array corresponding to the document identifiers. It is set to true to indicate when the processing
+ * should only find other references, and should not actually call the ingestion methods.
+ *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ */
+ @Override
+ public void processDocuments(String[] documentIdentifiers, DocumentVersions versions, IProcessActivity activities,
+ boolean[] scanOnly, int jobMode)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ Specification spec = null;
+ String[] versionStrings = new String[documentIdentifiers.length];
+ for (int i = 0; i < versionStrings.length; i++)
+ {
+ VersionContext vc = versions.getDocumentVersion(documentIdentifiers[i]);
+ if (vc != null)
+ spec = vc.getSpecification();
+ boolean alwaysFetch = versions.isAlwaysRefetch(documentIdentifiers[i]);
+ if (alwaysFetch)
+ versionStrings[i] = "";
+ else if (vc == null)
+ versionStrings[i] = null;
+ else
+ versionStrings[i] = vc.getVersionString();
+ }
+ processDocuments(documentIdentifiers,versionStrings,activities,spec,scanOnly,jobMode);
+ }
+
+ /** Process a set of documents.
+ * This is the method that should cause each document to be fetched, processed, and the results either added
+ * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
+ * The document specification allows this class to filter what is done based on the job.
*@param documentIdentifiers is the set of document identifiers to process.
*@param versions is the corresponding document versions to process, as returned by getDocumentVersions() above.
* The implementation may choose to ignore this parameter and always process the current version.
@@ -482,7 +580,6 @@ public abstract class BaseRepositoryConn
* should only find other references, and should not actually call the ingestion methods.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*/
- @Override
public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities,
Specification spec, boolean[] scanOnly, int jobMode)
throws ManifoldCFException, ServiceInterruption
Added: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/DocumentVersions.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/DocumentVersions.java?rev=1608109&view=auto
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/DocumentVersions.java (added)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/DocumentVersions.java Sat Jul 5 18:53:01 2014
@@ -0,0 +1,74 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.interfaces;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class represents a set of document versions, organized by document identifier.
+* It's part of the IRepositoryConnector API.
+*/
+public class DocumentVersions
+{
+ public static final String _rcsid = "@(#)$Id$";
+
+ protected final Map<String,VersionContext> documentVersions = new HashMap<String,VersionContext>();
+ protected final Set<String> alwaysRefetch = new HashSet<String>();
+
+ /** Constructor */
+ public DocumentVersions()
+ {
+ }
+
+ /** Set a non-special document version.
+ *@param documentIdentifier is the document identifier.
+ *@param documentVersion is the document version.
+ */
+ public void setDocumentVersion(String documentIdentifier, VersionContext documentVersion)
+ {
+ documentVersions.put(documentIdentifier,documentVersion);
+ }
+
+ /** Signal to always refetch document.
+ *@param documentIdentifier is the document identifier.
+ */
+ public void alwaysRefetch(String documentIdentifier)
+ {
+ alwaysRefetch.add(documentIdentifier);
+ }
+
+ /** Get the document version, if any.
+ *@param documentIdentifier is the document identifier.
+ *@return the document version, if any. Null indicates that no such document was found.
+ */
+ public VersionContext getDocumentVersion(String documentIdentifier)
+ {
+ return documentVersions.get(documentIdentifier);
+ }
+
+ /** Check whether we should always refetch a specified document.
+ *@param documentIdentifier is the document identifier.
+ *@return true if we are directed to always refetch. False will be returned by default.
+ */
+ public boolean isAlwaysRefetch(String documentIdentifier)
+ {
+ return alwaysRefetch.contains(documentIdentifier);
+ }
+
+}
Propchange: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/DocumentVersions.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/DocumentVersions.java
------------------------------------------------------------------------------
svn:keywords = Id
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java Sat Jul 5 18:53:01 2014
@@ -178,7 +178,7 @@ public interface IRepositoryConnector ex
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@return an updated seeding version string, to be stored with the job.
*/
- public String addSeedDocumentsWithVersion(ISeedingActivity activities, Specification spec,
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption;
@@ -186,6 +186,7 @@ public interface IRepositoryConnector ex
* This method is called for EVERY document that is considered. It is therefore important to perform
* as little work as possible here.
* The connector will be connected before this method can be called.
+ *@param documentVersions is the versions object, to be filled in by this method.
*@param documentIdentifiers is the array of local document identifiers, as understood by this connector.
*@param oldVersions is the corresponding array of version strings that have been saved for the document identifiers.
* A null value indicates that this is a first-time fetch, while an empty string indicates that the previous document
@@ -196,11 +197,11 @@ public interface IRepositoryConnector ex
* when the specification changes. This is primarily useful for metadata.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
- *@return the corresponding version strings, with null in the places where the document no longer exists.
- * Empty version strings indicate that there is no versioning ability for the corresponding document, and the document
- * will always be processed.
*/
- public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities,
+ public void getDocumentVersions(
+ DocumentVersions documentVersions,
+ String[] documentIdentifiers, String[] oldVersions,
+ IVersionActivity activities,
Specification spec, int jobMode, boolean usesDefaultAuthority)
throws ManifoldCFException, ServiceInterruption;
@@ -210,17 +211,15 @@ public interface IRepositoryConnector ex
* The document specification allows this class to filter what is done based on the job.
* The connector will be connected before this method can be called.
*@param documentIdentifiers is the set of document identifiers to process.
- *@param versions is the corresponding document versions to process, as returned by getDocumentVersions() above.
- * The implementation may choose to ignore this parameter and always process the current version.
+ *@param versions is the DocumentVersions object that was filled in by getDocumentVersions() above.
*@param activities is the interface this method should use to queue up new document references
* and ingest documents.
- *@param spec is the document specification.
*@param scanOnly is an array corresponding to the document identifiers. It is set to true to indicate when the processing
* should only find other references, and should not actually call the ingestion methods.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*/
- public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities,
- Specification spec, boolean[] scanOnly, int jobMode)
+ public void processDocuments(String[] documentIdentifiers, DocumentVersions versions, IProcessActivity activities,
+ boolean[] scanOnly, int jobMode)
throws ManifoldCFException, ServiceInterruption;
/** Free a set of documents. This method is called for all documents whose versions have been fetched using
@@ -229,9 +228,9 @@ public interface IRepositoryConnector ex
* processDocuments() for the documents in question.
* The connector will be connected before this method can be called.
*@param documentIdentifiers is the set of document identifiers.
- *@param versions is the corresponding set of version identifiers (individual identifiers may be null).
+ *@param versions is the corresponding set of version strings (individual identifiers may have no version).
*/
- public void releaseDocumentVersions(String[] documentIdentifiers, String[] versions)
+ public void releaseDocumentVersions(String[] documentIdentifiers, DocumentVersions versions)
throws ManifoldCFException;
/** Get the maximum number of documents to amalgamate together into one batch, for this connector.
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/PipelineSpecification.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/PipelineSpecification.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/PipelineSpecification.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/PipelineSpecification.java Sat Jul 5 18:53:01 2014
@@ -27,17 +27,17 @@ import org.apache.manifoldcf.crawler.int
public class PipelineSpecification implements IPipelineSpecification
{
protected final IPipelineSpecificationBasic basicSpecification;
- protected final String[] pipelineDescriptionStrings;
+ protected final VersionContext[] pipelineDescriptionStrings;
public PipelineSpecification(IPipelineSpecificationBasic basicSpecification, IJobDescription job, IIncrementalIngester ingester)
throws ManifoldCFException, ServiceInterruption
{
this.basicSpecification = basicSpecification;
- this.pipelineDescriptionStrings = new String[basicSpecification.getStageCount()];
+ this.pipelineDescriptionStrings = new VersionContext[basicSpecification.getStageCount()];
for (int i = 0; i < pipelineDescriptionStrings.length; i++)
{
// Note: this needs to change when output connections become part of the pipeline
- String descriptionString;
+ VersionContext descriptionString;
if (basicSpecification.checkStageOutputConnection(i))
{
descriptionString = ingester.getOutputDescription(basicSpecification.getStageConnectionName(i),job.getPipelineStageSpecification(i));
@@ -64,7 +64,7 @@ public class PipelineSpecification imple
*@return the description string for that stage.
*/
@Override
- public String getStageDescriptionString(int stage)
+ public VersionContext getStageDescriptionString(int stage)
{
return pipelineDescriptionStrings[stage];
}
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/SeedingThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/SeedingThread.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/SeedingThread.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/SeedingThread.java Sat Jul 5 18:53:01 2014
@@ -147,7 +147,7 @@ public class SeedingThread extends Threa
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Seeding thread: Getting seeds for job "+jobID.toString());
- newSeedingVersion = connector.addSeedDocumentsWithVersion(activity,jobDescription.getSpecification(),lastSeedingVersion,currentTime,jobType);
+ newSeedingVersion = connector.addSeedDocuments(activity,jobDescription.getSpecification(),lastSeedingVersion,currentTime,jobType);
activity.doneSeeding(model==connector.MODEL_PARTIAL);
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/StartupThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/StartupThread.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/StartupThread.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/StartupThread.java Sat Jul 5 18:53:01 2014
@@ -150,7 +150,7 @@ public class StartupThread extends Threa
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Adding initial seed documents for job "+jobID.toString()+"...");
// Get the initial seed documents, and make sure those are added
- newSeedingVersion = connector.addSeedDocumentsWithVersion(activity,jobDescription.getSpecification(),lastSeedingVersion,currentTime,jobType);
+ newSeedingVersion = connector.addSeedDocuments(activity,jobDescription.getSpecification(),lastSeedingVersion,currentTime,jobType);
// Flush anything left
activity.doneSeeding(model==connector.MODEL_PARTIAL);
if (Logging.threads.isDebugEnabled())
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1608109&r1=1608108&r2=1608109&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java Sat Jul 5 18:53:01 2014
@@ -81,7 +81,6 @@ public class WorkerThread extends Thread
IRepositoryConnectorPool repositoryConnectorPool = RepositoryConnectorPoolFactory.make(threadContext);
List<DocumentToProcess> fetchList = new ArrayList<DocumentToProcess>();
- Map<String,String> versionMap = new HashMap<String,String>();
List<QueuedDocument> finishList = new ArrayList<QueuedDocument>();
Map<String,Integer> idHashIndexMap = new HashMap<String,Integer>();
@@ -175,7 +174,6 @@ public class WorkerThread extends Thread
// Clear out all of our disposition lists
fetchList.clear();
finishList.clear();
- versionMap.clear();
deleteList.clear();
ingesterCheckList.clear();
hopcountremoveList.clear();
@@ -334,12 +332,14 @@ public class WorkerThread extends Thread
// === Fetch documents ===
// We start by getting the document version string.
- String[] newVersionStringArray = null;
+ DocumentVersions documentVersions = new DocumentVersions();
+ boolean successfulVersions = false;
try
{
- newVersionStringArray = connector.getDocumentVersions(currentDocIDArray,oldVersionStringArray,
+ connector.getDocumentVersions(documentVersions,currentDocIDArray,oldVersionStringArray,
versionActivity,spec,jobType,isDefaultAuthority);
-
+ successfulVersions = true;
+
if (Logging.threads.isDebugEnabled())
Logging.threads.debug("Worker thread done getting versions for "+Integer.toString(currentDocIDArray.length)+" documents");
@@ -398,7 +398,7 @@ public class WorkerThread extends Thread
}
// If version fetch was successful, then go on to processing phase
- if (newVersionStringArray != null)
+ if (successfulVersions)
{
// This try{ } is for releasing document versions at the connector level.
try
@@ -429,10 +429,9 @@ public class WorkerThread extends Thread
// We call the incremental ingester to make the decision for us as to whether we refetch a document or not.
String documentIDHash = dd.getDocumentIdentifierHash();
- String newDocVersion = newVersionStringArray[i];
- versionMap.put(documentIDHash,newDocVersion);
+ VersionContext newDocContext = documentVersions.getDocumentVersion(dd.getDocumentIdentifier());
- if (newDocVersion == null)
+ if (newDocContext == null)
{
deleteList.add(qd);
}
@@ -444,7 +443,7 @@ public class WorkerThread extends Thread
// See if we need to add, or update.
IPipelineSpecificationWithVersions specWithVersions = new PipelineSpecificationWithVersions(pipelineSpecification,qd);
boolean allowIngest = ingester.checkFetchDocument(specWithVersions,
- newDocVersion,
+ newDocContext.getVersionString(),
newParameterVersion,
aclAuthority);
@@ -507,7 +506,6 @@ public class WorkerThread extends Thread
// Build a list of id's and flags
String[] processIDs = new String[fetchList.size()];
String[] processIDHashes = new String[fetchList.size()];
- String[] versions = new String[fetchList.size()];
boolean[] scanOnly = new boolean[fetchList.size()];
for (int i = 0; i < fetchList.size(); i++)
@@ -516,7 +514,6 @@ public class WorkerThread extends Thread
DocumentDescription dd = dToP.getDocument().getDocumentDescription();
processIDs[i] = dd.getDocumentIdentifier();
processIDHashes[i] = dd.getDocumentIdentifierHash();
- versions[i] = versionMap.get(dd.getDocumentIdentifierHash());
scanOnly[i] = dToP.getScanOnly();
}
@@ -530,7 +527,7 @@ public class WorkerThread extends Thread
try
{
- connector.processDocuments(processIDs,versions,activity,job.getSpecification(),scanOnly,jobType);
+ connector.processDocuments(processIDs,documentVersions,activity,scanOnly,jobType);
// Flush remaining references into the database!
activity.flush();
@@ -780,7 +777,7 @@ public class WorkerThread extends Thread
finally
{
// Release any document temporary storage held by the connector
- connector.releaseDocumentVersions(currentDocIDArray,newVersionStringArray);
+ connector.releaseDocumentVersions(currentDocIDArray,documentVersions);
}
}