You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/08/31 20:17:43 UTC
svn commit: r1621613 [2/2] - in /manifoldcf/trunk:
connectors/alfresco/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfresco/
connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/
connectors/document...
Modified: manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java Sun Aug 31 18:17:42 2014
@@ -51,12 +51,14 @@ import org.apache.manifoldcf.core.interf
import org.apache.manifoldcf.core.interfaces.IPasswordMapperActivity;
import org.apache.manifoldcf.core.interfaces.IPostParameters;
import org.apache.manifoldcf.core.interfaces.ConfigParams;
+import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.manifoldcf.core.interfaces.IKeystoreManager;
import org.apache.manifoldcf.core.interfaces.KeystoreManagerFactory;
import org.apache.manifoldcf.core.interfaces.Configuration;
import org.apache.manifoldcf.core.interfaces.ConfigurationNode;
import org.apache.manifoldcf.core.interfaces.LockManagerFactory;
+import org.apache.manifoldcf.crawler.interfaces.ISeedingActivity;
import org.apache.manifoldcf.crawler.interfaces.DocumentSpecification;
import org.apache.manifoldcf.crawler.interfaces.IDocumentIdentifierStream;
import org.apache.manifoldcf.crawler.interfaces.IProcessActivity;
@@ -429,24 +431,69 @@ public class SharedDriveConnector extend
}
- /** Given a document specification, get either a list of starting document identifiers (seeds),
- * or a list of changes (deltas), depending on whether this is a "crawled" connector or not.
- * These document identifiers will be loaded into the job's queue at the beginning of the
- * job's execution.
- * This method can return changes only (because it is provided a time range). For full
- * recrawls, the start time is always zero.
- * Note that it is always ok to return MORE documents rather than less with this method.
+ /** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents
+ * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
+ *
+ * This method can choose to find repository changes that happen only during the specified time interval.
+ * The seeds recorded by this method will be viewed by the framework based on what the
+ * getConnectorModel() method returns.
+ *
+ * It is not a big problem if the connector chooses to create more seeds than are
+ * strictly necessary; it is merely a question of overall work required.
+ *
+ * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+ * For continuous crawling jobs, this method will
+ * be called once, when the job starts, and at various periodic intervals as the job executes.
+ *
+ * When a job's specification is changed, the framework automatically resets the seeding version string to null. The
+ * seeding version string may also be set to null on each job run, depending on the connector model returned by
+ * getConnectorModel().
+ *
+ * Note that it is always ok to send MORE documents rather than less to this method.
+ * The connector will be connected before this method can be called.
+ *@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
- *@param startTime is the beginning of the time range to consider, inclusive.
- *@param endTime is the end of the time range to consider, exclusive.
- *@return the stream of local document identifiers that should be added to the queue.
+ *@param seedTime is the end of the time range of documents to consider, exclusive.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+ *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@return an updated seeding version string, to be stored with the job.
*/
@Override
- public IDocumentIdentifierStream getDocumentIdentifiers(DocumentSpecification spec, long startTime, long endTime)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
getSession();
- return new IdentifierStream(spec);
+ try
+ {
+ for (int i = 0; i < spec.getChildCount(); i++)
+ {
+ SpecificationNode n = spec.getChild(i);
+ if (n.getType().equals(NODE_STARTPOINT))
+ {
+ // The id returned MUST be in canonical form!!!
+ String seed = mapToIdentifier(n.getAttributeValue(ATTRIBUTE_PATH));
+ if (Logging.connectors.isDebugEnabled())
+ {
+ Logging.connectors.debug("Seed = '"+seed+"'");
+ }
+ activities.addSeedDocument(seed);
+ }
+ }
+ }
+ catch (java.net.SocketTimeoutException e)
+ {
+ throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
+ }
+ catch (InterruptedIOException e)
+ {
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (IOException e)
+ {
+ throw new ManifoldCFException("Could not get a canonical path: "+e.getMessage(),e);
+ }
+ return "";
}
@@ -812,7 +859,7 @@ public class SharedDriveConnector extend
// method has no way of signalling this, since it does not do the fingerprinting.
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("JCIFS: Decided to remove '"+documentIdentifier+"'");
- activities.deleteDocument(documentIdentifier, version);
+ activities.noDocument(documentIdentifier, version);
// We should record the access here as well, since this is a non-exception way through the code path.
// (I noticed that this was not being recorded in the history while fixing 25477.)
activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
@@ -878,7 +925,7 @@ public class SharedDriveConnector extend
activities.recordActivity(null,ACTIVITY_ACCESS,
null,documentIdentifier,"Skip","Authorization: "+e.getMessage(),null);
// We call the delete even if it's a directory; this is harmless.
- activities.deleteDocument(documentIdentifier, version);
+ activities.noDocument(documentIdentifier, version);
}
catch (SmbException se)
{
@@ -937,7 +984,7 @@ public class SharedDriveConnector extend
Logging.connectors.debug("JCIFS: Skipping document/directory "+documentIdentifier+" because it cannot be found");
activities.recordActivity(null,ACTIVITY_ACCESS,
null,documentIdentifier,"Not found",null,null);
- activities.deleteDocument(documentIdentifier, version);
+ activities.noDocument(documentIdentifier, version);
}
else if (se.getMessage().indexOf("is denied") != -1)
{
@@ -945,7 +992,7 @@ public class SharedDriveConnector extend
// We call the delete even if it's a directory; this is harmless and it cleans up the jobqueue row.
activities.recordActivity(null,ACTIVITY_ACCESS,
null,documentIdentifier,"Skip","Authorization: "+se.getMessage(),null);
- activities.deleteDocument(documentIdentifier, version);
+ activities.noDocument(documentIdentifier, version);
}
else
{
@@ -2642,85 +2689,6 @@ public class SharedDriveConnector extend
return e1m.equals(e2m);
}
- /** Document identifier stream.
- */
- protected class IdentifierStream implements IDocumentIdentifierStream
- {
- protected String[] ids = null;
- protected int currentIndex = 0;
-
- public IdentifierStream(DocumentSpecification spec)
- throws ManifoldCFException
- {
- try
- {
- // Walk the specification for the "startpoint" types. Amalgamate these into a list of strings.
- // Presume that all roots are startpoint nodes
- int i = 0;
- int j = 0;
- while (i < spec.getChildCount())
- {
- SpecificationNode n = spec.getChild(i);
- if (n.getType().equals(NODE_STARTPOINT))
- j++;
- i++;
- }
- ids = new String[j];
- i = 0;
- j = 0;
- while (i < ids.length)
- {
- SpecificationNode n = spec.getChild(i);
- if (n.getType().equals(NODE_STARTPOINT))
- {
- // The id returned MUST be in canonical form!!!
- ids[j] = mapToIdentifier(n.getAttributeValue(ATTRIBUTE_PATH));
-
- if (Logging.connectors.isDebugEnabled())
- {
- Logging.connectors.debug("Seed = '"+ids[j]+"'");
- }
- j++;
- }
- i++;
- }
- }
- catch (java.net.SocketTimeoutException e)
- {
- throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
- }
- catch (InterruptedIOException e)
- {
- throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
- }
- catch (IOException e)
-
- {
- throw new ManifoldCFException("Could not get a canonical path: "+e.getMessage(),e);
- }
- }
-
- /** Get the next identifier.
- *@return the next document identifier, or null if there are no more.
- */
- public String getNextIdentifier()
- throws ManifoldCFException, ServiceInterruption
- {
- if (currentIndex == ids.length)
- return null;
- return ids[currentIndex++];
- }
-
- /** Close the stream.
- */
- public void close()
- throws ManifoldCFException
- {
- ids = null;
- }
-
- }
-
// UI support methods.
//
// These support methods come in two varieties. The first bunch is involved in setting up connection configuration information. The second bunch
Modified: manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java (original)
+++ manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java Sun Aug 31 18:17:42 2014
@@ -210,27 +210,37 @@ public class JDBCConnector extends org.a
* It is not a big problem if the connector chooses to create more seeds than are
* strictly necessary; it is merely a question of overall work required.
*
- * The times passed to this method may be interpreted for greatest efficiency. The time ranges
- * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
- * to the "current time", each time the job is run. For continuous crawling jobs, this method will
+ * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+ * For continuous crawling jobs, this method will
* be called once, when the job starts, and at various periodic intervals as the job executes.
*
- * When a job's specification is changed, the framework automatically resets the seeding start time to 0. The
- * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+ * When a job's specification is changed, the framework automatically resets the seeding version string to null. The
+ * seeding version string may also be set to null on each job run, depending on the connector model returned by
* getConnectorModel().
*
* Note that it is always ok to send MORE documents rather than less to this method.
+ * The connector will be connected before this method can be called.
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
- *@param startTime is the beginning of the time range to consider, inclusive.
- *@param endTime is the end of the time range to consider, exclusive.
+ *@param seedTime is the end of the time range of documents to consider, exclusive.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@return an updated seeding version string, to be stored with the job.
*/
@Override
- public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
- long startTime, long endTime, int jobMode)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
+ long startTime;
+ if (lastSeedVersion == null)
+ startTime = 0L;
+ else
+ {
+ // Unpack seed time from seed version string
+ startTime = new Long(lastSeedVersion).longValue();
+ }
+
getSession();
// Set up the query
@@ -239,7 +249,7 @@ public class JDBCConnector extends org.a
VariableMap vm = new VariableMap();
addConstant(vm,JDBCConstants.idReturnVariable,JDBCConstants.idReturnColumnName);
addVariable(vm,JDBCConstants.startTimeVariable,startTime);
- addVariable(vm,JDBCConstants.endTimeVariable,endTime);
+ addVariable(vm,JDBCConstants.endTimeVariable,seedTime);
// Do the substitution
ArrayList paramList = new ArrayList();
@@ -300,6 +310,7 @@ public class JDBCConnector extends org.a
{
idSet.close();
}
+ return new Long(seedTime).toString();
}
/** Get document versions given an array of document identifiers.
@@ -1811,7 +1822,7 @@ public class JDBCConnector extends org.a
public String versionQuery;
public String dataQuery;
- public TableSpec(DocumentSpecification ds)
+ public TableSpec(Specification ds)
{
int i = 0;
while (i < ds.getChildCount())
Modified: manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -39,6 +39,7 @@ import org.apache.manifoldcf.crawler.sys
import org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector;
import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
import org.apache.manifoldcf.core.interfaces.ConfigParams;
+import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.commons.lang.StringUtils;
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
@@ -812,48 +813,37 @@ public class JiraRepositoryConnector ext
Messages.outputResourceWithVelocity(out,locale,EDIT_SPEC_HEADER_FORWARD,paramMap);
}
- /**
- * Queue "seed" documents. Seed documents are the starting places for
- * crawling activity. Documents are seeded when this method calls
- * appropriate methods in the passed in ISeedingActivity object.
- *
- * This method can choose to find repository changes that happen only during
- * the specified time interval. The seeds recorded by this method will be
- * viewed by the framework based on what the getConnectorModel() method
- * returns.
- *
- * It is not a big problem if the connector chooses to create more seeds
- * than are strictly necessary; it is merely a question of overall work
- * required.
- *
- * The times passed to this method may be interpreted for greatest
- * efficiency. The time ranges any given job uses with this connector will
- * not overlap, but will proceed starting at 0 and going to the "current
- * time", each time the job is run. For continuous crawling jobs, this
- * method will be called once, when the job starts, and at various periodic
- * intervals as the job executes.
- *
- * When a job's specification is changed, the framework automatically resets
- * the seeding start time to 0. The seeding start time may also be set to 0
- * on each job run, depending on the connector model returned by
- * getConnectorModel().
- *
- * Note that it is always ok to send MORE documents rather than less to this
- * method.
- *
- * @param activities is the interface this method should use to perform
- * whatever framework actions are desired.
- * @param spec is a document specification (that comes from the job).
- * @param startTime is the beginning of the time range to consider,
- * inclusive.
- * @param endTime is the end of the time range to consider, exclusive.
- * @param jobMode is an integer describing how the job is being run, whether
- * continuous or once-only.
- */
+ /** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents
+ * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
+ *
+ * This method can choose to find repository changes that happen only during the specified time interval.
+ * The seeds recorded by this method will be viewed by the framework based on what the
+ * getConnectorModel() method returns.
+ *
+ * It is not a big problem if the connector chooses to create more seeds than are
+ * strictly necessary; it is merely a question of overall work required.
+ *
+ * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+ * For continuous crawling jobs, this method will
+ * be called once, when the job starts, and at various periodic intervals as the job executes.
+ *
+ * When a job's specification is changed, the framework automatically resets the seeding version string to null. The
+ * seeding version string may also be set to null on each job run, depending on the connector model returned by
+ * getConnectorModel().
+ *
+ * Note that it is always ok to send MORE documents rather than less to this method.
+ * The connector will be connected before this method can be called.
+ *@param activities is the interface this method should use to perform whatever framework actions are desired.
+ *@param spec is a document specification (that comes from the job).
+ *@param seedTime is the end of the time range of documents to consider, exclusive.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+ *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@return an updated seeding version string, to be stored with the job.
+ */
@Override
- public void addSeedDocuments(ISeedingActivity activities,
- DocumentSpecification spec, long startTime, long endTime, int jobMode)
- throws ManifoldCFException, ServiceInterruption {
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
+ throws ManifoldCFException, ServiceInterruption {
String jiraDriveQuery = JiraConfig.JIRA_QUERY_DEFAULT;
int i = 0;
@@ -906,6 +896,7 @@ public class JiraRepositoryConnector ext
} catch (ResponseException e) {
handleResponseException(e);
}
+ return "";
}
Modified: manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java (original)
+++ manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java Sun Aug 31 18:17:42 2014
@@ -945,14 +945,34 @@ public class LivelinkConnector extends o
/** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents
* are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
+ *
+ * This method can choose to find repository changes that happen only during the specified time interval.
+ * The seeds recorded by this method will be viewed by the framework based on what the
+ * getConnectorModel() method returns.
+ *
+ * It is not a big problem if the connector chooses to create more seeds than are
+ * strictly necessary; it is merely a question of overall work required.
+ *
+ * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+ * For continuous crawling jobs, this method will
+ * be called once, when the job starts, and at various periodic intervals as the job executes.
+ *
+ * When a job's specification is changed, the framework automatically resets the seeding version string to null. The
+ * seeding version string may also be set to null on each job run, depending on the connector model returned by
+ * getConnectorModel().
+ *
+ * Note that it is always ok to send MORE documents rather than less to this method.
+ * The connector will be connected before this method can be called.
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
- *@param startTime is the beginning of the time range to consider, inclusive.
- *@param endTime is the end of the time range to consider, exclusive.
+ *@param seedTime is the end of the time range of documents to consider, exclusive.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+ *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@return an updated seeding version string, to be stored with the job.
*/
@Override
- public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
- long startTime, long endTime)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
getSession();
@@ -1063,7 +1083,7 @@ public class LivelinkConnector extends o
}
}
-
+ return "";
}
Modified: manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java (original)
+++ manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java Sun Aug 31 18:17:42 2014
@@ -67,6 +67,8 @@ public class MeridioConnector extends or
protected String urlBase = null;
protected String urlVersionBase = null;
+ private final static int maxHitsToReturn = 100;
+
/** Deny access token for Meridio */
private final static String denyToken = GLOBAL_DENY_TOKEN;
@@ -563,39 +565,130 @@ public class MeridioConnector extends or
return true;
}
- /** Given a document specification, get either a list of starting document identifiers (seeds),
- * or a list of changes (deltas), depending on whether this is a "crawled" connector or not.
- * These document identifiers will be loaded into the job's queue at the beginning of the
- * job's execution.
- * This method can return changes only (because it is provided a time range). For full
- * recrawls, the start time is always zero.
- * Note that it is always ok to return MORE documents rather than less with this method.
+ /** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents
+ * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
+ *
+ * This method can choose to find repository changes that happen only during the specified time interval.
+ * The seeds recorded by this method will be viewed by the framework based on what the
+ * getConnectorModel() method returns.
+ *
+ * It is not a big problem if the connector chooses to create more seeds than are
+ * strictly necessary; it is merely a question of overall work required.
+ *
+ * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+ * For continuous crawling jobs, this method will
+ * be called once, when the job starts, and at various periodic intervals as the job executes.
+ *
+ * When a job's specification is changed, the framework automatically resets the seeding version string to null. The
+ * seeding version string may also be set to null on each job run, depending on the connector model returned by
+ * getConnectorModel().
+ *
+ * Note that it is always ok to send MORE documents rather than less to this method.
+ * The connector will be connected before this method can be called.
+ *@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
- *@param startTime is the beginning of the time range to consider, inclusive.
- *@param endTime is the end of the time range to consider, exclusive.
- *@return the stream of local document identifiers that should be added to the queue.
+ *@param seedTime is the end of the time range of documents to consider, exclusive.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+ *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@return an updated seeding version string, to be stored with the job.
*/
@Override
- public IDocumentIdentifierStream getDocumentIdentifiers(DocumentSpecification spec, long startTime, long endTime)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
- Logging.connectors.debug("Meridio: Entering 'getDocumentIdentifiers' method");
-
- try
+ Logging.connectors.debug("Meridio: Entering 'addSeedDocuments' method");
+ long startTime;
+ if (lastSeedVersion == null)
+ startTime = 0L;
+ else
{
- // Adjust start time so that we don't miss documents that squeeze in with earlier timestamps after we've already scanned that interval.
- // Chose an interval of 15 minutes, but I've never seen this effect take place over a time interval even 1/10 of that.
- long timeAdjust = 15L * 60000L;
- if (startTime > timeAdjust)
- startTime -= timeAdjust;
- else
- startTime = 0L;
- return new IdentifierStream(spec, startTime, endTime);
+ // Unpack seed time from seed version string
+ startTime = new Long(lastSeedVersion).longValue();
}
- finally
+ // Adjust start time so that we don't miss documents that squeeze in with earlier timestamps after we've already scanned that interval.
+ // Chose an interval of 15 minutes, but I've never seen this effect take place over a time interval even 1/10 of that.
+ long timeAdjust = 15L * 60000L;
+ if (startTime > timeAdjust)
+ startTime -= timeAdjust;
+ else
+ startTime = 0L;
+
+ while (true)
{
- Logging.connectors.debug("Meridio: Exiting 'getDocumentIdentifiers' method");
+ getSession();
+
+ try
+ {
+ DMSearchResults searchResults;
+ int numResultsReturnedByStream = 0;
+
+ while (true)
+ {
+ searchResults = documentSpecificationSearch(spec,
+ startTime, seedTime, numResultsReturnedByStream + 1, maxHitsToReturn);
+
+ for (int i = 0; i < searchResults.returnedHitsCount; i++)
+ {
+ long documentId =
+ searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS()[i].getDocId();
+
+ String strDocumentId = new Long(documentId).toString();
+ activities.addSeedDocument(strDocumentId);
+ }
+
+ numResultsReturnedByStream += searchResults.returnedHitsCount;
+ if (numResultsReturnedByStream == searchResults.totalHitsCount)
+ break;
+ }
+ return new Long(seedTime).toString();
+ }
+ catch (org.apache.axis.AxisFault e)
+ {
+ long currentTime = System.currentTimeMillis();
+ if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HTTP")))
+ {
+ org.w3c.dom.Element elem = e.lookupFaultDetail(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HttpErrorCode"));
+ if (elem != null)
+ {
+ elem.normalize();
+ String httpErrorCode = elem.getFirstChild().getNodeValue().trim();
+ throw new ManifoldCFException("Unexpected http error code "+httpErrorCode+" accessing Meridio: "+e.getMessage(),e);
+ }
+ throw new ManifoldCFException("Unknown http error occurred while performing search: "+e.getMessage(),e);
+ }
+ if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server.userException")))
+ {
+ String exceptionName = e.getFaultString();
+ if (exceptionName.equals("java.lang.InterruptedException"))
+ throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
+ }
+ if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server")))
+ {
+ if (e.getFaultString().indexOf(" 23031#") != -1)
+ {
+ // This means that the session has expired, so reset it and retry
+ meridio_ = null;
+ continue;
+ }
+ }
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Meridio: Got an unknown remote exception while performing search - axis fault = "+e.getFaultCode().getLocalPart()+", detail = "+e.getFaultString()+" - retrying",e);
+ throw new ServiceInterruption("Remote procedure exception: "+e.getMessage(), e, currentTime + 300000L,
+ currentTime + 3 * 60 * 60000L,-1,false);
+ }
+ catch (RemoteException remoteException)
+ {
+ throw new ManifoldCFException("Meridio: A Remote Exception occurred while " +
+ "performing a search: "+remoteException.getMessage(), remoteException);
+ }
+ catch (MeridioDataSetException meridioDataSetException)
+ {
+ throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " +
+ "Service XML: "+meridioDataSetException.getMessage(), meridioDataSetException);
+ }
}
+
}
@@ -3723,7 +3816,7 @@ public class MeridioConnector extends or
private static String [] getMIMETypes
(
- DocumentSpecification spec
+ Specification spec
)
{
ArrayList al = new ArrayList ();
@@ -3760,7 +3853,7 @@ public class MeridioConnector extends or
*/
private DMSearchResults documentSpecificationSearch
(
- DocumentSpecification docSpec, // The castor representation of the Document Specification
+ Specification docSpec, // The castor representation of the Document Specification
long startTime,
long endTime,
int startPositionOfHits,
@@ -3794,7 +3887,7 @@ public class MeridioConnector extends or
*/
private DMSearchResults documentSpecificationSearch
(
- DocumentSpecification docSpec, // The castor representation of the Document Specification
+ Specification docSpec, // The castor representation of the Document Specification
long startTime,
long endTime,
int startPositionOfHits,
@@ -3838,7 +3931,7 @@ public class MeridioConnector extends or
*/
protected DMSearchResults documentSpecificationSearch
(
- DocumentSpecification docSpec,
+ Specification docSpec,
long startTime,
long endTime,
int startPositionOfHits,
@@ -4342,208 +4435,6 @@ public class MeridioConnector extends or
}
- private final static int maxHitsToReturn = 100;
-
- /** Document identifier stream.
- */
- protected class IdentifierStream implements IDocumentIdentifierStream
- {
- protected DMSearchResults searchResults = null;
- protected int currentResult = 0;
- protected int numResultsReturnedByStream = 0;
-
- DocumentSpecification spec_ = null;
- long startTime_ = 0L;
- long endTime_ = 0L;
-
-
- public IdentifierStream
- (
- DocumentSpecification spec,
- long startTime,
- long endTime
- )
- throws ManifoldCFException,ServiceInterruption
- {
- Logging.connectors.debug("Meridio: Entering 'IdentifierStream' constructor");
- while (true)
- {
- getSession();
-
- try
- {
- spec_ = spec;
- startTime_ = startTime;
- endTime_ = endTime;
-
- searchResults = documentSpecificationSearch(spec,
- startTime, endTime, 1, maxHitsToReturn);
-
- Logging.connectors.debug("Meridio: Exiting 'IdentifierStream' constructor");
-
- return;
- }
- catch (org.apache.axis.AxisFault e)
- {
- long currentTime = System.currentTimeMillis();
- if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HTTP")))
- {
- org.w3c.dom.Element elem = e.lookupFaultDetail(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HttpErrorCode"));
- if (elem != null)
- {
- elem.normalize();
- String httpErrorCode = elem.getFirstChild().getNodeValue().trim();
- throw new ManifoldCFException("Unexpected http error code "+httpErrorCode+" accessing Meridio: "+e.getMessage(),e);
- }
- throw new ManifoldCFException("Unknown http error occurred while performing search: "+e.getMessage(),e);
- }
- if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server.userException")))
- {
- String exceptionName = e.getFaultString();
- if (exceptionName.equals("java.lang.InterruptedException"))
- throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
- }
- if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server")))
- {
- if (e.getFaultString().indexOf(" 23031#") != -1)
- {
- // This means that the session has expired, so reset it and retry
- meridio_ = null;
- continue;
- }
- }
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("Meridio: Got an unknown remote exception while performing search - axis fault = "+e.getFaultCode().getLocalPart()+", detail = "+e.getFaultString()+" - retrying",e);
- throw new ServiceInterruption("Remote procedure exception: "+e.getMessage(), e, currentTime + 300000L,
- currentTime + 3 * 60 * 60000L,-1,false);
- }
- catch (RemoteException remoteException)
- {
- throw new ManifoldCFException("Meridio: A Remote Exception occurred while " +
- "performing a search: "+remoteException.getMessage(), remoteException);
- }
- catch (MeridioDataSetException meridioDataSetException)
- {
- throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " +
- "Service XML: "+meridioDataSetException.getMessage(), meridioDataSetException);
- }
- }
- }
-
-
-
- /** Get the next identifier.
- *@return the next document identifier, or null if there are no more.
- */
- public String getNextIdentifier()
- throws ManifoldCFException, ServiceInterruption
- {
- Logging.connectors.debug("Meridio: Entering 'getNextIdentifier' method");
-
- try
- {
- if (null == searchResults ||
- numResultsReturnedByStream == searchResults.totalHitsCount)
- {
- return null;
- }
-
- if (currentResult == searchResults.returnedHitsCount)
- {
- while (true)
- {
- getSession();
- try
- {
- searchResults = documentSpecificationSearch(spec_,
- startTime_, endTime_, numResultsReturnedByStream + 1,
- maxHitsToReturn);
-
- currentResult = 0;
- break;
- }
- catch (org.apache.axis.AxisFault e)
- {
- long currentTime = System.currentTimeMillis();
- if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HTTP")))
- {
- org.w3c.dom.Element elem = e.lookupFaultDetail(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HttpErrorCode"));
- if (elem != null)
- {
- elem.normalize();
- String httpErrorCode = elem.getFirstChild().getNodeValue().trim();
- throw new ManifoldCFException("Unexpected http error code "+httpErrorCode+" performing search: "+e.getMessage());
- }
- throw new ManifoldCFException("Unknown http error occurred while performing search: "+e.getMessage(),e);
- }
- if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server.userException")))
- {
- String exceptionName = e.getFaultString();
- if (exceptionName.equals("java.lang.InterruptedException"))
- throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
- }
- if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server")))
- {
- if (e.getFaultString().indexOf(" 23031#") != -1)
- {
- // This means that the session has expired, so reset it and retry
- meridio_ = null;
- continue;
- }
- }
-
- throw new ManifoldCFException("Meridio: Got an unknown remote exception performing search - axis fault = "+e.getFaultCode().getLocalPart()+", detail = "+e.getFaultString(),e);
- }
- catch (RemoteException remoteException)
- {
- throw new ServiceInterruption("Meridio: A Remote Exception occurred while " +
- "performing a Meridio search: "+remoteException.getMessage(), remoteException,
- System.currentTimeMillis() + interruptionRetryTime,
- -1L, -1, true);
- }
- catch (MeridioDataSetException meridioDataSetException)
- {
- throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " +
- "Service XML: "+meridioDataSetException.getMessage(), meridioDataSetException);
- }
- }
- }
-
- long documentId =
- searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS()[currentResult].getDocId();
-
- String strDocumentId = new Long(documentId).toString();
-
- currentResult++;
- numResultsReturnedByStream++;
-
- return strDocumentId;
- }
- finally
- {
- Logging.connectors.debug("Meridio: Exiting 'getNextIdentifier' method");
- }
- }
-
-
-
- /** Close the stream.
- */
- public void close()
- throws ManifoldCFException
- {
- Logging.connectors.debug("Meridio: Entering 'IdentifierStream.close' method");
-
- searchResults = null;
- currentResult = 0;
- numResultsReturnedByStream = 0;
-
- Logging.connectors.debug("Meridio: Exiting 'IdentifierStream.close' method");
- }
- }
-
-
-
/** Returns the categories set up in the Meridio system; these are used by the UI for two
* purposes
*
Modified: manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java (original)
+++ manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java Sun Aug 31 18:17:42 2014
@@ -98,15 +98,8 @@ public class OpenSearchServerConnector e
private HttpClient client = null;
private long expirationTime = -1L;
- // Private data
-
- private String specsCacheOutpuDescription;
- private OpenSearchServerSpecs specsCache;
-
public OpenSearchServerConnector()
{
- specsCacheOutpuDescription = null;
- specsCache = null;
}
@Override
@@ -377,27 +370,6 @@ public class OpenSearchServerConnector e
return new OpenSearchServerSpecs(getSpecNode(os));
}
- final private OpenSearchServerSpecs getSpecsCache(String outputDescription)
- throws ManifoldCFException
- {
- try
- {
- synchronized (this)
- {
- if (!outputDescription.equals(specsCacheOutpuDescription))
- specsCache = null;
- if (specsCache == null)
- specsCache = new OpenSearchServerSpecs(new JSONObject(
- outputDescription));
- return specsCache;
- }
- }
- catch (JSONException e)
- {
- throw new ManifoldCFException(e);
- }
- }
-
@Override
public VersionContext getPipelineDescription(Specification os)
throws ManifoldCFException
@@ -410,7 +382,7 @@ public class OpenSearchServerConnector e
public boolean checkLengthIndexable(VersionContext outputDescription, long length, IOutputCheckActivity activities)
throws ManifoldCFException, ServiceInterruption
{
- OpenSearchServerSpecs specs = getSpecsCache(outputDescription.getVersionString());
+ OpenSearchServerSpecs specs = new OpenSearchServerSpecs(getSpecNode(outputDescription.getSpecification()));
long maxFileSize = specs.getMaxFileSize();
if (length > maxFileSize)
return false;
@@ -421,7 +393,7 @@ public class OpenSearchServerConnector e
public boolean checkMimeTypeIndexable(VersionContext outputDescription, String mimeType, IOutputCheckActivity activities)
throws ManifoldCFException, ServiceInterruption
{
- OpenSearchServerSpecs specs = getSpecsCache(outputDescription.getVersionString());
+ OpenSearchServerSpecs specs = new OpenSearchServerSpecs(getSpecNode(outputDescription.getSpecification()));
return specs.checkMimeType(mimeType);
}
@@ -440,7 +412,7 @@ public class OpenSearchServerConnector e
public boolean checkURLIndexable(VersionContext outputDescription, String url, IOutputCheckActivity activities)
throws ManifoldCFException, ServiceInterruption
{
- OpenSearchServerSpecs specs = getSpecsCache(outputDescription.getVersionString());
+ OpenSearchServerSpecs specs = new OpenSearchServerSpecs(getSpecNode(outputDescription.getSpecification()));
return specs.checkExtension(FilenameUtils.getExtension(url));
}
@@ -482,12 +454,12 @@ public class OpenSearchServerConnector e
return null;
}
- private static Map<String, Integer> ossInstances = null;
+ // Apparently, only one connection to any given Open Search Server instance is allowed at a time.
+
+ private static Map<String, Integer> ossInstances = new TreeMap<String, Integer>();
- private synchronized final Integer addInstance(OpenSearchServerConfig config)
+ private final Integer addInstance(OpenSearchServerConfig config)
{
- if (ossInstances == null)
- ossInstances = new TreeMap<String, Integer>();
synchronized (ossInstances)
{
String uii = config.getUniqueIndexIdentifier();
@@ -503,10 +475,8 @@ public class OpenSearchServerConnector e
}
}
- private synchronized final void removeInstance(OpenSearchServerConfig config)
+ private final void removeInstance(OpenSearchServerConfig config)
{
- if (ossInstances == null)
- return;
synchronized (ossInstances)
{
String uii = config.getUniqueIndexIdentifier();
Modified: manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerSpecs.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerSpecs.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerSpecs.java (original)
+++ manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerSpecs.java Sun Aug 31 18:17:42 2014
@@ -48,29 +48,6 @@ public class OpenSearchServerSpecs exten
private Set<String> mimeTypeSet;
/**
- * Build a set of OpenSearchServer parameters by reading an JSON object
- *
- * @param json
- * @throws JSONException
- * @throws ManifoldCFException
- */
- public OpenSearchServerSpecs(JSONObject json) throws JSONException,
- ManifoldCFException {
- super(SPECIFICATIONLIST);
- extensionSet = null;
- mimeTypeSet = null;
- for (ParameterEnum param : SPECIFICATIONLIST) {
- String value = null;
- value = json.getString(param.name());
- if (value == null)
- value = param.defaultValue;
- put(param, value);
- }
- extensionSet = createStringSet(getExtensions());
- mimeTypeSet = createStringSet(getMimeTypes());
- }
-
- /**
* Build a set of OpenSearchServer parameters by reading an instance of
* SpecificationNode.
*
Modified: manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Sun Aug 31 18:17:42 2014
@@ -369,24 +369,26 @@ public class RSSConnector extends org.ap
* It is not a big problem if the connector chooses to create more seeds than are
* strictly necessary; it is merely a question of overall work required.
*
- * The times passed to this method may be interpreted for greatest efficiency. The time ranges
- * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
- * to the "current time", each time the job is run. For continuous crawling jobs, this method will
+ * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+ * For continuous crawling jobs, this method will
* be called once, when the job starts, and at various periodic intervals as the job executes.
*
- * When a job's specification is changed, the framework automatically resets the seeding start time to 0. The
- * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+ * When a job's specification is changed, the framework automatically resets the seeding version string to null. The
+ * seeding version string may also be set to null on each job run, depending on the connector model returned by
* getConnectorModel().
*
* Note that it is always ok to send MORE documents rather than less to this method.
+ * The connector will be connected before this method can be called.
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
- *@param startTime is the beginning of the time range to consider, inclusive.
- *@param endTime is the end of the time range to consider, exclusive.
+ *@param seedTime is the end of the time range of documents to consider, exclusive.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+ *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@return an updated seeding version string, to be stored with the job.
*/
@Override
- public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
- long startTime, long endTime)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
getSession();
@@ -400,6 +402,7 @@ public class RSSConnector extends org.ap
String canonicalURL = iter.next();
activities.addSeedDocument(canonicalURL);
}
+ return "";
}
/** Convert an absolute or relative URL to a document identifier. This may involve several steps at some point,
@@ -5723,7 +5726,7 @@ public class RSSConnector extends org.ap
protected final List<Pattern> excludePatterns = new ArrayList<Pattern>();
/** Constructor. */
- public Filter(DocumentSpecification spec, boolean warnOnBadSeed)
+ public Filter(Specification spec, boolean warnOnBadSeed)
throws ManifoldCFException
{
String excludes = "";
Modified: manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java (original)
+++ manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java Sun Aug 31 18:17:42 2014
@@ -650,31 +650,33 @@ public class SharePointRepository extend
* It is not a big problem if the connector chooses to create more seeds than are
* strictly necessary; it is merely a question of overall work required.
*
- * The times passed to this method may be interpreted for greatest efficiency. The time ranges
- * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
- * to the "current time", each time the job is run. For continuous crawling jobs, this method will
+ * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+ * For continuous crawling jobs, this method will
* be called once, when the job starts, and at various periodic intervals as the job executes.
*
- * When a job's specification is changed, the framework automatically resets the seeding start time to 0. The
- * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+ * When a job's specification is changed, the framework automatically resets the seeding version string to null. The
+ * seeding version string may also be set to null on each job run, depending on the connector model returned by
* getConnectorModel().
*
* Note that it is always ok to send MORE documents rather than less to this method.
+ * The connector will be connected before this method can be called.
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
- *@param startTime is the beginning of the time range to consider, inclusive.
- *@param endTime is the end of the time range to consider, exclusive.
+ *@param seedTime is the end of the time range of documents to consider, exclusive.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@return an updated seeding version string, to be stored with the job.
*/
@Override
- public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
- long startTime, long endTime, int jobMode)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
// Check the session
getSession();
// Add just the root.
activities.addSeedDocument("/");
+ return "";
}
Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java Sun Aug 31 18:17:42 2014
@@ -565,7 +565,7 @@ public class SolrConnector extends org.a
public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption, IOException
{
- SpecPacker sp = new SpecPacker(pipelineDescription.getVersionString());
+ SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
// Establish a session
getSession();
@@ -2284,35 +2284,6 @@ public class SolrConnector extends org.a
}
- /** Packed string parser.
- * This method unpacks a packed version string, and makes the formerly packed data available for use.
- * Note that it is actually *not* a requirement for this method to do the unpacking; that can happen "on demand"
- * for performance, if deemed helpful.
- */
- public SpecPacker(String packedString) {
- // Build the argument map we'll send.
- int index = 0;
- List<String> nameValues = new ArrayList<String>();
- index = unpackList(nameValues,packedString,index,'+');
-
- String[] fixedBuffer = new String[2];
-
- // Do the name/value pairs
- for (String x : nameValues)
- {
- unpackFixedList(fixedBuffer,x,0,'=');
- String attrName = fixedBuffer[0];
- List<String> list = args.get(attrName);
- if (list == null)
- {
- list = new ArrayList<String>();
- args.put(attrName,list);
- }
- list.add(fixedBuffer[1]);
- }
-
- }
-
public String toPackedString() {
StringBuilder sb = new StringBuilder();
String[] sortArray = new String[args.size()];
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Sun Aug 31 18:17:42 2014
@@ -493,24 +493,26 @@ public class WebcrawlerConnector extends
* It is not a big problem if the connector chooses to create more seeds than are
* strictly necessary; it is merely a question of overall work required.
*
- * The times passed to this method may be interpreted for greatest efficiency. The time ranges
- * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
- * to the "current time", each time the job is run. For continuous crawling jobs, this method will
+ * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+ * For continuous crawling jobs, this method will
* be called once, when the job starts, and at various periodic intervals as the job executes.
*
- * When a job's specification is changed, the framework automatically resets the seeding start time to 0. The
- * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+ * When a job's specification is changed, the framework automatically resets the seeding version string to null. The
+ * seeding version string may also be set to null on each job run, depending on the connector model returned by
* getConnectorModel().
*
* Note that it is always ok to send MORE documents rather than less to this method.
+ * The connector will be connected before this method can be called.
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
- *@param startTime is the beginning of the time range to consider, inclusive.
- *@param endTime is the end of the time range to consider, exclusive.
+ *@param seedTime is the end of the time range of documents to consider, exclusive.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+ *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@return an updated seeding version string, to be stored with the job.
*/
@Override
- public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
- long startTime, long endTime)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
getSession();
@@ -544,6 +546,7 @@ public class WebcrawlerConnector extends
}
activities.addSeedDocument(documentIdentifier,calculateDocumentEvents(activities,documentIdentifier));
}
+ return "";
}
// Session login states (so we can use the same fetch logic multiple times)
@@ -7206,7 +7209,7 @@ public class WebcrawlerConnector extends
* This should be checked at save time to prevent errors. Any syntax errors found here
* will thus cause the include or exclude regexp to be skipped.
*/
- public DocumentURLFilter(DocumentSpecification spec)
+ public DocumentURLFilter(Specification spec)
throws ManifoldCFException
{
String includes = ".*";
Modified: manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Sun Aug 31 18:17:42 2014
@@ -849,26 +849,37 @@ public class WikiConnector extends org.a
* It is not a big problem if the connector chooses to create more seeds than are
* strictly necessary; it is merely a question of overall work required.
*
- * The times passed to this method may be interpreted for greatest efficiency. The time ranges
- * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
- * to the "current time", each time the job is run. For continuous crawling jobs, this method will
+ * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+ * For continuous crawling jobs, this method will
* be called once, when the job starts, and at various periodic intervals as the job executes.
*
- * When a job's specification is changed, the framework automatically resets the seeding start time to 0. The
- * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+ * When a job's specification is changed, the framework automatically resets the seeding version string to null. The
+ * seeding version string may also be set to null on each job run, depending on the connector model returned by
* getConnectorModel().
*
* Note that it is always ok to send MORE documents rather than less to this method.
+ * The connector will be connected before this method can be called.
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
- *@param startTime is the beginning of the time range to consider, inclusive.
- *@param endTime is the end of the time range to consider, exclusive.
+ *@param seedTime is the end of the time range of documents to consider, exclusive.
+ *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+ *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+ *@return an updated seeding version string, to be stored with the job.
*/
@Override
- public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
- long startTime, long endTime)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
+ long startTime;
+ if (lastSeedVersion == null)
+ startTime = 0L;
+ else
+ {
+ // Unpack seed time from seed version string
+ startTime = new Long(lastSeedVersion).longValue();
+ }
+
// Scan specification nodes and extract prefixes and namespaces
boolean seenAny = false;
for (int i = 0 ; i < spec.getChildCount() ; i++)
@@ -878,12 +889,14 @@ public class WikiConnector extends org.a
{
String namespace = sn.getAttributeValue(WikiConfig.ATTR_NAMESPACE);
String titleprefix = sn.getAttributeValue(WikiConfig.ATTR_TITLEPREFIX);
- listAllPages(activities,namespace,titleprefix,startTime,endTime);
+ listAllPages(activities,namespace,titleprefix,startTime,seedTime);
seenAny = true;
}
}
if (!seenAny)
- listAllPages(activities,null,null,startTime,endTime);
+ listAllPages(activities,null,null,startTime,seedTime);
+
+ return new Long(seedTime).toString();
}
/** Get document versions given an array of document identifiers.
Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -154,6 +154,8 @@ public abstract class BaseRepositoryConn
String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
+ return "";
+ /*
long startTime;
if (lastSeedVersion == null)
startTime = 0L;
@@ -164,6 +166,7 @@ public abstract class BaseRepositoryConn
}
addSeedDocuments(activities,spec,startTime,seedTime,jobMode);
return new Long(seedTime).toString();
+ */
}
/** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents
@@ -192,6 +195,7 @@ public abstract class BaseRepositoryConn
*@param endTime is the end of the time range to consider, exclusive.
*@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
*/
+ /*
public void addSeedDocuments(ISeedingActivity activities, Specification spec,
long startTime, long endTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
@@ -205,7 +209,8 @@ public abstract class BaseRepositoryConn
{
addSeedDocuments(activities,spec,startTime,endTime);
}
-
+ */
+
/** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents
* are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
*
@@ -231,6 +236,7 @@ public abstract class BaseRepositoryConn
*@param startTime is the beginning of the time range to consider, inclusive.
*@param endTime is the end of the time range to consider, exclusive.
*/
+ /*
public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
long startTime, long endTime)
throws ManifoldCFException, ServiceInterruption
@@ -272,7 +278,8 @@ public abstract class BaseRepositoryConn
}
}
}
-
+ */
+
/** The long version of getDocumentIdentifiers.
*@param activities is the interface this method should use to perform whatever framework actions are desired.
*@param spec is a document specification (that comes from the job).
@@ -280,19 +287,22 @@ public abstract class BaseRepositoryConn
*@param endTime is the end of the time range to consider, exclusive.
*@return the local document identifiers that should be added to the queue, as a stream.
*/
+ /*
public IDocumentIdentifierStream getDocumentIdentifiers(ISeedingActivity activities, DocumentSpecification spec,
long startTime, long endTime)
throws ManifoldCFException, ServiceInterruption
{
return getDocumentIdentifiers(spec,startTime,endTime);
}
-
+ */
+
/** The short version of getDocumentIdentifiers.
*@param spec is a document specification (that comes from the job).
*@param startTime is the beginning of the time range to consider, inclusive.
*@param endTime is the end of the time range to consider, exclusive.
*@return the local document identifiers that should be added to the queue, as a stream.
*/
+ /*
public IDocumentIdentifierStream getDocumentIdentifiers(DocumentSpecification spec,
long startTime, long endTime)
throws ManifoldCFException, ServiceInterruption
@@ -300,7 +310,8 @@ public abstract class BaseRepositoryConn
// Something provided here so we can override either one.
return null;
}
-
+ */
+
/** This method returns the document identifiers that should be considered part of the seeds, but do not need to be
* queued for processing at this time. This method is used to keep the hopcount tables up to date. It is
* allowed to return more identifiers than it strictly needs to, specifically identifiers that were also returned
@@ -316,6 +327,7 @@ public abstract class BaseRepositoryConn
*@return the local document identifiers that should be added to the queue, as a stream, or null, if none need to be
* returned.
*/
+ /*
public IDocumentIdentifierStream getRemainingDocumentIdentifiers(ISeedingActivity activities, DocumentSpecification spec,
long startTime, long endTime)
throws ManifoldCFException, ServiceInterruption
@@ -323,7 +335,8 @@ public abstract class BaseRepositoryConn
// Usually we don't need to worry about this.
return null;
}
-
+ */
+
/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
* to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
Modified: manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -34,8 +34,8 @@ public class InterruptionRepositoryConne
}
@Override
- public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
- long startTime, long endTime, int jobMode)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
String docCount = "10";
@@ -52,6 +52,7 @@ public class InterruptionRepositoryConne
String doc = "test"+i+".txt";
activities.addSeedDocument(doc,null);
}
+ return "";
}
@Override
Modified: manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -44,8 +44,8 @@ public class SchedulingRepositoryConnect
}
@Override
- public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
- long startTime, long endTime, int jobMode)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
// A seed per domain
@@ -58,6 +58,7 @@ public class SchedulingRepositoryConnect
activities.addSeedDocument(Integer.toString(i)+"/",null);
}
System.out.println("Seeding completed at "+System.currentTimeMillis());
+ return "";
}
@Override
Modified: manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -34,8 +34,8 @@ public class TestingRepositoryConnector
}
@Override
- public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
- long startTime, long endTime, int jobMode)
+ public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+ String lastSeedVersion, long seedTime, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
String docCount = "3";
@@ -52,6 +52,7 @@ public class TestingRepositoryConnector
String doc = "test"+i+".txt";
activities.addSeedDocument(doc,null);
}
+ return "";
}
@Override