You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/08/31 20:17:43 UTC

svn commit: r1621613 [2/2] - in /manifoldcf/trunk: connectors/alfresco/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfresco/ connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/ connectors/document...

Modified: manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java Sun Aug 31 18:17:42 2014
@@ -51,12 +51,14 @@ import org.apache.manifoldcf.core.interf
 import org.apache.manifoldcf.core.interfaces.IPasswordMapperActivity;
 import org.apache.manifoldcf.core.interfaces.IPostParameters;
 import org.apache.manifoldcf.core.interfaces.ConfigParams;
+import org.apache.manifoldcf.core.interfaces.Specification;
 import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
 import org.apache.manifoldcf.core.interfaces.IKeystoreManager;
 import org.apache.manifoldcf.core.interfaces.KeystoreManagerFactory;
 import org.apache.manifoldcf.core.interfaces.Configuration;
 import org.apache.manifoldcf.core.interfaces.ConfigurationNode;
 import org.apache.manifoldcf.core.interfaces.LockManagerFactory;
+import org.apache.manifoldcf.crawler.interfaces.ISeedingActivity;
 import org.apache.manifoldcf.crawler.interfaces.DocumentSpecification;
 import org.apache.manifoldcf.crawler.interfaces.IDocumentIdentifierStream;
 import org.apache.manifoldcf.crawler.interfaces.IProcessActivity;
@@ -429,24 +431,69 @@ public class SharedDriveConnector extend
   }
   
   
-  /** Given a document specification, get either a list of starting document identifiers (seeds),
-  * or a list of changes (deltas), depending on whether this is a "crawled" connector or not.
-  * These document identifiers will be loaded into the job's queue at the beginning of the
-  * job's execution.
-  * This method can return changes only (because it is provided a time range).  For full
-  * recrawls, the start time is always zero.
-  * Note that it is always ok to return MORE documents rather than less with this method.
+  /** Queue "seed" documents.  Seed documents are the starting places for crawling activity.  Documents
+  * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
+  *
+  * This method can choose to find repository changes that happen only during the specified time interval.
+  * The seeds recorded by this method will be viewed by the framework based on what the
+  * getConnectorModel() method returns.
+  *
+  * It is not a big problem if the connector chooses to create more seeds than are
+  * strictly necessary; it is merely a question of overall work required.
+  *
+  * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+  * For continuous crawling jobs, this method will
+  * be called once, when the job starts, and at various periodic intervals as the job executes.
+  *
+  * When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
+  * seeding version string may also be set to null on each job run, depending on the connector model returned by
+  * getConnectorModel().
+  *
+  * Note that it is always ok to send MORE documents rather than less to this method.
+  * The connector will be connected before this method can be called.
+  *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
-  *@param startTime is the beginning of the time range to consider, inclusive.
-  *@param endTime is the end of the time range to consider, exclusive.
-  *@return the stream of local document identifiers that should be added to the queue.
+  *@param seedTime is the end of the time range of documents to consider, exclusive.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@return an updated seeding version string, to be stored with the job.
   */
   @Override
-  public IDocumentIdentifierStream getDocumentIdentifiers(DocumentSpecification spec, long startTime, long endTime)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
     getSession();
-    return new IdentifierStream(spec);
+    try
+    {
+      for (int i = 0; i < spec.getChildCount(); i++)
+      {
+        SpecificationNode n = spec.getChild(i);
+        if (n.getType().equals(NODE_STARTPOINT))
+        {
+          // The id returned MUST be in canonical form!!!
+          String seed = mapToIdentifier(n.getAttributeValue(ATTRIBUTE_PATH));
+          if (Logging.connectors.isDebugEnabled())
+          {
+            Logging.connectors.debug("Seed = '"+seed+"'");
+          }
+          activities.addSeedDocument(seed);
+        }
+      }
+    }
+    catch (java.net.SocketTimeoutException e)
+    {
+      throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
+    }
+    catch (InterruptedIOException e)
+    {
+      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+    }
+    catch (IOException e)
+    {
+      throw new ManifoldCFException("Could not get a canonical path: "+e.getMessage(),e);
+    }
+    return "";
   }
 
 
@@ -812,7 +859,7 @@ public class SharedDriveConnector extend
                         // method has no way of signalling this, since it does not do the fingerprinting.
                         if (Logging.connectors.isDebugEnabled())
                           Logging.connectors.debug("JCIFS: Decided to remove '"+documentIdentifier+"'");
-                        activities.deleteDocument(documentIdentifier, version);
+                        activities.noDocument(documentIdentifier, version);
                         // We should record the access here as well, since this is a non-exception way through the code path.
                         // (I noticed that this was not being recorded in the history while fixing 25477.)
                         activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
@@ -878,7 +925,7 @@ public class SharedDriveConnector extend
         activities.recordActivity(null,ACTIVITY_ACCESS,
           null,documentIdentifier,"Skip","Authorization: "+e.getMessage(),null);
         // We call the delete even if it's a directory; this is harmless.
-        activities.deleteDocument(documentIdentifier, version);
+        activities.noDocument(documentIdentifier, version);
       }
       catch (SmbException se)
       {
@@ -937,7 +984,7 @@ public class SharedDriveConnector extend
             Logging.connectors.debug("JCIFS: Skipping document/directory "+documentIdentifier+" because it cannot be found");
           activities.recordActivity(null,ACTIVITY_ACCESS,
             null,documentIdentifier,"Not found",null,null);
-          activities.deleteDocument(documentIdentifier, version);
+          activities.noDocument(documentIdentifier, version);
         }
         else if (se.getMessage().indexOf("is denied") != -1)
         {
@@ -945,7 +992,7 @@ public class SharedDriveConnector extend
           // We call the delete even if it's a directory; this is harmless and it cleans up the jobqueue row.
           activities.recordActivity(null,ACTIVITY_ACCESS,
             null,documentIdentifier,"Skip","Authorization: "+se.getMessage(),null);
-          activities.deleteDocument(documentIdentifier, version);
+          activities.noDocument(documentIdentifier, version);
         }
         else
         {
@@ -2642,85 +2689,6 @@ public class SharedDriveConnector extend
     return e1m.equals(e2m);
   }
 
-  /** Document identifier stream.
-  */
-  protected class IdentifierStream implements IDocumentIdentifierStream
-  {
-    protected String[] ids = null;
-    protected int currentIndex = 0;
-
-    public IdentifierStream(DocumentSpecification spec)
-      throws ManifoldCFException
-    {
-      try
-      {
-        // Walk the specification for the "startpoint" types.  Amalgamate these into a list of strings.
-        // Presume that all roots are startpoint nodes
-        int i = 0;
-        int j = 0;
-        while (i < spec.getChildCount())
-        {
-          SpecificationNode n = spec.getChild(i);
-          if (n.getType().equals(NODE_STARTPOINT))
-            j++;
-          i++;
-        }
-        ids = new String[j];
-        i = 0;
-        j = 0;
-        while (i < ids.length)
-        {
-          SpecificationNode n = spec.getChild(i);
-          if (n.getType().equals(NODE_STARTPOINT))
-          {
-            // The id returned MUST be in canonical form!!!
-            ids[j] = mapToIdentifier(n.getAttributeValue(ATTRIBUTE_PATH));
-
-            if (Logging.connectors.isDebugEnabled())
-            {
-              Logging.connectors.debug("Seed = '"+ids[j]+"'");
-            }
-            j++;
-          }
-          i++;
-        }
-      }
-      catch (java.net.SocketTimeoutException e)
-      {
-        throw new ManifoldCFException("Couldn't map to canonical path: "+e.getMessage(),e);
-      }
-      catch (InterruptedIOException e)
-      {
-        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-      }
-      catch (IOException e)
-
-      {
-        throw new ManifoldCFException("Could not get a canonical path: "+e.getMessage(),e);
-      }
-    }
-
-    /** Get the next identifier.
-    *@return the next document identifier, or null if there are no more.
-    */
-    public String getNextIdentifier()
-      throws ManifoldCFException, ServiceInterruption
-    {
-      if (currentIndex == ids.length)
-        return null;
-      return ids[currentIndex++];
-    }
-
-    /** Close the stream.
-    */
-    public void close()
-      throws ManifoldCFException
-    {
-      ids = null;
-    }
-
-  }
-
   // UI support methods.
   //
   // These support methods come in two varieties.  The first bunch is involved in setting up connection configuration information.  The second bunch

Modified: manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java (original)
+++ manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java Sun Aug 31 18:17:42 2014
@@ -210,27 +210,37 @@ public class JDBCConnector extends org.a
   * It is not a big problem if the connector chooses to create more seeds than are
   * strictly necessary; it is merely a question of overall work required.
   *
-  * The times passed to this method may be interpreted for greatest efficiency.  The time ranges
-  * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
-  * to the "current time", each time the job is run.  For continuous crawling jobs, this method will
+  * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+  * For continuous crawling jobs, this method will
   * be called once, when the job starts, and at various periodic intervals as the job executes.
   *
-  * When a job's specification is changed, the framework automatically resets the seeding start time to 0.  The
-  * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+  * When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
+  * seeding version string may also be set to null on each job run, depending on the connector model returned by
   * getConnectorModel().
   *
   * Note that it is always ok to send MORE documents rather than less to this method.
+  * The connector will be connected before this method can be called.
   *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
-  *@param startTime is the beginning of the time range to consider, inclusive.
-  *@param endTime is the end of the time range to consider, exclusive.
+  *@param seedTime is the end of the time range of documents to consider, exclusive.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@return an updated seeding version string, to be stored with the job.
   */
   @Override
-  public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
-    long startTime, long endTime, int jobMode)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
+    long startTime;
+    if (lastSeedVersion == null)
+      startTime = 0L;
+    else
+    {
+      // Unpack seed time from seed version string
+      startTime = new Long(lastSeedVersion).longValue();
+    }
+
     getSession();
 
     // Set up the query
@@ -239,7 +249,7 @@ public class JDBCConnector extends org.a
     VariableMap vm = new VariableMap();
     addConstant(vm,JDBCConstants.idReturnVariable,JDBCConstants.idReturnColumnName);
     addVariable(vm,JDBCConstants.startTimeVariable,startTime);
-    addVariable(vm,JDBCConstants.endTimeVariable,endTime);
+    addVariable(vm,JDBCConstants.endTimeVariable,seedTime);
 
     // Do the substitution
     ArrayList paramList = new ArrayList();
@@ -300,6 +310,7 @@ public class JDBCConnector extends org.a
     {
       idSet.close();
     }
+    return new Long(seedTime).toString();
   }
 
   /** Get document versions given an array of document identifiers.
@@ -1811,7 +1822,7 @@ public class JDBCConnector extends org.a
     public String versionQuery;
     public String dataQuery;
 
-    public TableSpec(DocumentSpecification ds)
+    public TableSpec(Specification ds)
     {
       int i = 0;
       while (i < ds.getChildCount())

Modified: manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -39,6 +39,7 @@ import org.apache.manifoldcf.crawler.sys
 import org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector;
 import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
 import org.apache.manifoldcf.core.interfaces.ConfigParams;
+import org.apache.manifoldcf.core.interfaces.Specification;
 import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
 import org.apache.commons.lang.StringUtils;
 import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
@@ -812,48 +813,37 @@ public class JiraRepositoryConnector ext
     Messages.outputResourceWithVelocity(out,locale,EDIT_SPEC_HEADER_FORWARD,paramMap);
   }
 
-  /**
-   * Queue "seed" documents. Seed documents are the starting places for
-   * crawling activity. Documents are seeded when this method calls
-   * appropriate methods in the passed in ISeedingActivity object.
-   *
-   * This method can choose to find repository changes that happen only during
-   * the specified time interval. The seeds recorded by this method will be
-   * viewed by the framework based on what the getConnectorModel() method
-   * returns.
-   *
-   * It is not a big problem if the connector chooses to create more seeds
-   * than are strictly necessary; it is merely a question of overall work
-   * required.
-   *
-   * The times passed to this method may be interpreted for greatest
-   * efficiency. The time ranges any given job uses with this connector will
-   * not overlap, but will proceed starting at 0 and going to the "current
-   * time", each time the job is run. For continuous crawling jobs, this
-   * method will be called once, when the job starts, and at various periodic
-   * intervals as the job executes.
-   *
-   * When a job's specification is changed, the framework automatically resets
-   * the seeding start time to 0. The seeding start time may also be set to 0
-   * on each job run, depending on the connector model returned by
-   * getConnectorModel().
-   *
-   * Note that it is always ok to send MORE documents rather than less to this
-   * method.
-   *
-   * @param activities is the interface this method should use to perform
-   * whatever framework actions are desired.
-   * @param spec is a document specification (that comes from the job).
-   * @param startTime is the beginning of the time range to consider,
-   * inclusive.
-   * @param endTime is the end of the time range to consider, exclusive.
-   * @param jobMode is an integer describing how the job is being run, whether
-   * continuous or once-only.
-   */
+  /** Queue "seed" documents.  Seed documents are the starting places for crawling activity.  Documents
+  * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
+  *
+  * This method can choose to find repository changes that happen only during the specified time interval.
+  * The seeds recorded by this method will be viewed by the framework based on what the
+  * getConnectorModel() method returns.
+  *
+  * It is not a big problem if the connector chooses to create more seeds than are
+  * strictly necessary; it is merely a question of overall work required.
+  *
+  * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+  * For continuous crawling jobs, this method will
+  * be called once, when the job starts, and at various periodic intervals as the job executes.
+  *
+  * When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
+  * seeding version string may also be set to null on each job run, depending on the connector model returned by
+  * getConnectorModel().
+  *
+  * Note that it is always ok to send MORE documents rather than less to this method.
+  * The connector will be connected before this method can be called.
+  *@param activities is the interface this method should use to perform whatever framework actions are desired.
+  *@param spec is a document specification (that comes from the job).
+  *@param seedTime is the end of the time range of documents to consider, exclusive.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@return an updated seeding version string, to be stored with the job.
+  */
   @Override
-  public void addSeedDocuments(ISeedingActivity activities,
-      DocumentSpecification spec, long startTime, long endTime, int jobMode)
-      throws ManifoldCFException, ServiceInterruption {
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
+    throws ManifoldCFException, ServiceInterruption {
 
     String jiraDriveQuery = JiraConfig.JIRA_QUERY_DEFAULT;
     int i = 0;
@@ -906,6 +896,7 @@ public class JiraRepositoryConnector ext
     } catch (ResponseException e) {
       handleResponseException(e);
     }
+    return "";
   }
   
 

Modified: manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java (original)
+++ manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java Sun Aug 31 18:17:42 2014
@@ -945,14 +945,34 @@ public class LivelinkConnector extends o
   
   /** Queue "seed" documents.  Seed documents are the starting places for crawling activity.  Documents
   * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
+  *
+  * This method can choose to find repository changes that happen only during the specified time interval.
+  * The seeds recorded by this method will be viewed by the framework based on what the
+  * getConnectorModel() method returns.
+  *
+  * It is not a big problem if the connector chooses to create more seeds than are
+  * strictly necessary; it is merely a question of overall work required.
+  *
+  * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+  * For continuous crawling jobs, this method will
+  * be called once, when the job starts, and at various periodic intervals as the job executes.
+  *
+  * When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
+  * seeding version string may also be set to null on each job run, depending on the connector model returned by
+  * getConnectorModel().
+  *
+  * Note that it is always ok to send MORE documents rather than less to this method.
+  * The connector will be connected before this method can be called.
   *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
-  *@param startTime is the beginning of the time range to consider, inclusive.
-  *@param endTime is the end of the time range to consider, exclusive.
+  *@param seedTime is the end of the time range of documents to consider, exclusive.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@return an updated seeding version string, to be stored with the job.
   */
   @Override
-  public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
-    long startTime, long endTime)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
     getSession();
@@ -1063,7 +1083,7 @@ public class LivelinkConnector extends o
       }
       
     }
-
+    return "";
   }
 
 

Modified: manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java (original)
+++ manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java Sun Aug 31 18:17:42 2014
@@ -67,6 +67,8 @@ public class MeridioConnector extends or
   protected String urlBase = null;
   protected String urlVersionBase = null;
 
+  private final static int maxHitsToReturn      = 100;
+
   /** Deny access token for Meridio */
   private final static String denyToken = GLOBAL_DENY_TOKEN;
 
@@ -563,39 +565,130 @@ public class MeridioConnector extends or
     return true;
   }
 
-  /** Given a document specification, get either a list of starting document identifiers (seeds),
-  * or a list of changes (deltas), depending on whether this is a "crawled" connector or not.
-  * These document identifiers will be loaded into the job's queue at the beginning of the
-  * job's execution.
-  * This method can return changes only (because it is provided a time range).  For full
-  * recrawls, the start time is always zero.
-  * Note that it is always ok to return MORE documents rather than less with this method.
+  /** Queue "seed" documents.  Seed documents are the starting places for crawling activity.  Documents
+  * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
+  *
+  * This method can choose to find repository changes that happen only during the specified time interval.
+  * The seeds recorded by this method will be viewed by the framework based on what the
+  * getConnectorModel() method returns.
+  *
+  * It is not a big problem if the connector chooses to create more seeds than are
+  * strictly necessary; it is merely a question of overall work required.
+  *
+  * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+  * For continuous crawling jobs, this method will
+  * be called once, when the job starts, and at various periodic intervals as the job executes.
+  *
+  * When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
+  * seeding version string may also be set to null on each job run, depending on the connector model returned by
+  * getConnectorModel().
+  *
+  * Note that it is always ok to send MORE documents rather than less to this method.
+  * The connector will be connected before this method can be called.
+  *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
-  *@param startTime is the beginning of the time range to consider, inclusive.
-  *@param endTime is the end of the time range to consider, exclusive.
-  *@return the stream of local document identifiers that should be added to the queue.
+  *@param seedTime is the end of the time range of documents to consider, exclusive.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@return an updated seeding version string, to be stored with the job.
   */
   @Override
-  public IDocumentIdentifierStream getDocumentIdentifiers(DocumentSpecification spec, long startTime, long endTime)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
-    Logging.connectors.debug("Meridio: Entering 'getDocumentIdentifiers' method");
-
-    try
+    Logging.connectors.debug("Meridio: Entering 'addSeedDocuments' method");
+    long startTime;
+    if (lastSeedVersion == null)
+      startTime = 0L;
+    else
     {
-      // Adjust start time so that we don't miss documents that squeeze in with earlier timestamps after we've already scanned that interval.
-      // Chose an interval of 15 minutes, but I've never seen this effect take place over a time interval even 1/10 of that.
-      long timeAdjust = 15L * 60000L;
-      if (startTime > timeAdjust)
-        startTime -= timeAdjust;
-      else
-        startTime = 0L;
-      return new IdentifierStream(spec, startTime, endTime);
+      // Unpack seed time from seed version string
+      startTime = new Long(lastSeedVersion).longValue();
     }
-    finally
+    // Adjust start time so that we don't miss documents that squeeze in with earlier timestamps after we've already scanned that interval.
+    // Chose an interval of 15 minutes, but I've never seen this effect take place over a time interval even 1/10 of that.
+    long timeAdjust = 15L * 60000L;
+    if (startTime > timeAdjust)
+      startTime -= timeAdjust;
+    else
+      startTime = 0L;
+
+    while (true)
     {
-      Logging.connectors.debug("Meridio: Exiting 'getDocumentIdentifiers' method");
+      getSession();
+
+      try
+      {
+        DMSearchResults searchResults;
+        int numResultsReturnedByStream = 0;
+
+        while (true)
+        {
+          searchResults = documentSpecificationSearch(spec,
+            startTime, seedTime, numResultsReturnedByStream + 1, maxHitsToReturn);
+
+          for (int i = 0; i < searchResults.returnedHitsCount; i++)
+          {
+            long documentId =
+              searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS()[i].getDocId();
+
+            String strDocumentId = new Long(documentId).toString();
+            activities.addSeedDocument(strDocumentId);
+          }
+          
+          numResultsReturnedByStream += searchResults.returnedHitsCount;
+          if (numResultsReturnedByStream == searchResults.totalHitsCount)
+            break;
+        }
+        return new Long(seedTime).toString();
+      }
+      catch (org.apache.axis.AxisFault e)
+      {
+        long currentTime = System.currentTimeMillis();
+        if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HTTP")))
+        {
+          org.w3c.dom.Element elem = e.lookupFaultDetail(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HttpErrorCode"));
+          if (elem != null)
+          {
+            elem.normalize();
+            String httpErrorCode = elem.getFirstChild().getNodeValue().trim();
+            throw new ManifoldCFException("Unexpected http error code "+httpErrorCode+" accessing Meridio: "+e.getMessage(),e);
+          }
+          throw new ManifoldCFException("Unknown http error occurred while performing search: "+e.getMessage(),e);
+        }
+        if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server.userException")))
+        {
+          String exceptionName = e.getFaultString();
+          if (exceptionName.equals("java.lang.InterruptedException"))
+            throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
+        }
+        if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server")))
+        {
+          if (e.getFaultString().indexOf(" 23031#") != -1)
+          {
+            // This means that the session has expired, so reset it and retry
+            meridio_ = null;
+            continue;
+          }
+        }
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("Meridio: Got an unknown remote exception while performing search - axis fault = "+e.getFaultCode().getLocalPart()+", detail = "+e.getFaultString()+" - retrying",e);
+        throw new ServiceInterruption("Remote procedure exception: "+e.getMessage(),  e, currentTime + 300000L,
+          currentTime + 3 * 60 * 60000L,-1,false);
+      }
+      catch (RemoteException remoteException)
+      {
+        throw new ManifoldCFException("Meridio: A Remote Exception occurred while " +
+          "performing a search: "+remoteException.getMessage(), remoteException);
+      }
+      catch (MeridioDataSetException meridioDataSetException)
+      {
+        throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " +
+          "Service XML: "+meridioDataSetException.getMessage(), meridioDataSetException);
+      }
     }
+
   }
 
 
@@ -3723,7 +3816,7 @@ public class MeridioConnector extends or
 
   private static String [] getMIMETypes
   (
-    DocumentSpecification spec
+    Specification spec
   )
   {
     ArrayList al = new ArrayList ();
@@ -3760,7 +3853,7 @@ public class MeridioConnector extends or
   */
   private DMSearchResults documentSpecificationSearch
   (
-    DocumentSpecification docSpec,      // The castor representation of the Document Specification
+    Specification docSpec,      // The castor representation of the Document Specification
     long startTime,
     long endTime,
     int startPositionOfHits,
@@ -3794,7 +3887,7 @@ public class MeridioConnector extends or
   */
   private DMSearchResults documentSpecificationSearch
   (
-    DocumentSpecification docSpec,      // The castor representation of the Document Specification
+    Specification docSpec,      // The castor representation of the Document Specification
     long startTime,
     long endTime,
     int startPositionOfHits,
@@ -3838,7 +3931,7 @@ public class MeridioConnector extends or
   */
   protected DMSearchResults documentSpecificationSearch
   (
-    DocumentSpecification docSpec,
+    Specification docSpec,
     long startTime,
     long endTime,
     int startPositionOfHits,
@@ -4342,208 +4435,6 @@ public class MeridioConnector extends or
   }
 
 
-  private final static int maxHitsToReturn      = 100;
-
-  /** Document identifier stream.
-  */
-  protected class IdentifierStream implements IDocumentIdentifierStream
-  {
-    protected DMSearchResults searchResults  = null;
-    protected int currentResult              = 0;
-    protected int numResultsReturnedByStream = 0;
-
-    DocumentSpecification spec_              = null;
-    long startTime_                          = 0L;
-    long endTime_                            = 0L;
-
-
-    public IdentifierStream
-    (
-      DocumentSpecification spec,
-      long startTime,
-      long endTime
-    )
-      throws ManifoldCFException,ServiceInterruption
-    {
-      Logging.connectors.debug("Meridio: Entering 'IdentifierStream' constructor");
-      while (true)
-      {
-        getSession();
-
-        try
-        {
-          spec_             = spec;
-          startTime_        = startTime;
-          endTime_          = endTime;
-
-          searchResults = documentSpecificationSearch(spec,
-            startTime, endTime,     1, maxHitsToReturn);
-
-          Logging.connectors.debug("Meridio: Exiting 'IdentifierStream' constructor");
-
-          return;
-        }
-        catch (org.apache.axis.AxisFault e)
-        {
-          long currentTime = System.currentTimeMillis();
-          if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HTTP")))
-          {
-            org.w3c.dom.Element elem = e.lookupFaultDetail(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HttpErrorCode"));
-            if (elem != null)
-            {
-              elem.normalize();
-              String httpErrorCode = elem.getFirstChild().getNodeValue().trim();
-              throw new ManifoldCFException("Unexpected http error code "+httpErrorCode+" accessing Meridio: "+e.getMessage(),e);
-            }
-            throw new ManifoldCFException("Unknown http error occurred while performing search: "+e.getMessage(),e);
-          }
-          if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server.userException")))
-          {
-            String exceptionName = e.getFaultString();
-            if (exceptionName.equals("java.lang.InterruptedException"))
-              throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
-          }
-          if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server")))
-          {
-            if (e.getFaultString().indexOf(" 23031#") != -1)
-            {
-              // This means that the session has expired, so reset it and retry
-              meridio_ = null;
-              continue;
-            }
-          }
-          if (Logging.connectors.isDebugEnabled())
-            Logging.connectors.debug("Meridio: Got an unknown remote exception while performing search - axis fault = "+e.getFaultCode().getLocalPart()+", detail = "+e.getFaultString()+" - retrying",e);
-          throw new ServiceInterruption("Remote procedure exception: "+e.getMessage(),  e, currentTime + 300000L,
-            currentTime + 3 * 60 * 60000L,-1,false);
-        }
-        catch (RemoteException remoteException)
-        {
-          throw new ManifoldCFException("Meridio: A Remote Exception occurred while " +
-            "performing a search: "+remoteException.getMessage(), remoteException);
-        }
-        catch (MeridioDataSetException meridioDataSetException)
-        {
-          throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " +
-            "Service XML: "+meridioDataSetException.getMessage(), meridioDataSetException);
-        }
-      }
-    }
-
-
-
-    /** Get the next identifier.
-    *@return the next document identifier, or null if there are no more.
-    */
-    public String getNextIdentifier()
-      throws ManifoldCFException, ServiceInterruption
-    {
-      Logging.connectors.debug("Meridio: Entering 'getNextIdentifier' method");
-
-      try
-      {
-        if (null                       == searchResults ||
-          numResultsReturnedByStream == searchResults.totalHitsCount)
-        {
-          return null;
-        }
-
-        if (currentResult == searchResults.returnedHitsCount)
-        {
-          while (true)
-          {
-            getSession();
-            try
-            {
-              searchResults = documentSpecificationSearch(spec_,
-                startTime_, endTime_, numResultsReturnedByStream + 1,
-                maxHitsToReturn);
-
-              currentResult = 0;
-              break;
-            }
-            catch (org.apache.axis.AxisFault e)
-            {
-              long currentTime = System.currentTimeMillis();
-              if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HTTP")))
-              {
-                org.w3c.dom.Element elem = e.lookupFaultDetail(new javax.xml.namespace.QName("http://xml.apache.org/axis/","HttpErrorCode"));
-                if (elem != null)
-                {
-                  elem.normalize();
-                  String httpErrorCode = elem.getFirstChild().getNodeValue().trim();
-                  throw new ManifoldCFException("Unexpected http error code "+httpErrorCode+" performing search: "+e.getMessage());
-                }
-                throw new ManifoldCFException("Unknown http error occurred while performing search: "+e.getMessage(),e);
-              }
-              if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server.userException")))
-              {
-                String exceptionName = e.getFaultString();
-                if (exceptionName.equals("java.lang.InterruptedException"))
-                  throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
-              }
-              if (e.getFaultCode().equals(new javax.xml.namespace.QName("http://schemas.xmlsoap.org/soap/envelope/","Server")))
-              {
-                if (e.getFaultString().indexOf(" 23031#") != -1)
-                {
-                  // This means that the session has expired, so reset it and retry
-                  meridio_ = null;
-                  continue;
-                }
-              }
-
-              throw new ManifoldCFException("Meridio: Got an unknown remote exception performing search - axis fault = "+e.getFaultCode().getLocalPart()+", detail = "+e.getFaultString(),e);
-            }
-            catch (RemoteException remoteException)
-            {
-              throw new ServiceInterruption("Meridio: A Remote Exception occurred while " +
-                "performing a Meridio search: "+remoteException.getMessage(), remoteException,
-                System.currentTimeMillis() + interruptionRetryTime,
-                -1L, -1, true);
-            }
-            catch (MeridioDataSetException meridioDataSetException)
-            {
-              throw new ManifoldCFException("Meridio: A problem occurred manipulating the Web " +
-                "Service XML: "+meridioDataSetException.getMessage(), meridioDataSetException);
-            }
-          }
-        }
-
-        long documentId =
-          searchResults.dsDM.getSEARCHRESULTS_DOCUMENTS()[currentResult].getDocId();
-
-        String strDocumentId = new Long(documentId).toString();
-
-        currentResult++;
-        numResultsReturnedByStream++;
-
-        return strDocumentId;
-      }
-      finally
-      {
-        Logging.connectors.debug("Meridio: Exiting 'getNextIdentifier' method");
-      }
-    }
-
-
-
-    /** Close the stream.
-    */
-    public void close()
-      throws ManifoldCFException
-    {
-      Logging.connectors.debug("Meridio: Entering 'IdentifierStream.close' method");
-
-      searchResults              = null;
-      currentResult              = 0;
-      numResultsReturnedByStream = 0;
-
-      Logging.connectors.debug("Meridio: Exiting 'IdentifierStream.close' method");
-    }
-  }
-
-
-
   /** Returns the categories set up in the Meridio system; these are used by the UI for two
   * purposes
   *

Modified: manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java (original)
+++ manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerConnector.java Sun Aug 31 18:17:42 2014
@@ -98,15 +98,8 @@ public class OpenSearchServerConnector e
   private HttpClient client = null;
   private long expirationTime = -1L;
 
-  // Private data
-
-  private String specsCacheOutpuDescription;
-  private OpenSearchServerSpecs specsCache;
-
   public OpenSearchServerConnector()
   {
-    specsCacheOutpuDescription = null;
-    specsCache = null;
   }
 
   @Override
@@ -377,27 +370,6 @@ public class OpenSearchServerConnector e
     return new OpenSearchServerSpecs(getSpecNode(os));
   }
 
-  final private OpenSearchServerSpecs getSpecsCache(String outputDescription)
-      throws ManifoldCFException
-  {
-    try
-    {
-      synchronized (this)
-      {
-        if (!outputDescription.equals(specsCacheOutpuDescription))
-          specsCache = null;
-        if (specsCache == null)
-          specsCache = new OpenSearchServerSpecs(new JSONObject(
-              outputDescription));
-        return specsCache;
-      }
-    }
-    catch (JSONException e)
-    {
-      throw new ManifoldCFException(e);
-    }
-  }
-
   @Override
   public VersionContext getPipelineDescription(Specification os)
       throws ManifoldCFException
@@ -410,7 +382,7 @@ public class OpenSearchServerConnector e
   public boolean checkLengthIndexable(VersionContext outputDescription, long length, IOutputCheckActivity activities)
       throws ManifoldCFException, ServiceInterruption
   {
-    OpenSearchServerSpecs specs = getSpecsCache(outputDescription.getVersionString());
+    OpenSearchServerSpecs specs = new OpenSearchServerSpecs(getSpecNode(outputDescription.getSpecification()));
     long maxFileSize = specs.getMaxFileSize();
     if (length > maxFileSize)
       return false;
@@ -421,7 +393,7 @@ public class OpenSearchServerConnector e
   public boolean checkMimeTypeIndexable(VersionContext outputDescription, String mimeType, IOutputCheckActivity activities)
       throws ManifoldCFException, ServiceInterruption
   {
-    OpenSearchServerSpecs specs = getSpecsCache(outputDescription.getVersionString());
+    OpenSearchServerSpecs specs = new OpenSearchServerSpecs(getSpecNode(outputDescription.getSpecification()));
     return specs.checkMimeType(mimeType);
   }
 
@@ -440,7 +412,7 @@ public class OpenSearchServerConnector e
   public boolean checkURLIndexable(VersionContext outputDescription, String url, IOutputCheckActivity activities)
       throws ManifoldCFException, ServiceInterruption
   {
-    OpenSearchServerSpecs specs = getSpecsCache(outputDescription.getVersionString());
+    OpenSearchServerSpecs specs = new OpenSearchServerSpecs(getSpecNode(outputDescription.getSpecification()));
     return specs.checkExtension(FilenameUtils.getExtension(url));
   }
 
@@ -482,12 +454,12 @@ public class OpenSearchServerConnector e
     return null;
   }
 
-  private static Map<String, Integer> ossInstances = null;
+  // Apparently, only one connection to any given Open Search Server instance is allowed at a time.
+  
+  private static Map<String, Integer> ossInstances = new TreeMap<String, Integer>();
 
-  private synchronized final Integer addInstance(OpenSearchServerConfig config)
+  private final Integer addInstance(OpenSearchServerConfig config)
   {
-    if (ossInstances == null)
-      ossInstances = new TreeMap<String, Integer>();
     synchronized (ossInstances)
     {
       String uii = config.getUniqueIndexIdentifier();
@@ -503,10 +475,8 @@ public class OpenSearchServerConnector e
     }
   }
 
-  private synchronized final void removeInstance(OpenSearchServerConfig config)
+  private final void removeInstance(OpenSearchServerConfig config)
   {
-    if (ossInstances == null)
-      return;
     synchronized (ossInstances)
     {
       String uii = config.getUniqueIndexIdentifier();

Modified: manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerSpecs.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerSpecs.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerSpecs.java (original)
+++ manifoldcf/trunk/connectors/opensearchserver/connector/src/main/java/org/apache/manifoldcf/agents/output/opensearchserver/OpenSearchServerSpecs.java Sun Aug 31 18:17:42 2014
@@ -48,29 +48,6 @@ public class OpenSearchServerSpecs exten
   private Set<String> mimeTypeSet;
 
   /**
-   * Build a set of OpenSearchServer parameters by reading an JSON object
-   * 
-   * @param json
-   * @throws JSONException
-   * @throws ManifoldCFException
-   */
-  public OpenSearchServerSpecs(JSONObject json) throws JSONException,
-      ManifoldCFException {
-    super(SPECIFICATIONLIST);
-    extensionSet = null;
-    mimeTypeSet = null;
-    for (ParameterEnum param : SPECIFICATIONLIST) {
-      String value = null;
-      value = json.getString(param.name());
-      if (value == null)
-        value = param.defaultValue;
-      put(param, value);
-    }
-    extensionSet = createStringSet(getExtensions());
-    mimeTypeSet = createStringSet(getMimeTypes());
-  }
-
-  /**
    * Build a set of OpenSearchServer parameters by reading an instance of
    * SpecificationNode.
    * 

Modified: manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Sun Aug 31 18:17:42 2014
@@ -369,24 +369,26 @@ public class RSSConnector extends org.ap
   * It is not a big problem if the connector chooses to create more seeds than are
   * strictly necessary; it is merely a question of overall work required.
   *
-  * The times passed to this method may be interpreted for greatest efficiency.  The time ranges
-  * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
-  * to the "current time", each time the job is run.  For continuous crawling jobs, this method will
+  * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+  * For continuous crawling jobs, this method will
   * be called once, when the job starts, and at various periodic intervals as the job executes.
   *
-  * When a job's specification is changed, the framework automatically resets the seeding start time to 0.  The
-  * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+  * When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
+  * seeding version string may also be set to null on each job run, depending on the connector model returned by
   * getConnectorModel().
   *
   * Note that it is always ok to send MORE documents rather than less to this method.
+  * The connector will be connected before this method can be called.
   *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
-  *@param startTime is the beginning of the time range to consider, inclusive.
-  *@param endTime is the end of the time range to consider, exclusive.
+  *@param seedTime is the end of the time range of documents to consider, exclusive.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@return an updated seeding version string, to be stored with the job.
   */
   @Override
-  public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
-    long startTime, long endTime)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
     getSession();
@@ -400,6 +402,7 @@ public class RSSConnector extends org.ap
       String canonicalURL = iter.next();
       activities.addSeedDocument(canonicalURL);
     }
+    return "";
   }
 
   /** Convert an absolute or relative URL to a document identifier.  This may involve several steps at some point,
@@ -5723,7 +5726,7 @@ public class RSSConnector extends org.ap
     protected final List<Pattern> excludePatterns = new ArrayList<Pattern>();
 
     /** Constructor. */
-    public Filter(DocumentSpecification spec, boolean warnOnBadSeed)
+    public Filter(Specification spec, boolean warnOnBadSeed)
       throws ManifoldCFException
     {
       String excludes = "";

Modified: manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java (original)
+++ manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java Sun Aug 31 18:17:42 2014
@@ -650,31 +650,33 @@ public class SharePointRepository extend
   * It is not a big problem if the connector chooses to create more seeds than are
   * strictly necessary; it is merely a question of overall work required.
   *
-  * The times passed to this method may be interpreted for greatest efficiency.  The time ranges
-  * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
-  * to the "current time", each time the job is run.  For continuous crawling jobs, this method will
+  * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+  * For continuous crawling jobs, this method will
   * be called once, when the job starts, and at various periodic intervals as the job executes.
   *
-  * When a job's specification is changed, the framework automatically resets the seeding start time to 0.  The
-  * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+  * When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
+  * seeding version string may also be set to null on each job run, depending on the connector model returned by
   * getConnectorModel().
   *
   * Note that it is always ok to send MORE documents rather than less to this method.
+  * The connector will be connected before this method can be called.
   *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
-  *@param startTime is the beginning of the time range to consider, inclusive.
-  *@param endTime is the end of the time range to consider, exclusive.
+  *@param seedTime is the end of the time range of documents to consider, exclusive.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@return an updated seeding version string, to be stored with the job.
   */
   @Override
-  public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
-    long startTime, long endTime, int jobMode)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
     // Check the session
     getSession();
     // Add just the root.
     activities.addSeedDocument("/");
+    return "";
   }
 
 

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java Sun Aug 31 18:17:42 2014
@@ -565,7 +565,7 @@ public class SolrConnector extends org.a
   public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
     throws ManifoldCFException, ServiceInterruption, IOException
   {
-    SpecPacker sp = new SpecPacker(pipelineDescription.getVersionString());
+    SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
 
     // Establish a session
     getSession();
@@ -2284,35 +2284,6 @@ public class SolrConnector extends org.a
     
     }
     
-    /** Packed string parser.
-    * This method unpacks a packed version string, and makes the formerly packed data available for use.
-    * Note that it is actually *not* a requirement for this method to do the unpacking; that can happen "on demand"
-    * for performance, if deemed helpful.
-    */
-    public SpecPacker(String packedString) {
-      // Build the argument map we'll send.
-      int index = 0;
-      List<String> nameValues = new ArrayList<String>();
-      index = unpackList(nameValues,packedString,index,'+');
-      
-      String[] fixedBuffer = new String[2];
-      
-      // Do the name/value pairs
-      for (String x : nameValues)
-      {
-        unpackFixedList(fixedBuffer,x,0,'=');
-        String attrName = fixedBuffer[0];
-        List<String> list = args.get(attrName);
-        if (list == null)
-        {
-          list = new ArrayList<String>();
-          args.put(attrName,list);
-        }
-        list.add(fixedBuffer[1]);
-      }
-      
-    }
-    
     public String toPackedString() {
       StringBuilder sb = new StringBuilder();
       String[] sortArray = new String[args.size()];

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Sun Aug 31 18:17:42 2014
@@ -493,24 +493,26 @@ public class WebcrawlerConnector extends
   * It is not a big problem if the connector chooses to create more seeds than are
   * strictly necessary; it is merely a question of overall work required.
   *
-  * The times passed to this method may be interpreted for greatest efficiency.  The time ranges
-  * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
-  * to the "current time", each time the job is run.  For continuous crawling jobs, this method will
+  * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+  * For continuous crawling jobs, this method will
   * be called once, when the job starts, and at various periodic intervals as the job executes.
   *
-  * When a job's specification is changed, the framework automatically resets the seeding start time to 0.  The
-  * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+  * When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
+  * seeding version string may also be set to null on each job run, depending on the connector model returned by
   * getConnectorModel().
   *
   * Note that it is always ok to send MORE documents rather than less to this method.
+  * The connector will be connected before this method can be called.
   *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
-  *@param startTime is the beginning of the time range to consider, inclusive.
-  *@param endTime is the end of the time range to consider, exclusive.
+  *@param seedTime is the end of the time range of documents to consider, exclusive.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@return an updated seeding version string, to be stored with the job.
   */
   @Override
-  public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
-    long startTime, long endTime)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
     getSession();
@@ -544,6 +546,7 @@ public class WebcrawlerConnector extends
       }
       activities.addSeedDocument(documentIdentifier,calculateDocumentEvents(activities,documentIdentifier));
     }
+    return "";
   }
 
   // Session login states (so we can use the same fetch logic multiple times)
@@ -7206,7 +7209,7 @@ public class WebcrawlerConnector extends
     * This should be checked at save time to prevent errors.  Any syntax errors found here
     * will thus cause the include or exclude regexp to be skipped.
     */
-    public DocumentURLFilter(DocumentSpecification spec)
+    public DocumentURLFilter(Specification spec)
       throws ManifoldCFException
     {
       String includes = ".*";

Modified: manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Sun Aug 31 18:17:42 2014
@@ -849,26 +849,37 @@ public class WikiConnector extends org.a
   * It is not a big problem if the connector chooses to create more seeds than are
   * strictly necessary; it is merely a question of overall work required.
   *
-  * The times passed to this method may be interpreted for greatest efficiency.  The time ranges
-  * any given job uses with this connector will not overlap, but will proceed starting at 0 and going
-  * to the "current time", each time the job is run.  For continuous crawling jobs, this method will
+  * The end time and seeding version string passed to this method may be interpreted for greatest efficiency.
+  * For continuous crawling jobs, this method will
   * be called once, when the job starts, and at various periodic intervals as the job executes.
   *
-  * When a job's specification is changed, the framework automatically resets the seeding start time to 0.  The
-  * seeding start time may also be set to 0 on each job run, depending on the connector model returned by
+  * When a job's specification is changed, the framework automatically resets the seeding version string to null.  The
+  * seeding version string may also be set to null on each job run, depending on the connector model returned by
   * getConnectorModel().
   *
   * Note that it is always ok to send MORE documents rather than less to this method.
+  * The connector will be connected before this method can be called.
   *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
-  *@param startTime is the beginning of the time range to consider, inclusive.
-  *@param endTime is the end of the time range to consider, exclusive.
+  *@param seedTime is the end of the time range of documents to consider, exclusive.
+  *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@return an updated seeding version string, to be stored with the job.
   */
   @Override
-  public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
-    long startTime, long endTime)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
+    long startTime;
+    if (lastSeedVersion == null)
+      startTime = 0L;
+    else
+    {
+      // Unpack seed time from seed version string
+      startTime = new Long(lastSeedVersion).longValue();
+    }
+
     // Scan specification nodes and extract prefixes and namespaces
     boolean seenAny = false;
     for (int i = 0 ; i < spec.getChildCount() ; i++)
@@ -878,12 +889,14 @@ public class WikiConnector extends org.a
       {
         String namespace = sn.getAttributeValue(WikiConfig.ATTR_NAMESPACE);
         String titleprefix = sn.getAttributeValue(WikiConfig.ATTR_TITLEPREFIX);
-        listAllPages(activities,namespace,titleprefix,startTime,endTime);
+        listAllPages(activities,namespace,titleprefix,startTime,seedTime);
         seenAny = true;
       }
     }
     if (!seenAny)
-      listAllPages(activities,null,null,startTime,endTime);
+      listAllPages(activities,null,null,startTime,seedTime);
+    
+    return new Long(seedTime).toString();
   }
 
   /** Get document versions given an array of document identifiers.

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -154,6 +154,8 @@ public abstract class BaseRepositoryConn
     String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
+    return "";
+    /*
     long startTime;
     if (lastSeedVersion == null)
       startTime = 0L;
@@ -164,6 +166,7 @@ public abstract class BaseRepositoryConn
     }
     addSeedDocuments(activities,spec,startTime,seedTime,jobMode);
     return new Long(seedTime).toString();
+    */
   }
 
   /** Queue "seed" documents.  Seed documents are the starting places for crawling activity.  Documents
@@ -192,6 +195,7 @@ public abstract class BaseRepositoryConn
   *@param endTime is the end of the time range to consider, exclusive.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
   */
+  /*
   public void addSeedDocuments(ISeedingActivity activities, Specification spec,
     long startTime, long endTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
@@ -205,7 +209,8 @@ public abstract class BaseRepositoryConn
   {
     addSeedDocuments(activities,spec,startTime,endTime);
   }
-
+  */
+  
   /** Queue "seed" documents.  Seed documents are the starting places for crawling activity.  Documents
   * are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
   *
@@ -231,6 +236,7 @@ public abstract class BaseRepositoryConn
   *@param startTime is the beginning of the time range to consider, inclusive.
   *@param endTime is the end of the time range to consider, exclusive.
   */
+  /*
   public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
     long startTime, long endTime)
     throws ManifoldCFException, ServiceInterruption
@@ -272,7 +278,8 @@ public abstract class BaseRepositoryConn
       }
     }
   }
-
+  */
+  
   /** The long version of getDocumentIdentifiers.
   *@param activities is the interface this method should use to perform whatever framework actions are desired.
   *@param spec is a document specification (that comes from the job).
@@ -280,19 +287,22 @@ public abstract class BaseRepositoryConn
   *@param endTime is the end of the time range to consider, exclusive.
   *@return the local document identifiers that should be added to the queue, as a stream.
   */
+  /*
   public IDocumentIdentifierStream getDocumentIdentifiers(ISeedingActivity activities, DocumentSpecification spec,
     long startTime, long endTime)
     throws ManifoldCFException, ServiceInterruption
   {
     return getDocumentIdentifiers(spec,startTime,endTime);
   }
-
+  */
+  
   /** The short version of getDocumentIdentifiers.
   *@param spec is a document specification (that comes from the job).
   *@param startTime is the beginning of the time range to consider, inclusive.
   *@param endTime is the end of the time range to consider, exclusive.
   *@return the local document identifiers that should be added to the queue, as a stream.
   */
+  /*
   public IDocumentIdentifierStream getDocumentIdentifiers(DocumentSpecification spec,
     long startTime, long endTime)
     throws ManifoldCFException, ServiceInterruption
@@ -300,7 +310,8 @@ public abstract class BaseRepositoryConn
     // Something provided here so we can override either one.
     return null;
   }
-
+  */
+  
   /** This method returns the document identifiers that should be considered part of the seeds, but do not need to be
   * queued for processing at this time.  This method is used to keep the hopcount tables up to date.  It is
   * allowed to return more identifiers than it strictly needs to, specifically identifiers that were also returned
@@ -316,6 +327,7 @@ public abstract class BaseRepositoryConn
   *@return the local document identifiers that should be added to the queue, as a stream, or null, if none need to be
   * returned.
   */
+  /*
   public IDocumentIdentifierStream getRemainingDocumentIdentifiers(ISeedingActivity activities, DocumentSpecification spec,
     long startTime, long endTime)
     throws ManifoldCFException, ServiceInterruption
@@ -323,7 +335,8 @@ public abstract class BaseRepositoryConn
     // Usually we don't need to worry about this.
     return null;
   }
-
+  */
+  
   /** Process a set of documents.
   * This is the method that should cause each document to be fetched, processed, and the results either added
   * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.

Modified: manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -34,8 +34,8 @@ public class InterruptionRepositoryConne
   }
 
   @Override
-  public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
-    long startTime, long endTime, int jobMode)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
     String docCount = "10";
@@ -52,6 +52,7 @@ public class InterruptionRepositoryConne
       String doc = "test"+i+".txt";
       activities.addSeedDocument(doc,null);
     }
+    return "";
   }
   
   @Override

Modified: manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -44,8 +44,8 @@ public class SchedulingRepositoryConnect
   }
 
   @Override
-  public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
-    long startTime, long endTime, int jobMode)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
     // A seed per domain
@@ -58,6 +58,7 @@ public class SchedulingRepositoryConnect
       activities.addSeedDocument(Integer.toString(i)+"/",null);
     }
     System.out.println("Seeding completed at "+System.currentTimeMillis());
+    return "";
   }
   
   @Override

Modified: manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java?rev=1621613&r1=1621612&r2=1621613&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java Sun Aug 31 18:17:42 2014
@@ -34,8 +34,8 @@ public class TestingRepositoryConnector 
   }
 
   @Override
-  public void addSeedDocuments(ISeedingActivity activities, DocumentSpecification spec,
-    long startTime, long endTime, int jobMode)
+  public String addSeedDocuments(ISeedingActivity activities, Specification spec,
+    String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption
   {
     String docCount = "3";
@@ -52,6 +52,7 @@ public class TestingRepositoryConnector 
       String doc = "test"+i+".txt";
       activities.addSeedDocument(doc,null);
     }
+    return "";
   }
   
   @Override