You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/10/10 23:03:47 UTC
svn commit: r1181236 - in
/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src:
main/java/org/apache/manifoldcf/crawler/connectors/wiki/ test/java/org/
test/java/org/apache/ test/java/org/apache/manifoldcf/
test/java/org/apache/manifold...
Author: kwright
Date: Mon Oct 10 21:03:46 2011
New Revision: 1181236
URL: http://svn.apache.org/viewvc?rev=1181236&view=rev
Log:
Complete the connector functionality
Added:
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/wiki/
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/wiki/tests/
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/wiki/tests/Base.java (with props)
Removed:
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/ByteBuffer.java
Modified:
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/BaseProcessingContext.java
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/SingleLevelContext.java
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Modified: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/BaseProcessingContext.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/BaseProcessingContext.java?rev=1181236&r1=1181235&r2=1181236&view=diff
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/BaseProcessingContext.java (original)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/BaseProcessingContext.java Mon Oct 10 21:03:46 2011
@@ -55,6 +55,8 @@ public abstract class BaseProcessingCont
}
/** Process this data */
- protected abstract void process()
- throws ManifoldCFException;
+ protected void process()
+ throws ManifoldCFException
+ {
+ }
}
Modified: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/SingleLevelContext.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/SingleLevelContext.java?rev=1181236&r1=1181235&r2=1181236&view=diff
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/SingleLevelContext.java (original)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/SingleLevelContext.java Mon Oct 10 21:03:46 2011
@@ -71,6 +71,7 @@ public abstract class SingleLevelContext
super.endTag();
}
- protected abstract void finishChild(BaseProcessingContext child);
+ protected abstract void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException;
}
Modified: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1181236&r1=1181235&r2=1181236&view=diff
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Mon Oct 10 21:03:46 2011
@@ -136,9 +136,7 @@ public class WikiConnector extends org.a
{
// Destroy saved session setup and repeat it
hasBeenSetup = false;
- getSession();
-
- executeCheckViaThread();
+ performCheck();
return super.check();
}
catch (ServiceInterruption e)
@@ -183,6 +181,14 @@ public class WikiConnector extends org.a
super.disconnect();
}
+ /** Get the maximum number of documents to amalgamate together into one batch, for this connector.
+ *@return the maximum number. 0 indicates "unlimited".
+ */
+ public int getMaxDocumentRequest()
+ {
+ return 20;
+ }
+
/** Queue "seed" documents. Seed documents are the starting places for crawling activity. Documents
* are seeded when this method calls appropriate methods in the passed in ISeedingActivity object.
*
@@ -213,7 +219,7 @@ public class WikiConnector extends org.a
long startTime, long endTime)
throws ManifoldCFException, ServiceInterruption
{
- // MHL
+ listAllPages(activities,startTime,endTime);
}
/** Get document versions given an array of document identifiers.
@@ -238,8 +244,14 @@ public class WikiConnector extends org.a
DocumentSpecification spec, int jobMode, boolean usesDefaultAuthority)
throws ManifoldCFException, ServiceInterruption
{
- // MHL
- return null;
+ Map<String,String> versions = new HashMap<String,String>();
+ getTimestamps(documentIdentifiers,versions,activities);
+ String[] rval = new String[documentIdentifiers.length];
+ for (int i = 0 ; i < rval.length ; i++)
+ {
+ rval[i] = versions.get(documentIdentifiers[i]);
+ }
+ return rval;
}
/** Process a set of documents.
@@ -260,7 +272,11 @@ public class WikiConnector extends org.a
DocumentSpecification spec, boolean[] scanOnly, int jobMode)
throws ManifoldCFException, ServiceInterruption
{
- // MHL
+ for (int i = 0 ; i < documentIdentifiers.length ; i++)
+ {
+ if (!scanOnly[i])
+ getDocInfo(documentIdentifiers[i], versions[i], activities);
+ }
}
// UI support methods.
@@ -577,10 +593,14 @@ public class WikiConnector extends org.a
return method;
}
- /** Execute a check() operation via a thread */
- protected void executeCheckViaThread()
+ // -- Methods and classes to perform a "check" operation. --
+
+ /** Do the check operation. This throws an exception if anything is wrong.
+ */
+ protected void performCheck()
throws ManifoldCFException, ServiceInterruption
{
+ getSession();
HttpClient client = getInitializedClient();
HttpMethodBase executeMethod = getInitializedMethod(getCheckURL());
try
@@ -637,6 +657,13 @@ public class WikiConnector extends org.a
executeMethod = null;
throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
}
+ catch (ManifoldCFException e)
+ {
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+ // Drop the connection on the floor
+ executeMethod = null;
+ throw e;
+ }
catch (java.net.SocketTimeoutException e)
{
long currentTime = System.currentTimeMillis();
@@ -667,14 +694,21 @@ public class WikiConnector extends org.a
executeMethod.releaseConnection();
}
}
+
+ /** Get a URL for a check operation.
+ */
+ protected String getCheckURL()
+ throws ManifoldCFException
+ {
+ return baseURL + "action=query&list=allpages&aplimit=1";
+ }
- /** Thread to execute a check */
+ /** Thread to execute a "check" operation. This thread both executes the operation and parses the result. */
protected static class ExecuteCheckThread extends Thread
{
protected HttpClient client;
protected HttpMethodBase executeMethod;
protected Throwable exception = null;
- protected int rval = 0;
public ExecuteCheckThread(HttpClient client, HttpMethodBase executeMethod)
{
@@ -778,13 +812,9 @@ public class WikiConnector extends org.a
}
protected void finishChild(BaseProcessingContext child)
- {
- responseSeen |= ((WikiCheckQueryContext)child).hasResponse();
- }
-
- protected void process()
throws ManifoldCFException
{
+ responseSeen |= ((WikiCheckQueryContext)child).hasResponse();
}
public boolean hasResponse()
@@ -810,15 +840,11 @@ public class WikiConnector extends org.a
}
protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
{
responseSeen |= ((WikiCheckAllPagesContext)child).hasResponse();
}
- protected void process()
- throws ManifoldCFException
- {
- }
-
public boolean hasResponse()
{
return responseSeen;
@@ -842,15 +868,11 @@ public class WikiConnector extends org.a
}
protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
{
responseSeen |= true;
}
- protected void process()
- throws ManifoldCFException
- {
- }
-
public boolean hasResponse()
{
return responseSeen;
@@ -865,101 +887,29 @@ public class WikiConnector extends org.a
{
super(theStream,namespaceURI,localName,qName,atts);
}
-
- protected void process()
- throws ManifoldCFException
- {
- }
-
- }
-
- /** Execute an HttpClient method via thread, so we don't get stuck in socket wait */
- protected static int executeMethodViaThread(HttpClient client, HttpMethodBase executeMethod)
- throws InterruptedException, IOException
- {
- ExecuteMethodThread t = new ExecuteMethodThread(client,executeMethod);
- try
- {
- t.start();
- t.join();
- Throwable thr = t.getException();
- if (thr != null)
- {
- if (thr instanceof IOException)
- throw (IOException)thr;
- else if (thr instanceof RuntimeException)
- throw (RuntimeException)thr;
- else
- throw (Error)thr;
- }
- return t.getResponse();
- }
- catch (InterruptedException e)
- {
- t.interrupt();
- // We need the caller to abandon any connections left around, so rethrow in a way that forces them to process the event properly.
- throw e;
- }
- }
-
- /** Thread to execute an HttpClient method */
- protected static class ExecuteMethodThread extends Thread
- {
- protected HttpClient client;
- protected HttpMethodBase executeMethod;
- protected Throwable exception = null;
- protected int rval = 0;
-
- public ExecuteMethodThread(HttpClient client, HttpMethodBase executeMethod)
- {
- super();
- setDaemon(true);
- this.client = client;
- this.executeMethod = executeMethod;
- }
-
- public void run()
- {
- try
- {
- // Call the execute method appropriately
- rval = client.executeMethod(executeMethod);
- }
- catch (Throwable e)
- {
- this.exception = e;
- }
- }
-
- public Throwable getException()
- {
- return exception;
- }
-
- public int getResponse()
- {
- return rval;
- }
}
+ // -- Methods and classes to perform a "list pages" operation. --
- /** Create a URL to obtain the next 500 pages.
+ /** Perform a series of listPages() operations, so that we fully obtain the documents we're looking for even though
+ * we're limited to 500 of them per request.
*/
- protected String getListPagesURL(String startingTitle)
- throws ManifoldCFException
+ protected void listAllPages(ISeedingActivity activities, long startTime, long endTime)
+ throws ManifoldCFException, ServiceInterruption
{
- try
- {
- return baseURL + "action=query&list=allpages" +
- ((startingTitle!=null)?"&apfrom="+URLEncoder.encode(startingTitle,"utf-8"):"") +
- "&aplimit=500";
- }
- catch (UnsupportedEncodingException e)
- {
- throw new ManifoldCFException(e.getMessage(),e);
+ getSession();
+ String lastTitle = null;
+ while (true)
+ {
+ // Start with the last title seen in the previous round. This will cause a duplicate to be queued, but that's not
+ // a problem.
+ String newLastTitle = executeListPagesViaThread(lastTitle,activities);
+ if (newLastTitle == null)
+ break;
+ lastTitle = newLastTitle;
}
}
-
+
/** Execute a listPages() operation via a thread. Returns the last page title. */
protected String executeListPagesViaThread(String startPageTitle, ISeedingActivity activities)
throws ManifoldCFException, ServiceInterruption
@@ -969,7 +919,7 @@ public class WikiConnector extends org.a
try
{
PageBuffer pageBuffer = new PageBuffer();
- ExecuteListPagesThread t = new ExecuteListPagesThread(client,executeMethod,pageBuffer);
+ ExecuteListPagesThread t = new ExecuteListPagesThread(client,executeMethod,pageBuffer,startPageTitle);
try
{
t.start();
@@ -1039,6 +989,13 @@ public class WikiConnector extends org.a
executeMethod = null;
throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
}
+ catch (ManifoldCFException e)
+ {
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+ // Drop the connection on the floor
+ executeMethod = null;
+ throw e;
+ }
catch (java.net.SocketTimeoutException e)
{
long currentTime = System.currentTimeMillis();
@@ -1069,7 +1026,24 @@ public class WikiConnector extends org.a
executeMethod.releaseConnection();
}
}
-
+
+ /** Create a URL to obtain the next 500 pages.
+ */
+ protected String getListPagesURL(String startingTitle)
+ throws ManifoldCFException
+ {
+ try
+ {
+ return baseURL + "action=query&list=allpages" +
+ ((startingTitle!=null)?"&apfrom="+URLEncoder.encode(startingTitle,"utf-8"):"") +
+ "&aplimit=500";
+ }
+ catch (UnsupportedEncodingException e)
+ {
+ throw new ManifoldCFException(e.getMessage(),e);
+ }
+ }
+
/** Thread to execute a list pages operation */
protected static class ExecuteListPagesThread extends Thread
{
@@ -1078,14 +1052,16 @@ public class WikiConnector extends org.a
protected Throwable exception = null;
protected PageBuffer pageBuffer;
protected String lastPageTitle = null;
+ protected String startPageTitle;
- public ExecuteListPagesThread(HttpClient client, HttpMethodBase executeMethod, PageBuffer pageBuffer)
+ public ExecuteListPagesThread(HttpClient client, HttpMethodBase executeMethod, PageBuffer pageBuffer, String startPageTitle)
{
super();
setDaemon(true);
this.client = client;
this.executeMethod = executeMethod;
this.pageBuffer = pageBuffer;
+ this.startPageTitle = startPageTitle;
}
public void run()
@@ -1100,7 +1076,7 @@ public class WikiConnector extends org.a
InputStream is = executeMethod.getResponseBodyAsStream();
try
{
- lastPageTitle = parseListPagesResponse(is,pageBuffer);
+ lastPageTitle = parseListPagesResponse(is,pageBuffer,startPageTitle);
}
finally
{
@@ -1151,12 +1127,12 @@ public class WikiConnector extends org.a
* </query-continue>
* </api>
*/
- protected static String parseListPagesResponse(InputStream is, PageBuffer buffer)
+ protected static String parseListPagesResponse(InputStream is, PageBuffer buffer, String startPageTitle)
throws ManifoldCFException, ServiceInterruption
{
// Parse the document. This will cause various things to occur, within the instantiated XMLContext class.
XMLStream x = new XMLStream();
- WikiListPagesAPIContext c = new WikiListPagesAPIContext(x,buffer);
+ WikiListPagesAPIContext c = new WikiListPagesAPIContext(x,buffer,startPageTitle);
x.setContext(c);
try
{
@@ -1170,19 +1146,6 @@ public class WikiConnector extends org.a
long time = System.currentTimeMillis();
throw new ServiceInterruption(e.getMessage(),e,time + 300000L,time + 12L * 60000L,-1,false);
}
- catch (ManifoldCFException e)
- {
- if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
- throw e;
- // Ignore XML parsing errors.
- if (e.getMessage().indexOf("pars") >= 0)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("Wiki: listPages() response was unparseable ("+e.getMessage()+"), skipping");
- return null;
- }
- throw e;
- }
}
finally
{
@@ -1195,26 +1158,24 @@ public class WikiConnector extends org.a
{
protected String lastTitle = null;
protected PageBuffer buffer;
+ protected String startPageTitle;
- public WikiListPagesAPIContext(XMLStream theStream, PageBuffer buffer)
+ public WikiListPagesAPIContext(XMLStream theStream, PageBuffer buffer, String startPageTitle)
{
super(theStream,"api");
this.buffer = buffer;
+ this.startPageTitle = startPageTitle;
}
protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
{
- return new WikiListPagesQueryContext(theStream,namespaceURI,localName,qName,atts,buffer);
+ return new WikiListPagesQueryContext(theStream,namespaceURI,localName,qName,atts,buffer,startPageTitle);
}
protected void finishChild(BaseProcessingContext child)
- {
- lastTitle = ((WikiListPagesQueryContext)child).getLastTitle();
- }
-
- protected void process()
throws ManifoldCFException
{
+ lastTitle = ((WikiListPagesQueryContext)child).getLastTitle();
}
public String getLastTitle()
@@ -1229,26 +1190,25 @@ public class WikiConnector extends org.a
{
protected String lastTitle = null;
protected PageBuffer buffer;
+ protected String startPageTitle;
- public WikiListPagesQueryContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, PageBuffer buffer)
+ public WikiListPagesQueryContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+ PageBuffer buffer, String startPageTitle)
{
super(theStream,namespaceURI,localName,qName,atts,"query");
this.buffer = buffer;
+ this.startPageTitle = startPageTitle;
}
protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
{
- return new WikiListPagesAllPagesContext(theStream,namespaceURI,localName,qName,atts,buffer);
+ return new WikiListPagesAllPagesContext(theStream,namespaceURI,localName,qName,atts,buffer,startPageTitle);
}
protected void finishChild(BaseProcessingContext child)
- {
- lastTitle = ((WikiListPagesAllPagesContext)child).getLastTitle();
- }
-
- protected void process()
throws ManifoldCFException
{
+ lastTitle = ((WikiListPagesAllPagesContext)child).getLastTitle();
}
public String getLastTitle()
@@ -1263,29 +1223,28 @@ public class WikiConnector extends org.a
{
protected String lastTitle = null;
protected PageBuffer buffer;
+ protected String startPageTitle;
- public WikiListPagesAllPagesContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, PageBuffer buffer)
+ public WikiListPagesAllPagesContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+ PageBuffer buffer, String startPageTitle)
{
super(theStream,namespaceURI,localName,qName,atts,"allpages");
this.buffer = buffer;
+ this.startPageTitle = startPageTitle;
}
protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
{
// When we recognize allpages, we need to look for <p> records.
- return new WikiListPagesPContext(theStream,namespaceURI,localName,qName,atts,buffer);
+ return new WikiListPagesPContext(theStream,namespaceURI,localName,qName,atts,buffer,startPageTitle);
}
protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
{
// Update the last title from all the <p> records we saw.
lastTitle = ((WikiListPagesPContext)child).getLastTitle();
}
-
- protected void process()
- throws ManifoldCFException
- {
- }
public String getLastTitle()
{
@@ -1299,11 +1258,14 @@ public class WikiConnector extends org.a
{
protected String lastTitle = null;
protected PageBuffer buffer;
+ protected String startPageTitle;
- public WikiListPagesPContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, PageBuffer buffer)
+ public WikiListPagesPContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+ PageBuffer buffer, String startPageTitle)
{
super(theStream,namespaceURI,localName,qName,atts);
this.buffer = buffer;
+ this.startPageTitle = startPageTitle;
}
protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
@@ -1311,188 +1273,981 @@ public class WikiConnector extends org.a
{
if (qName.equals("p"))
{
- lastTitle = atts.getValue("title");
- String pageID = atts.getValue("pageid");
- // Add the discovered page id to the page buffer
- try
- {
- buffer.add(pageID);
- }
- catch (InterruptedException e)
+ String currentTitle = atts.getValue("title");
+ // Skip the record that matches the start page title (just pretend it isn't there)
+ if (startPageTitle == null || !currentTitle.equals(startPageTitle))
{
- throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ lastTitle = currentTitle;
+ String pageID = atts.getValue("pageid");
+ // Add the discovered page id to the page buffer
+ try
+ {
+ buffer.add(pageID);
+ }
+ catch (InterruptedException e)
+ {
+ throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
}
}
return super.beginTag(namespaceURI,localName,qName,atts);
}
- protected void process()
- throws ManifoldCFException
- {
- }
-
public String getLastTitle()
{
return lastTitle;
}
}
- /** Create a URL to obtain a page's browse URL, given the page ID.
- */
- protected String getGetURLURL(String documentIdentifier)
- throws ManifoldCFException
- {
- return baseURL + "action=query&prop=info&pageids="+documentIdentifier+"&inprop=url";
- }
-
- /** Parse the response to a GetURL request, of the form:
- * <api>
- * <query>
- * <pages>
- * <page pageid="27697087" ns="0" title="API" touched="2011-09-27T07:00:55Z" lastrevid="367741756" counter="" length="70" redirect="" fullurl="http://en.wikipedia.org/wiki/API" editurl="http://en.wikipedia.org/w/index.php?title=API&action=edit" />
- * </pages>
- * </query>
- *</api>
+
+ // -- Methods and classes to perform a "get Timestamp" operation. --
+
+ /** Obtain document versions for a set of documents.
*/
- protected static String parseGetURLResponse(InputStream is, String documentIdentifier)
+ protected void getTimestamps(String[] documentIdentifiers, Map<String,String> versions, IVersionActivity activities)
throws ManifoldCFException, ServiceInterruption
{
- // Parse the document. This will cause various things to occur, within the instantiated XMLContext class.
- XMLStream x = new XMLStream();
- WikiGetURLAPIContext c = new WikiGetURLAPIContext(x);
- x.setContext(c);
+ getSession();
+ HttpClient client = getInitializedClient();
+ HttpMethodBase executeMethod = getInitializedMethod(getGetTimestampURL(documentIdentifiers));
try
{
+ ExecuteGetTimestampThread t = new ExecuteGetTimestampThread(client,executeMethod,versions);
try
{
- x.parse(is);
- return c.getURL();
+ t.start();
+ t.join();
+ Throwable thr = t.getException();
+ if (thr != null)
+ {
+ if (thr instanceof ManifoldCFException)
+ {
+ if (((ManifoldCFException)thr).getErrorCode() == ManifoldCFException.INTERRUPTED)
+ throw new InterruptedException(thr.getMessage());
+ throw (ManifoldCFException)thr;
+ }
+ else if (thr instanceof ServiceInterruption)
+ throw (ServiceInterruption)thr;
+ else if (thr instanceof IOException)
+ throw (IOException)thr;
+ else if (thr instanceof RuntimeException)
+ throw (RuntimeException)thr;
+ else
+ throw (Error)thr;
+ }
+ }
+ catch (ManifoldCFException e)
+ {
+ t.interrupt();
+ throw e;
+ }
+ catch (ServiceInterruption e)
+ {
+ t.interrupt();
+ throw e;
}
catch (IOException e)
{
- long time = System.currentTimeMillis();
- throw new ServiceInterruption(e.getMessage(),e,time + 300000L,time + 12L * 60000L,-1,false);
+ t.interrupt();
+ throw e;
}
- catch (ManifoldCFException e)
+ catch (InterruptedException e)
{
- // Ignore XML parsing errors.
- if (e.getMessage().indexOf("pars") >= 0)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("Wiki: getURL() document '"+documentIdentifier+"' was unparseable ("+e.getMessage()+"), skipping");
- return null;
- }
+ t.interrupt();
+ // We need the caller to abandon any connections left around, so rethrow in a way that forces them to process the event properly.
throw e;
}
}
- finally
+ catch (InterruptedException e)
{
- x.cleanup();
+ // Drop the connection on the floor
+ executeMethod = null;
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
}
- }
-
- /** Class representing the "api" context of a "get url" response */
- protected static class WikiGetURLAPIContext extends SingleLevelContext
- {
- protected String fullURL = null;
-
- public WikiGetURLAPIContext(XMLStream theStream)
+ catch (ManifoldCFException e)
{
- super(theStream,"api");
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+ // Drop the connection on the floor
+ executeMethod = null;
+ throw e;
}
-
- protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ catch (java.net.SocketTimeoutException e)
{
- return new WikiGetURLQueryContext(theStream,namespaceURI,localName,qName,atts);
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Version fetch timed out reading from the Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
}
-
- protected void finishChild(BaseProcessingContext child)
+ catch (java.net.SocketException e)
{
- fullURL = ((WikiGetURLQueryContext)child).getURL();
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Version fetch received a socket error reading from Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
}
-
- protected void process()
- throws ManifoldCFException
+ catch (org.apache.commons.httpclient.ConnectTimeoutException e)
{
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Version fetch connection timed out reading from Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
}
-
- public String getURL()
+ catch (InterruptedIOException e)
{
- return fullURL;
+ executeMethod = null;
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
}
-
- }
-
- /** Class representing the "api/query" context of a "get url" response */
- protected static class WikiGetURLQueryContext extends SingleLevelContext
- {
- protected String fullURL = null;
-
- public WikiGetURLQueryContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ catch (IOException e)
{
- super(theStream,namespaceURI,localName,qName,atts,"query");
+ throw new ManifoldCFException("Version fetch had an IO failure: "+e.getMessage(),e);
+ }
+ finally
+ {
+ if (executeMethod != null)
+ executeMethod.releaseConnection();
+ }
+ }
+
+ /** Create a URL to obtain multiple page's timestamps, given the page IDs.
+ */
+ protected String getGetTimestampURL(String[] documentIdentifiers)
+ throws ManifoldCFException
+ {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0 ; i < documentIdentifiers.length ; i++)
+ {
+ if (i > 0)
+ sb.append(",");
+ sb.append(documentIdentifiers[i]);
+ }
+ return baseURL + "action=query&prop=revisions&pageids="+sb.toString()+"&rvprop=timestamp";
+ }
+
+ /** Thread to execute a "get timestamp" operation. This thread both executes the operation and parses the result. */
+ protected static class ExecuteGetTimestampThread extends Thread
+ {
+ protected HttpClient client;
+ protected HttpMethodBase executeMethod;
+ protected Throwable exception = null;
+ protected Map<String,String> versions;
+
+ public ExecuteGetTimestampThread(HttpClient client, HttpMethodBase executeMethod, Map<String,String> versions)
+ {
+ super();
+ setDaemon(true);
+ this.client = client;
+ this.executeMethod = executeMethod;
+ this.versions = versions;
+ }
+
+ public void run()
+ {
+ try
+ {
+ // Call the execute method appropriately
+ int rval = client.executeMethod(executeMethod);
+ if (rval != 200)
+ throw new ManifoldCFException("Unexpected response code: "+rval);
+ // Read response and make sure it's valid
+ InputStream is = executeMethod.getResponseBodyAsStream();
+ try
+ {
+ parseGetTimestampResponse(is,versions);
+ }
+ finally
+ {
+ try
+ {
+ is.close();
+ }
+ catch (IllegalStateException e)
+ {
+ // Ignore this error
+ }
+ }
+ }
+ catch (Throwable e)
+ {
+ this.exception = e;
+ }
+ }
+
+ public Throwable getException()
+ {
+ return exception;
+ }
+
+ }
+
+ /** This method parses a response like the following:
+ * <api>
+ * <query>
+ * <pages>
+ * <page pageid="27697087" ns="0" title="API">
+ * <revisions>
+ * <rev user="Graham87" timestamp="2010-06-13T08:41:17Z" />
+ * </revisions>
+ * </page>
+ * </pages>
+ * </query>
+ * </api>
+ */
+ protected static void parseGetTimestampResponse(InputStream is, Map<String,String> versions)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // Parse the document. This will cause various things to occur, within the instantiated XMLContext class.
+ XMLStream x = new XMLStream();
+ WikiGetTimestampAPIContext c = new WikiGetTimestampAPIContext(x,versions);
+ x.setContext(c);
+ try
+ {
+ try
+ {
+ x.parse(is);
+ }
+ catch (IOException e)
+ {
+ long time = System.currentTimeMillis();
+ throw new ServiceInterruption(e.getMessage(),e,time + 300000L,time + 12L * 60000L,-1,false);
+ }
+ }
+ finally
+ {
+ x.cleanup();
+ }
+ }
+
+ /** Class representing the "api" context of a "get timestamp" response */
+ protected static class WikiGetTimestampAPIContext extends SingleLevelContext
+ {
+ protected Map<String,String> versions;
+
+ public WikiGetTimestampAPIContext(XMLStream theStream, Map<String,String> versions)
+ {
+ super(theStream,"api");
+ this.versions = versions;
+ }
+
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ return new WikiGetTimestampQueryContext(theStream,namespaceURI,localName,qName,atts,versions);
+ }
+
+ protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
+ {
+ }
+
+ }
+
+ /** Class representing the "api/query" context of a "get timestamp" response */
+ protected static class WikiGetTimestampQueryContext extends SingleLevelContext
+ {
+ protected Map<String,String> versions;
+
+ public WikiGetTimestampQueryContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+ Map<String,String> versions)
+ {
+ super(theStream,namespaceURI,localName,qName,atts,"query");
+ this.versions = versions;
+ }
+
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ return new WikiGetTimestampPagesContext(theStream,namespaceURI,localName,qName,atts,versions);
+ }
+
+ protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
+ {
+ }
+
+ }
+
+ /** Class looking for the "api/query/pages" context of a "get timestamp" response */
+ protected static class WikiGetTimestampPagesContext extends SingleLevelContext
+ {
+ protected Map<String,String> versions;
+
+ public WikiGetTimestampPagesContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+ Map<String,String> versions)
+ {
+ super(theStream,namespaceURI,localName,qName,atts,"pages");
+ this.versions = versions;
+ }
+
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ return new WikiGetTimestampPageContext(theStream,namespaceURI,localName,qName,atts);
+ }
+
+ protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
+ {
+ WikiGetTimestampPageContext pc = (WikiGetTimestampPageContext)child;
+ String pageID = pc.getPageID();
+ String version = pc.getLastEdit();
+ if (pageID != null && version != null)
+ versions.put(pageID,version);
+ }
+ }
+
+ /** Class looking for the "api/query/pages/page" context of a "get timestamp" response */
+ protected static class WikiGetTimestampPageContext extends BaseProcessingContext
+ {
+ protected String pageID = null;
+ protected String lastRevEdit = null;
+
+ public WikiGetTimestampPageContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ super(theStream,namespaceURI,localName,qName,atts);
+ }
+
+ protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ if (qName.equals("page"))
+ {
+ pageID = atts.getValue("pageid");
+ return new WikiGetTimestampRevisionsContext(theStream,namespaceURI,localName,qName,atts);
+ }
+ return super.beginTag(namespaceURI,localName,qName,atts);
+ }
+
+ protected void endTag()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ XMLContext theContext = theStream.getContext();
+ String theTag = theContext.getQname();
+
+ if (theTag.equals("page"))
+ lastRevEdit = ((WikiGetTimestampRevisionsContext)theContext).getTimestamp();
+ else
+ super.endTag();
+ }
+
+ public String getPageID()
+ {
+ return pageID;
+ }
+
+ public String getLastEdit()
+ {
+ return lastRevEdit;
+ }
+ }
+
+ /** Class looking for the "api/query/pages/page/revisions" context of a "get timestamp" response */
+ protected static class WikiGetTimestampRevisionsContext extends SingleLevelContext
+ {
+ protected String timestamp = null;
+
+ public WikiGetTimestampRevisionsContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ super(theStream,namespaceURI,localName,qName,atts,"revisions");
+ }
+
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ return new WikiGetTimestampRevContext(theStream,namespaceURI,localName,qName,atts);
+ }
+
+ protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
+ {
+ WikiGetTimestampRevContext rc = (WikiGetTimestampRevContext)child;
+ if (timestamp == null)
+ timestamp = rc.getTimestamp();
+ }
+
+ public String getTimestamp()
+ {
+ return timestamp;
+ }
+ }
+
+ /** Class looking for the "api/query/pages/page/revisions/rev" context of a "get timestamp" response */
+ protected static class WikiGetTimestampRevContext extends BaseProcessingContext
+ {
+ protected String timestamp = null;
+
+ public WikiGetTimestampRevContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ super(theStream,namespaceURI,localName,qName,atts);
+ }
+
+ protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ if (qName.equals("rev"))
+ timestamp = atts.getValue("timestamp");
+ return super.beginTag(namespaceURI,localName,qName,atts);
+ }
+
+ public String getTimestamp()
+ {
+ return timestamp;
+ }
+ }
+
+ // -- Methods and classes to perform a "get Docinfo" operation. --
+
+ /** Get document info and index the document.
+ */
+ protected void getDocInfo(String documentIdentifier, String documentVersion, IProcessActivity activities)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ getSession();
+ HttpClient client = getInitializedClient();
+ HttpMethodBase executeMethod = getInitializedMethod(getGetDocInfoURL(documentIdentifier));
+
+ String statusCode = "UNKNOWN";
+ String errorMessage = null;
+ long startTime = System.currentTimeMillis();
+ long dataSize = 0L;
+
+ try
+ {
+ ExecuteGetDocInfoThread t = new ExecuteGetDocInfoThread(client,executeMethod,documentIdentifier);
+ try
+ {
+ t.start();
+ t.join();
+
+ statusCode = t.getStatusCode();
+ errorMessage = t.getErrorMessage();
+
+ Throwable thr = t.getException();
+ if (thr != null)
+ {
+ if (thr instanceof ManifoldCFException)
+ {
+ if (((ManifoldCFException)thr).getErrorCode() == ManifoldCFException.INTERRUPTED)
+ throw new InterruptedException(thr.getMessage());
+ throw (ManifoldCFException)thr;
+ }
+ else if (thr instanceof ServiceInterruption)
+ throw (ServiceInterruption)thr;
+ else if (thr instanceof IOException)
+ throw (IOException)thr;
+ else if (thr instanceof RuntimeException)
+ throw (RuntimeException)thr;
+ else
+ throw (Error)thr;
+ }
+
+ // Fetch all the data we need from the thread, and do the indexing.
+ File contentFile = t.getContentFile();
+ if (contentFile != null)
+ {
+ statusCode = "OK";
+ try
+ {
+ String author = t.getAuthor();
+ String comment = t.getComment();
+ String title = t.getTitle();
+ String fullURL = t.getFullURL();
+
+ RepositoryDocument rd = new RepositoryDocument();
+ dataSize = contentFile.length();
+ InputStream is = new FileInputStream(contentFile);
+ try
+ {
+ rd.setBinary(is,dataSize);
+ if (comment != null)
+ rd.addField("comment",comment);
+ if (author != null)
+ rd.addField("author",author);
+ if (title != null)
+ rd.addField("title",title);
+ activities.ingestDocument(documentIdentifier,documentVersion,fullURL,rd);
+ }
+ finally
+ {
+ is.close();
+ }
+ }
+ finally
+ {
+ contentFile.delete();
+ }
+ }
+ }
+ catch (ManifoldCFException e)
+ {
+ t.interrupt();
+ throw e;
+ }
+ catch (ServiceInterruption e)
+ {
+ t.interrupt();
+ throw e;
+ }
+ catch (IOException e)
+ {
+ t.interrupt();
+ throw e;
+ }
+ catch (InterruptedException e)
+ {
+ t.interrupt();
+ // We need the caller to abandon any connections left around, so rethrow in a way that forces them to process the event properly.
+ throw e;
+ }
+ finally
+ {
+ t.cleanup();
+ }
+ }
+ catch (InterruptedException e)
+ {
+ // Drop the connection on the floor
+ executeMethod = null;
+ statusCode = null;
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (ManifoldCFException e)
+ {
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+ {
+ // Drop the connection on the floor
+ executeMethod = null;
+ statusCode = null;
+ }
+ throw e;
+ }
+ catch (java.net.SocketTimeoutException e)
+ {
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Get doc info timed out reading from the Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+ }
+ catch (java.net.SocketException e)
+ {
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Get doc info received a socket error reading from Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+ }
+ catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ {
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Get doc info connection timed out reading from Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+ }
+ catch (InterruptedIOException e)
+ {
+ executeMethod = null;
+ statusCode = null;
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (IOException e)
+ {
+ throw new ManifoldCFException("Get doc info had an IO failure: "+e.getMessage(),e);
+ }
+ finally
+ {
+ if (executeMethod != null)
+ executeMethod.releaseConnection();
+ if (statusCode != null)
+ activities.recordActivity(new Long(startTime),ACTIVITY_FETCH,new Long(dataSize),documentIdentifier,statusCode,errorMessage,null);
+ }
+ }
+
+ /** Thread to execute a "get doc info" operation. This thread both executes the operation and parses the result. */
+ protected static class ExecuteGetDocInfoThread extends Thread
+ {
+ protected HttpClient client;
+ protected HttpMethodBase executeMethod;
+ protected Throwable exception = null;
+ protected String documentIdentifier;
+ protected File contentFile = null;
+ protected String author = null;
+ protected String title = null;
+ protected String comment = null;
+ protected String fullURL = null;
+
+ protected String statusCode = null;
+ protected String errorMessage = null;
+
+ public ExecuteGetDocInfoThread(HttpClient client, HttpMethodBase executeMethod, String documentIdentifier)
+ {
+ super();
+ setDaemon(true);
+ this.client = client;
+ this.executeMethod = executeMethod;
+ this.documentIdentifier = documentIdentifier;
+ }
+
+ public void run()
+ {
+ try
+ {
+ // Call the execute method appropriately
+ int rval = client.executeMethod(executeMethod);
+ if (rval != 200)
+ {
+ statusCode = "HTTP code "+rval;
+ throw new ManifoldCFException("Unexpected response code "+rval+": "+executeMethod.getResponseBodyAsString());
+ }
+ // Read response and make sure it's valid
+ InputStream is = executeMethod.getResponseBodyAsStream();
+ try
+ {
+ // Parse the document. This will cause various things to occur, within the instantiated XMLContext class.
+ // <api>
+ // <query>
+ // <pages>
+ // <page pageid="27697087" ns="0" title="API" touched="2011-09-27T07:00:55Z" lastrevid="367741756" counter="" length="70" redirect="" fullurl="http://en.wikipedia.org/wiki/API" editurl="http://en.wikipedia.org/w/index.php?title=API&action=edit">
+ // <revisions>
+ // <rev user="Graham87" timestamp="2010-06-13T08:41:17Z" comment="Protected API: restore protection ([edit=sysop] (indefinite) [move=sysop] (indefinite))" xml:space="preserve">#REDIRECT [[Application programming interface]]{{R from abbreviation}}</rev>
+ // </revisions>
+ // </page>
+ // </pages>
+ // </query>
+ //</api>
+
+ XMLStream x = new XMLStream();
+ WikiGetDocInfoAPIContext c = new WikiGetDocInfoAPIContext(x);
+ x.setContext(c);
+ try
+ {
+ try
+ {
+ x.parse(is);
+ contentFile = c.getContentFile();
+ fullURL = c.getURL();
+ title = c.getTitle();
+ author = c.getAuthor();
+ comment = c.getComment();
+ statusCode = "OK";
+ }
+ catch (IOException e)
+ {
+ long time = System.currentTimeMillis();
+ throw new ServiceInterruption(e.getMessage(),e,time + 300000L,time + 12L * 60000L,-1,false);
+ }
+ }
+ finally
+ {
+ x.cleanup();
+ }
+ }
+ finally
+ {
+ try
+ {
+ is.close();
+ }
+ catch (IllegalStateException e)
+ {
+ // Ignore this error
+ }
+ }
+ }
+ catch (Throwable e)
+ {
+ statusCode = "Exception";
+ errorMessage = e.getMessage();
+ this.exception = e;
+ }
}
- protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ public Throwable getException()
+ {
+ return exception;
+ }
+
+ public String getStatusCode()
+ {
+ return statusCode;
+ }
+
+ public String getErrorMessage()
+ {
+ return errorMessage;
+ }
+
+ public File getContentFile()
+ {
+ File rval = contentFile;
+ contentFile = null;
+ return rval;
+ }
+
+ public String getAuthor()
+ {
+ return author;
+ }
+
+ public String getComment()
+ {
+ return comment;
+ }
+
+ public String getTitle()
+ {
+ return title;
+ }
+
+ public String getFullURL()
+ {
+ return fullURL;
+ }
+
+ public void cleanup()
+ {
+ if (contentFile != null)
+ {
+ contentFile.delete();
+ contentFile = null;
+ }
+ }
+
+ }
+
+ /** Create a URL to obtain a page's metadata and content, given the page ID.
+ * QUESTION: Can we do multiple document identifiers at a time??
+ */
+ protected String getGetDocInfoURL(String documentIdentifier)
+ throws ManifoldCFException
+ {
+ return baseURL + "action=query&prop=revisions&pageids="+documentIdentifier+"&rvprop=user|comment|content&inprop=url";
+ }
+
+ /** Class representing the "api" context of a "get doc info" response */
+ protected static class WikiGetDocInfoAPIContext extends SingleLevelContext
+ {
+ /** Full URL */
+ protected String fullURL = null;
+ /** Title */
+ protected String title = null;
+ /** Content file */
+ protected File contentFile = null;
+ /** Author */
+ protected String author = null;
+ /** Comment */
+ protected String comment = null;
+
+ public WikiGetDocInfoAPIContext(XMLStream theStream)
{
- return new WikiGetURLPagesContext(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,"api");
}
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ return new WikiGetDocInfoQueryContext(theStream,namespaceURI,localName,qName,atts);
+ }
+
protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
+ {
+ WikiGetDocInfoQueryContext pc = (WikiGetDocInfoQueryContext)child;
+ tagCleanup();
+ fullURL = pc.getURL();
+ title = pc.getTitle();
+ contentFile = pc.getContentFile();
+ author = pc.getAuthor();
+ comment = pc.getComment();
+ }
+
+ protected void tagCleanup()
+ throws ManifoldCFException
+ {
+ // Delete the contents file if it is there.
+ if (contentFile != null)
+ {
+ contentFile.delete();
+ contentFile = null;
+ }
+ }
+
+ public String getURL()
+ {
+ return fullURL;
+ }
+
+ public String getTitle()
+ {
+ return title;
+ }
+
+ public File getContentFile()
+ {
+ File rval = contentFile;
+ contentFile = null;
+ return rval;
+ }
+
+ public String getAuthor()
+ {
+ return author;
+ }
+
+ public String getComment()
+ {
+ return comment;
+ }
+
+ }
+
+ /** Class representing the "api/query" context of a "get doc info" response */
+ protected static class WikiGetDocInfoQueryContext extends SingleLevelContext
+ {
+ /** Full URL */
+ protected String fullURL = null;
+ /** Title */
+ protected String title = null;
+ /** Content file */
+ protected File contentFile = null;
+ /** Author */
+ protected String author = null;
+ /** Comment */
+ protected String comment = null;
+
+ public WikiGetDocInfoQueryContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ super(theStream,namespaceURI,localName,qName,atts,"query");
+ }
+
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
{
- fullURL = ((WikiGetURLPagesContext)child).getURL();
+ return new WikiGetDocInfoPagesContext(theStream,namespaceURI,localName,qName,atts);
}
- protected void process()
+ protected void finishChild(BaseProcessingContext child)
throws ManifoldCFException
{
+ WikiGetDocInfoPagesContext pc = (WikiGetDocInfoPagesContext)child;
+ tagCleanup();
+ fullURL = pc.getURL();
+ title = pc.getTitle();
+ contentFile = pc.getContentFile();
+ author = pc.getAuthor();
+ comment = pc.getComment();
}
+ protected void tagCleanup()
+ throws ManifoldCFException
+ {
+ // Delete the contents file if it is there.
+ if (contentFile != null)
+ {
+ contentFile.delete();
+ contentFile = null;
+ }
+ }
+
public String getURL()
{
return fullURL;
}
+
+ public String getTitle()
+ {
+ return title;
+ }
+
+ public File getContentFile()
+ {
+ File rval = contentFile;
+ contentFile = null;
+ return rval;
+ }
+
+ public String getAuthor()
+ {
+ return author;
+ }
+
+ public String getComment()
+ {
+ return comment;
+ }
}
- /** Class representing the "api/query/pages" context of a "get url" response */
- protected static class WikiGetURLPagesContext extends SingleLevelContext
+ /** Class representing the "api/query/pages" context of a "get doc info" response */
+ protected static class WikiGetDocInfoPagesContext extends SingleLevelContext
{
+ /** Full URL */
protected String fullURL = null;
+ /** Title */
+ protected String title = null;
+ /** Content file */
+ protected File contentFile = null;
+ /** Author */
+ protected String author = null;
+ /** Comment */
+ protected String comment = null;
- public WikiGetURLPagesContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ public WikiGetDocInfoPagesContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
{
super(theStream,namespaceURI,localName,qName,atts,"pages");
}
protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
{
- return new WikiGetURLPageContext(theStream,namespaceURI,localName,qName,atts);
+ return new WikiGetDocInfoPageContext(theStream,namespaceURI,localName,qName,atts);
}
protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
{
- fullURL = ((WikiGetURLPagesContext)child).getURL();
+ WikiGetDocInfoPageContext pc = (WikiGetDocInfoPageContext)child;
+ tagCleanup();
+ fullURL = pc.getURL();
+ title = pc.getTitle();
+ contentFile = pc.getContentFile();
+ author = pc.getAuthor();
+ comment = pc.getComment();
}
-
- protected void process()
+
+ protected void tagCleanup()
throws ManifoldCFException
{
+ // Delete the contents file if it is there.
+ if (contentFile != null)
+ {
+ contentFile.delete();
+ contentFile = null;
+ }
}
-
+
public String getURL()
{
return fullURL;
}
+ public String getTitle()
+ {
+ return title;
+ }
+
+ public File getContentFile()
+ {
+ File rval = contentFile;
+ contentFile = null;
+ return rval;
+ }
+
+ public String getAuthor()
+ {
+ return author;
+ }
+
+ public String getComment()
+ {
+ return comment;
+ }
+
}
- /** Class representing the "api/query/pages/page" context of a "get url" response */
- protected static class WikiGetURLPageContext extends BaseProcessingContext
+ /** Class representing the "api/query/pages/page" context of a "get doc info" response */
+ protected static class WikiGetDocInfoPageContext extends BaseProcessingContext
{
+ /** Full URL */
protected String fullURL = null;
+ /** Title */
+ protected String title = null;
+ /** Content file */
+ protected File contentFile = null;
+ /** Author */
+ protected String author = null;
+ /** Comment */
+ protected String comment = null;
- public WikiGetURLPageContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ public WikiGetDocInfoPageContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
{
super(theStream,namespaceURI,localName,qName,atts);
}
@@ -1501,43 +2256,213 @@ public class WikiConnector extends org.a
throws ManifoldCFException, ServiceInterruption
{
if (qName.equals("page"))
+ {
fullURL = atts.getValue("fullurl");
+ title = atts.getValue("title");
+ return new WikiGetDocInfoRevisionsContext(theStream,namespaceURI,localName,qName,atts);
+ }
return super.beginTag(namespaceURI,localName,qName,atts);
}
- protected void process()
+ protected void endTag()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ XMLContext theContext = theStream.getContext();
+ String theTag = theContext.getQname();
+ if (theTag.equals("page"))
+ {
+ // Pull down the data
+ WikiGetDocInfoRevisionsContext rc = (WikiGetDocInfoRevisionsContext)theContext;
+ tagCleanup();
+ contentFile = rc.getContentFile();
+ author = rc.getAuthor();
+ comment = rc.getComment();
+ }
+ super.endTag();
+ }
+
+ protected void tagCleanup()
throws ManifoldCFException
{
+ // Delete the contents file if it is there.
+ if (contentFile != null)
+ {
+ contentFile.delete();
+ contentFile = null;
+ }
}
-
+
public String getURL()
{
return fullURL;
}
+
+ public String getTitle()
+ {
+ return title;
+ }
+
+ public File getContentFile()
+ {
+ File rval = contentFile;
+ contentFile = null;
+ return rval;
+ }
+
+ public String getAuthor()
+ {
+ return author;
+ }
+
+ public String getComment()
+ {
+ return comment;
+ }
+
}
- /** Create a URL to obtain a page's timestamp, given the page ID.
- */
- protected String getGetTimestampURL(String documentIdentifier)
- throws ManifoldCFException
+ /** Class representing the "api/query/pages/page/revisions" context of a "get doc info" response */
+ protected static class WikiGetDocInfoRevisionsContext extends SingleLevelContext
{
- return baseURL + "action=query&prop=revisions&pageids="+documentIdentifier+"&rvprop=timestamp";
- }
+ protected File contentFile = null;
+ protected String author = null;
+ protected String comment = null;
+
+ public WikiGetDocInfoRevisionsContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ super(theStream,namespaceURI,localName,qName,atts,"revisions");
+ }
- /** Create a URL to obtain a page's metadata and content, given the page ID.
- */
- protected String getGetDocinfoURL(String documentIdentifier)
- throws ManifoldCFException
- {
- return baseURL + "action=query&prop=revisions&pageids="+documentIdentifier+"&rvprop=user|comment|content";
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ // MHL to insure that only the FIRST revision is taken.
+ return new WikiGetDocInfoRevContext(theStream,namespaceURI,localName,qName,atts);
+ }
+
+ protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
+ {
+ WikiGetDocInfoRevContext rc = (WikiGetDocInfoRevContext)child;
+ tagCleanup();
+ contentFile = rc.getContentFile();
+ author = rc.getAuthor();
+ comment = rc.getComment();
+ }
+
+ protected void tagCleanup()
+ throws ManifoldCFException
+ {
+ // Delete the contents file if it is there.
+ if (contentFile != null)
+ {
+ contentFile.delete();
+ contentFile = null;
+ }
+ }
+
+ public File getContentFile()
+ {
+ File rval = contentFile;
+ contentFile = null;
+ return rval;
+ }
+
+ public String getAuthor()
+ {
+ return author;
+ }
+
+ public String getComment()
+ {
+ return comment;
+ }
+
}
- /** Get a URL for a check operation.
- */
- protected String getCheckURL()
- throws ManifoldCFException
+ /** Class looking for the "api/query/pages/page/revisions/rev" context of a "get doc info" response */
+ protected static class WikiGetDocInfoRevContext extends BaseProcessingContext
{
- return baseURL + "action=query&list=allpages&aplimit=1";
+ protected String author = null;
+ protected String comment = null;
+ protected File contentFile = null;
+
+ public WikiGetDocInfoRevContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ super(theStream,namespaceURI,localName,qName,atts);
+ }
+
+ protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ if (qName.equals("rev"))
+ {
+ author = atts.getValue("user");
+ comment = atts.getValue("comment");
+ try
+ {
+ File tempFile = File.createTempFile("_wikidata_","tmp");
+ return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+ }
+ catch (java.net.SocketTimeoutException e)
+ {
+ throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e);
+ }
+ catch (InterruptedIOException e)
+ {
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (IOException e)
+ {
+ throw new ManifoldCFException("IO exception creating temp file: "+e.getMessage(),e);
+ }
+ }
+ return super.beginTag(namespaceURI,localName,qName,atts);
+ }
+
+ protected void endTag()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ XMLContext theContext = theStream.getContext();
+ String theTag = theContext.getQname();
+ if (theTag.equals("rev"))
+ {
+ // Pull down the data
+ XMLFileContext rc = (XMLFileContext)theContext;
+ tagCleanup();
+ contentFile = rc.getCompletedFile();
+ }
+ else
+ super.endTag();
+ }
+
+ protected void tagCleanup()
+ throws ManifoldCFException
+ {
+ // Delete the contents file if it is there.
+ if (contentFile != null)
+ {
+ contentFile.delete();
+ contentFile = null;
+ }
+ }
+
+ public String getAuthor()
+ {
+ return author;
+ }
+
+ public String getComment()
+ {
+ return comment;
+ }
+
+ public File getContentFile()
+ {
+ File rval = contentFile;
+ contentFile = null;
+ return rval;
+ }
+
}
}
Added: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/wiki/tests/Base.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/wiki/tests/Base.java?rev=1181236&view=auto
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/wiki/tests/Base.java (added)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/wiki/tests/Base.java Mon Oct 10 21:03:46 2011
@@ -0,0 +1,119 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.wiki.tests;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.agents.interfaces.*;
+import org.apache.manifoldcf.crawler.interfaces.*;
+import org.apache.manifoldcf.crawler.system.ManifoldCF;
+
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.servlet.Context;
+import org.mortbay.jetty.servlet.ServletHolder;
+
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import java.io.*;
+import java.util.*;
+import org.junit.*;
+
+/** This is a testing base class that is responsible for setting up/tearing down the agents framework. */
+public class Base extends org.apache.manifoldcf.crawler.tests.ConnectorBase
+{
+
+ protected String[] getConnectorNames()
+ {
+ return new String[]{"Wiki Connector"};
+ }
+
+ protected String[] getConnectorClasses()
+ {
+ return new String[]{"org.apache.manifoldcf.crawler.connectors.wiki.WikiConnector"};
+ }
+
+ /** Mock wiki service */
+ static class MockWikiService
+ {
+ Server server;
+
+ public MockWikiService()
+ {
+ server = new Server(8089);
+ Context asContext = new Context(server,"/w",Context.SESSIONS);
+ asContext.addServlet(new ServletHolder(new WikiAPIServlet()), "/api.php");
+ }
+
+ public void start() throws Exception {
+ server.start();
+ }
+
+ public void stop() throws Exception {
+ server.stop();
+ }
+
+ public static class WikiAPIServlet extends HttpServlet
+ {
+ @Override
+ public void service(HttpServletRequest req, HttpServletResponse res)
+ throws IOException
+ {
+ String format = req.getParameter("format");
+ if (!format.equals("xml"))
+ throw new IOException("Format parameter incorrect: "+format);
+ String list = req.getParameter("list");
+ String action = req.getParameter("action");
+ if (action == null && list == null)
+ throw new IOException("Must have either action or list");
+ if (action != null && list != null)
+ throw new IOException("Cannot have both action and list");
+ if (action != null)
+ {
+ if (!action.equals("query"))
+ throw new IOException("Action parameter incorrect: "+action);
+ String prop = req.getParameter("prop");
+ String pageIds = req.getParameter("pageids");
+ String rvprop = req.getParameter("rvprop");
+ String inprop = req.getParameter("inprop");
+ // MHL
+ }
+ else if (list != null)
+ {
+ if (!list.equals("allpages"))
+ throw new IOException("List parameter incorrect: "+list);
+ String apfrom = req.getParameter("apfrom");
+ String aplimit = req.getParameter("aplimit");
+ // MHL
+ }
+
+ // MHL
+ String user = req.getParameter("username");
+ res.setStatus(HttpServletResponse.SC_OK);
+ if(user.equals("user1") || user.equals("user2") || user.equals("user3"))
+ res.getWriter().printf("TOKEN:token1\n");
+ if(user.equals("user2") || user.equals("user3"))
+ res.getWriter().printf("TOKEN:token2\n");
+ if(user.equals("user3"))
+ res.getWriter().printf("TOKEN:token3\n");
+ }
+ }
+ }
+
+}
Propchange: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/wiki/tests/Base.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/wiki/tests/Base.java
------------------------------------------------------------------------------
svn:keywords = Id