You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/10/20 02:44:52 UTC
svn commit: r1186588 -
/incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Author: kwright
Date: Thu Oct 20 00:44:51 2011
New Revision: 1186588
URL: http://svn.apache.org/viewvc?rev=1186588&view=rev
Log:
Put in all the pieces for namespaces and prefix restrictions.
Modified:
incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Modified: incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1186588&r1=1186587&r2=1186588&view=diff
==============================================================================
--- incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Thu Oct 20 00:44:51 2011
@@ -220,7 +220,7 @@ public class WikiConnector extends org.a
long startTime, long endTime)
throws ManifoldCFException, ServiceInterruption
{
- listAllPages(activities,startTime,endTime);
+ listAllPages(activities,null,null,startTime,endTime);
}
/** Get document versions given an array of document identifiers.
@@ -901,7 +901,7 @@ public class WikiConnector extends org.a
/** Perform a series of listPages() operations, so that we fully obtain the documents we're looking for even though
* we're limited to 500 of them per request.
*/
- protected void listAllPages(ISeedingActivity activities, long startTime, long endTime)
+ protected void listAllPages(ISeedingActivity activities, String namespace, String prefix, long startTime, long endTime)
throws ManifoldCFException, ServiceInterruption
{
getSession();
@@ -911,7 +911,7 @@ public class WikiConnector extends org.a
activities.checkJobStillActive();
// Start with the last title seen in the previous round.
- String newLastTitle = executeListPagesViaThread(lastTitle,activities);
+ String newLastTitle = executeListPagesViaThread(lastTitle,namespace,prefix,activities);
if (newLastTitle == null)
break;
lastTitle = newLastTitle;
@@ -919,11 +919,11 @@ public class WikiConnector extends org.a
}
/** Execute a listPages() operation via a thread. Returns the last page title. */
- protected String executeListPagesViaThread(String startPageTitle, ISeedingActivity activities)
+ protected String executeListPagesViaThread(String startPageTitle, String namespace, String prefix, ISeedingActivity activities)
throws ManifoldCFException, ServiceInterruption
{
HttpClient client = getInitializedClient();
- HttpMethodBase executeMethod = getInitializedMethod(getListPagesURL(startPageTitle));
+ HttpMethodBase executeMethod = getInitializedMethod(getListPagesURL(startPageTitle,namespace,prefix));
try
{
PageBuffer pageBuffer = new PageBuffer();
@@ -1037,12 +1037,14 @@ public class WikiConnector extends org.a
/** Create a URL to obtain the next 500 pages.
*/
- protected String getListPagesURL(String startingTitle)
+ protected String getListPagesURL(String startingTitle, String namespace, String prefix)
throws ManifoldCFException
{
try
{
return baseURL + "action=query&list=allpages" +
+ ((prefix != null)?"&apprefix="+URLEncoder.encode(prefix,"utf-8"):"") +
+ ((namespace != null)?"&apnamespace="+URLEncoder.encode(namespace,"utf-8"):"") +
((startingTitle!=null)?"&apfrom="+URLEncoder.encode(startingTitle,"utf-8"):"") +
"&aplimit=500";
}
@@ -1997,7 +1999,328 @@ public class WikiConnector extends org.a
return timestamp;
}
}
+
+ // -- Methods and classes to perform a "get namespaces" operation. --
+
+ /** Obtain the set of namespaces by adding them to the supplied map. Each entry is keyed by the
+ * canonical namespace name, and its value is the descriptive name.
+ */
+ protected void getNamespaces(Map<String,String> namespaces)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ getSession();
+ HttpClient client = getInitializedClient();
+ HttpMethodBase executeMethod = getInitializedMethod(getGetNamespacesURL());
+
+ try
+ {
+ ExecuteGetNamespacesThread t = new ExecuteGetNamespacesThread(client,executeMethod,namespaces);
+ try
+ {
+ t.start();
+ t.join();
+
+ Throwable thr = t.getException();
+ if (thr != null)
+ {
+ if (thr instanceof ManifoldCFException)
+ {
+ if (((ManifoldCFException)thr).getErrorCode() == ManifoldCFException.INTERRUPTED)
+ throw new InterruptedException(thr.getMessage());
+ throw (ManifoldCFException)thr;
+ }
+ else if (thr instanceof ServiceInterruption)
+ throw (ServiceInterruption)thr;
+ else if (thr instanceof IOException)
+ throw (IOException)thr;
+ else if (thr instanceof RuntimeException)
+ throw (RuntimeException)thr;
+ else
+ throw (Error)thr;
+ }
+
+ }
+ catch (ManifoldCFException e)
+ {
+ t.interrupt();
+ throw e;
+ }
+ catch (ServiceInterruption e)
+ {
+ t.interrupt();
+ throw e;
+ }
+ catch (IOException e)
+ {
+ t.interrupt();
+ throw e;
+ }
+ catch (InterruptedException e)
+ {
+ t.interrupt();
+ // We need the caller to abandon any connections left around, so rethrow in a way that forces them to process the event properly.
+ throw e;
+ }
+ }
+ catch (InterruptedException e)
+ {
+ // Drop the connection on the floor
+ executeMethod = null;
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (ManifoldCFException e)
+ {
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+ // Drop the connection on the floor
+ executeMethod = null;
+ throw e;
+ }
+ catch (java.net.SocketTimeoutException e)
+ {
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Get namespaces timed out reading from the Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+ }
+ catch (java.net.SocketException e)
+ {
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Get namespaces received a socket error reading from Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+ }
+ catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ {
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Get namespaces connection timed out reading from Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+ }
+ catch (InterruptedIOException e)
+ {
+ executeMethod = null;
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (IOException e)
+ {
+ throw new ManifoldCFException("Get namespaces had an IO failure: "+e.getMessage(),e);
+ }
+ finally
+ {
+ if (executeMethod != null)
+ executeMethod.releaseConnection();
+ }
+ }
+
+ /** Thread to execute a "get namespaces" operation. This thread both executes the operation and parses the result. */
+ protected static class ExecuteGetNamespacesThread extends Thread
+ {
+ protected HttpClient client;
+ protected HttpMethodBase executeMethod;
+ protected Throwable exception = null;
+ protected Map<String,String> namespaces;
+
+ public ExecuteGetNamespacesThread(HttpClient client, HttpMethodBase executeMethod, Map<String,String> namespaces)
+ {
+ super();
+ setDaemon(true);
+ this.client = client;
+ this.executeMethod = executeMethod;
+ this.namespaces = namespaces;
+ }
+
+ public void run()
+ {
+ try
+ {
+ // Call the execute method appropriately
+ int rval = client.executeMethod(executeMethod);
+ if (rval != 200)
+ {
+ throw new ManifoldCFException("Unexpected response code "+rval+": "+executeMethod.getResponseBodyAsString());
+ }
+ // Read response and make sure it's valid
+ InputStream is = executeMethod.getResponseBodyAsStream();
+ try
+ {
+ // Parse the document. The context classes attached to the stream below record each <ns> element in the supplied namespaces map. Sample response:
+ //<api>
+ // <query>
+ // <namespaces>
+ // <ns id="-2" case="first-letter" canonical="Media" xml:space="preserve">Media</ns>
+ // <ns id="-1" case="first-letter" canonical="Special" xml:space="preserve">Special</ns>
+ // <ns id="0" case="first-letter" subpages="" content="" xml:space="preserve" />
+ // <ns id="1" case="first-letter" subpages="" canonical="Talk" xml:space="preserve">Talk</ns>
+ // <ns id="2" case="first-letter" subpages="" canonical="User" xml:space="preserve">User</ns>
+ // <ns id="90" case="first-letter" canonical="Thread" xml:space="preserve">Thread</ns>
+ // <ns id="91" case="first-letter" canonical="Thread talk" xml:space="preserve">Thread talk</ns>
+ // </namespaces>
+ // </query>
+ //</api>
+ XMLStream x = new XMLStream();
+ WikiGetNamespacesAPIContext c = new WikiGetNamespacesAPIContext(x,namespaces);
+ x.setContext(c);
+ try
+ {
+ try
+ {
+ x.parse(is);
+ }
+ catch (IOException e)
+ {
+ long time = System.currentTimeMillis();
+ throw new ServiceInterruption(e.getMessage(),e,time + 300000L,time + 12L * 60000L,-1,false);
+ }
+ }
+ finally
+ {
+ x.cleanup();
+ }
+ }
+ finally
+ {
+ try
+ {
+ is.close();
+ }
+ catch (IllegalStateException e)
+ {
+ // Ignore this error
+ }
+ }
+ }
+ catch (Throwable e)
+ {
+ this.exception = e;
+ }
+ }
+
+ public Throwable getException()
+ {
+ return exception;
+ }
+
+ }
+
+ /** Create a URL to obtain the namespaces.
+ */
+ protected String getGetNamespacesURL()
+ throws ManifoldCFException
+ {
+ return baseURL + "action=query&meta=siteinfo&siprop=namespaces";
+ }
+
+ /** Class representing the "api" context of a "get namespaces" response */
+ protected static class WikiGetNamespacesAPIContext extends SingleLevelContext
+ {
+ protected Map<String,String> namespaces;
+
+ public WikiGetNamespacesAPIContext(XMLStream theStream, Map<String,String> namespaces)
+ {
+ super(theStream,"api");
+ this.namespaces = namespaces;
+ }
+
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ return new WikiGetNamespacesQueryContext(theStream,namespaceURI,localName,qName,atts,namespaces);
+ }
+
+ protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
+ {
+ }
+
+ }
+
+ /** Class representing the "api/query" context of a "get namespaces" response */
+ protected static class WikiGetNamespacesQueryContext extends SingleLevelContext
+ {
+ protected Map<String,String> namespaces;
+ public WikiGetNamespacesQueryContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+ Map<String,String> namespaces)
+ {
+ super(theStream,namespaceURI,localName,qName,atts,"query");
+ this.namespaces = namespaces;
+ }
+
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ return new WikiGetNamespacesNamespacesContext(theStream,namespaceURI,localName,qName,atts,namespaces);
+ }
+
+ protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
+ {
+ }
+
+ }
+
+ /** Class representing the "api/query/namespaces" context of a "get namespaces" response */
+ protected static class WikiGetNamespacesNamespacesContext extends SingleLevelContext
+ {
+ protected Map<String,String> namespaces;
+
+ public WikiGetNamespacesNamespacesContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+ Map<String,String> namespaces)
+ {
+ super(theStream,namespaceURI,localName,qName,atts,"namespaces");
+ this.namespaces = namespaces;
+ }
+
+ protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ return new WikiGetNamespacesNsContext(theStream,namespaceURI,localName,qName,atts,namespaces);
+ }
+
+ protected void finishChild(BaseProcessingContext child)
+ throws ManifoldCFException
+ {
+ }
+
+ }
+
+ /** Class that handles the "ns" elements of a "get namespaces" response, recording each one in the namespaces map. */
+ protected static class WikiGetNamespacesNsContext extends BaseProcessingContext
+ {
+ protected Map<String,String> namespaces;
+ protected String canonical = null;
+
+ public WikiGetNamespacesNsContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+ Map<String,String> namespaces)
+ {
+ super(theStream,namespaceURI,localName,qName,atts);
+ this.namespaces = namespaces;
+ }
+
+ protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ if (qName.equals("ns"))
+ {
+ canonical = atts.getValue("canonical");
+ return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ }
+ return super.beginTag(namespaceURI,localName,qName,atts);
+ }
+
+ protected void endTag()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ XMLContext theContext = theStream.getContext();
+ String theTag = theContext.getQname();
+ if (theTag.equals("ns"))
+ {
+ // Pull down the data. NOTE(review): the sample ns id="0" has no "canonical" attribute, so the key may be null here - confirm the map tolerates that.
+ XMLStringContext sc = (XMLStringContext)theContext;
+ namespaces.put(canonical,sc.getValue());
+ }
+ else
+ super.endTag();
+ }
+
+ protected void tagCleanup()
+ throws ManifoldCFException
+ {
+ }
+
+ }
+
// -- Methods and classes to perform a "get Docinfo" operation. --
/** Get document info and index the document.