You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/10/20 02:44:52 UTC

svn commit: r1186588 - /incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java

Author: kwright
Date: Thu Oct 20 00:44:51 2011
New Revision: 1186588

URL: http://svn.apache.org/viewvc?rev=1186588&view=rev
Log:
Put in all the pieces for namespaces and prefix restrictions.

Modified:
    incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java

Modified: incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1186588&r1=1186587&r2=1186588&view=diff
==============================================================================
--- incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ incubator/lcf/branches/CONNECTORS-277/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Thu Oct 20 00:44:51 2011
@@ -220,7 +220,7 @@ public class WikiConnector extends org.a
     long startTime, long endTime)
     throws ManifoldCFException, ServiceInterruption
   {
-    listAllPages(activities,startTime,endTime);
+    listAllPages(activities,null,null,startTime,endTime);
   }
 
   /** Get document versions given an array of document identifiers.
@@ -901,7 +901,7 @@ public class WikiConnector extends org.a
   /** Perform a series of listPages() operations, so that we fully obtain the documents we're looking for even though
   * we're limited to 500 of them per request.
   */
-  protected void listAllPages(ISeedingActivity activities, long startTime, long endTime)
+  protected void listAllPages(ISeedingActivity activities, String namespace, String prefix, long startTime, long endTime)
     throws ManifoldCFException, ServiceInterruption
   {
     getSession();
@@ -911,7 +911,7 @@ public class WikiConnector extends org.a
       activities.checkJobStillActive();
       
       // Start with the last title seen in the previous round. 
-      String newLastTitle = executeListPagesViaThread(lastTitle,activities);
+      String newLastTitle = executeListPagesViaThread(lastTitle,namespace,prefix,activities);
       if (newLastTitle == null)
         break;
       lastTitle = newLastTitle;
@@ -919,11 +919,11 @@ public class WikiConnector extends org.a
   }
   
   /** Execute a listPages() operation via a thread.  Returns the last page title. */
-  protected String executeListPagesViaThread(String startPageTitle, ISeedingActivity activities)
+  protected String executeListPagesViaThread(String startPageTitle, String namespace, String prefix, ISeedingActivity activities)
     throws ManifoldCFException, ServiceInterruption
   {
     HttpClient client = getInitializedClient();
-    HttpMethodBase executeMethod = getInitializedMethod(getListPagesURL(startPageTitle));
+    HttpMethodBase executeMethod = getInitializedMethod(getListPagesURL(startPageTitle,namespace,prefix));
     try
     {
       PageBuffer pageBuffer = new PageBuffer();
@@ -1037,12 +1037,14 @@ public class WikiConnector extends org.a
 
   /** Create a URL to obtain the next 500 pages.
   */
-  protected String getListPagesURL(String startingTitle)
+  protected String getListPagesURL(String startingTitle, String namespace, String prefix)
     throws ManifoldCFException
   {
     try
     {
       return baseURL + "action=query&list=allpages" +
+        ((prefix != null)?"&apprefix="+URLEncoder.encode(prefix,"utf-8"):"") +
+        ((namespace != null)?"&apnamespace="+URLEncoder.encode(namespace,"utf-8"):"") +
         ((startingTitle!=null)?"&apfrom="+URLEncoder.encode(startingTitle,"utf-8"):"") +
         "&aplimit=500";
     }
@@ -1997,7 +1999,328 @@ public class WikiConnector extends org.a
       return timestamp;
     }
   }
+  
+  // -- Methods and classes to perform a "get namespaces" operation. --
+  
+  /** Obtain the set of namespaces, adding each one to the supplied map.  The map is keyed
+  * by the canonical namespace name, and the value is the descriptive name.
+  */
+  protected void getNamespaces(Map<String,String> namespaces)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    getSession();
+    HttpClient client = getInitializedClient();
+    HttpMethodBase executeMethod = getInitializedMethod(getGetNamespacesURL());
+    
+    try
+    {
+      ExecuteGetNamespacesThread t = new ExecuteGetNamespacesThread(client,executeMethod,namespaces);
+      try
+      {
+        t.start();
+        t.join();
+        
+        Throwable thr = t.getException();
+        if (thr != null)
+        {
+          if (thr instanceof ManifoldCFException)
+          {
+            if (((ManifoldCFException)thr).getErrorCode() == ManifoldCFException.INTERRUPTED)
+              throw new InterruptedException(thr.getMessage());
+            throw (ManifoldCFException)thr;
+          }
+          else if (thr instanceof ServiceInterruption)
+            throw (ServiceInterruption)thr;
+          else if (thr instanceof IOException)
+            throw (IOException)thr;
+          else if (thr instanceof RuntimeException)
+            throw (RuntimeException)thr;
+          else
+            throw (Error)thr;
+        }
+ 
+      }
+      catch (ManifoldCFException e)
+      {
+        t.interrupt();
+        throw e;
+      }
+      catch (ServiceInterruption e)
+      {
+        t.interrupt();
+        throw e;
+      }
+      catch (IOException e)
+      {
+        t.interrupt();
+        throw e;
+      }
+      catch (InterruptedException e)
+      {
+        t.interrupt();
+        // We need the caller to abandon any connections left around, so rethrow in a way that forces them to process the event properly.
+        throw e;
+      }
+    }
+    catch (InterruptedException e)
+    {
+      // Drop the connection on the floor
+      executeMethod = null;
+      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+    }
+    catch (ManifoldCFException e)
+    {
+      if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+        // Drop the connection on the floor
+        executeMethod = null;
+      throw e;
+    }
+    catch (java.net.SocketTimeoutException e)
+    {
+      long currentTime = System.currentTimeMillis();
+      throw new ServiceInterruption("Get namespaces timed out reading from the Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+    }
+    catch (java.net.SocketException e)
+    {
+      long currentTime = System.currentTimeMillis();
+      throw new ServiceInterruption("Get namespaces received a socket error reading from Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+    }
+    catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+    {
+      long currentTime = System.currentTimeMillis();
+      throw new ServiceInterruption("Get namespaces connection timed out reading from Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+    }
+    catch (InterruptedIOException e)
+    {
+      executeMethod = null;
+      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+    }
+    catch (IOException e)
+    {
+      throw new ManifoldCFException("Get namespaces had an IO failure: "+e.getMessage(),e);
+    }
+    finally
+    {
+      if (executeMethod != null)
+        executeMethod.releaseConnection();
+    }
+  }
+  
+  /** Thread to execute a "get namespaces" operation.  This thread both executes the operation and parses the result. */
+  protected static class ExecuteGetNamespacesThread extends Thread
+  {
+    protected HttpClient client;
+    protected HttpMethodBase executeMethod;
+    protected Throwable exception = null;
+    protected Map<String,String> namespaces;
+
+    public ExecuteGetNamespacesThread(HttpClient client, HttpMethodBase executeMethod, Map<String,String> namespaces)
+    {
+      super();
+      setDaemon(true);
+      this.client = client;
+      this.executeMethod = executeMethod;
+      this.namespaces = namespaces;
+    }
+
+    public void run()
+    {
+      try
+      {
+        // Call the execute method appropriately
+        int rval = client.executeMethod(executeMethod);
+        if (rval != 200)
+        {
+          throw new ManifoldCFException("Unexpected response code "+rval+": "+executeMethod.getResponseBodyAsString());
+        }
+        // Read response and make sure it's valid
+        InputStream is = executeMethod.getResponseBodyAsStream();
+        try
+        {
+          // Parse the document.  This will cause various things to occur, within the instantiated XMLContext class.
+          //<api>
+          //  <query>
+          //    <namespaces>
+          //      <ns id="-2" case="first-letter" canonical="Media" xml:space="preserve">Media</ns>
+          //      <ns id="-1" case="first-letter" canonical="Special" xml:space="preserve">Special</ns>
+          //      <ns id="0" case="first-letter" subpages="" content="" xml:space="preserve" />
+          //      <ns id="1" case="first-letter" subpages="" canonical="Talk" xml:space="preserve">Talk</ns>
+          //      <ns id="2" case="first-letter" subpages="" canonical="User" xml:space="preserve">User</ns>
+          //      <ns id="90" case="first-letter" canonical="Thread" xml:space="preserve">Thread</ns>
+          //      <ns id="91" case="first-letter" canonical="Thread talk" xml:space="preserve">Thread talk</ns>
+          //    </namespaces>
+          //  </query>
+          //</api>
+          XMLStream x = new XMLStream();
+          WikiGetNamespacesAPIContext c = new WikiGetNamespacesAPIContext(x,namespaces);
+          x.setContext(c);
+          try
+          {
+            try
+            {
+              x.parse(is);
+            }
+            catch (IOException e)
+            {
+              long time = System.currentTimeMillis();
+              throw new ServiceInterruption(e.getMessage(),e,time + 300000L,time + 12L * 60000L,-1,false);
+            }
+          }
+          finally
+          {
+            x.cleanup();
+          }
+        }
+        finally
+        {
+          try
+          {
+            is.close();
+          }
+          catch (IllegalStateException e)
+          {
+            // Ignore this error
+          }
+        }
+      }
+      catch (Throwable e)
+      {
+        this.exception = e;
+      }
+    }
+
+    public Throwable getException()
+    {
+      return exception;
+    }
+
+  }
+
+  /** Create a URL to obtain the namespaces.
+  */
+  protected String getGetNamespacesURL()
+    throws ManifoldCFException
+  {
+    return baseURL + "action=query&meta=siteinfo&siprop=namespaces";
+  }
+
+  /** Class representing the "api" context of a "get namespaces" response */
+  protected static class WikiGetNamespacesAPIContext extends SingleLevelContext
+  {
+    protected Map<String,String> namespaces;
+    
+    public WikiGetNamespacesAPIContext(XMLStream theStream, Map<String,String> namespaces)
+    {
+      super(theStream,"api");
+      this.namespaces = namespaces;
+    }
+
+    protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+    {
+      return new WikiGetNamespacesQueryContext(theStream,namespaceURI,localName,qName,atts,namespaces);
+    }
+    
+    protected void finishChild(BaseProcessingContext child)
+      throws ManifoldCFException
+    {
+    }
+
+  }
+
+  /** Class representing the "api/query" context of a "get namespaces" response */
+  protected static class WikiGetNamespacesQueryContext extends SingleLevelContext
+  {
+    protected Map<String,String> namespaces;
     
+    public WikiGetNamespacesQueryContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+      Map<String,String> namespaces)
+    {
+      super(theStream,namespaceURI,localName,qName,atts,"query");
+      this.namespaces = namespaces;
+    }
+
+    protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+    {
+      return new WikiGetNamespacesNamespacesContext(theStream,namespaceURI,localName,qName,atts,namespaces);
+    }
+
+    protected void finishChild(BaseProcessingContext child)
+      throws ManifoldCFException
+    {
+    }
+    
+  }
+
+  /** Class representing the "api/query/namespaces" context of a "get namespaces" response */
+  protected static class WikiGetNamespacesNamespacesContext extends SingleLevelContext
+  {
+    protected Map<String,String> namespaces;
+    
+    public WikiGetNamespacesNamespacesContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+      Map<String,String> namespaces)
+    {
+      super(theStream,namespaceURI,localName,qName,atts,"namespaces");
+      this.namespaces = namespaces;
+    }
+
+    protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+    {
+      return new WikiGetNamespacesNsContext(theStream,namespaceURI,localName,qName,atts,namespaces);
+    }
+
+    protected void finishChild(BaseProcessingContext child)
+      throws ManifoldCFException
+    {
+    }
+    
+  }
+
+  /** Class representing the "api/query/namespaces/ns" context of a "get namespaces" response */
+  protected static class WikiGetNamespacesNsContext extends BaseProcessingContext
+  {
+    protected Map<String,String> namespaces;
+    protected String canonical = null;
+    
+    public WikiGetNamespacesNsContext(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts,
+      Map<String,String> namespaces)
+    {
+      super(theStream,namespaceURI,localName,qName,atts);
+      this.namespaces = namespaces;
+    }
+
+    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      if (qName.equals("ns"))
+      {
+        canonical = atts.getValue("canonical");
+        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+      }
+      return super.beginTag(namespaceURI,localName,qName,atts);
+    }
+    
+    protected void endTag()
+      throws ManifoldCFException, ServiceInterruption
+    {
+      XMLContext theContext = theStream.getContext();
+      String theTag = theContext.getQname();
+      if (theTag.equals("ns"))
+      {
+        // Pull down the data
+        XMLStringContext sc = (XMLStringContext)theContext;
+        namespaces.put(canonical,sc.getValue());
+      }
+      else
+        super.endTag();
+    }
+
+    protected void tagCleanup()
+      throws ManifoldCFException
+    {
+    }
+   
+  }
+  
   // -- Methods and classes to perform a "get Docinfo" operation. --
 
   /** Get document info and index the document.