You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/10/06 03:39:14 UTC

svn commit: r1179486 - in /incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki: PageBuffer.java WikiConnector.java

Author: kwright
Date: Thu Oct  6 01:39:13 2011
New Revision: 1179486

URL: http://svn.apache.org/viewvc?rev=1179486&view=rev
Log:
Hook up ability to parse AllPages requests.

Added:
    incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java   (with props)
Modified:
    incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java

Added: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java?rev=1179486&view=auto
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java (added)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java Thu Oct  6 01:39:13 2011
@@ -0,0 +1,92 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.wiki;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** Thread-safe class that functions as a limited-size buffer of page IDs */
+public class PageBuffer
+{
+  protected static int MAX_SIZE = 1024; // add() blocks once this many IDs are buffered
+  
+  protected List<String> buffer = new ArrayList<String>(MAX_SIZE);
+  
+  protected boolean complete = false; // set by signalDone()
+  
+  /** Constructor */
+  public PageBuffer()
+  {
+  }
+  
+  /** Add a page ID to the buffer, blocking while the buffer is full.
+  *@param pageID is the page ID to add; should not be null, because fetch() returns null to mean "complete".
+  *@throws ManifoldCFException of type INTERRUPTED if the thread is interrupted while waiting.
+  */
+  public synchronized void add(String pageID)
+    throws ManifoldCFException
+  {
+    try
+    {
+      while (buffer.size() == MAX_SIZE)
+        wait();
+    }
+    catch (InterruptedException e)
+    {
+      // InterruptedException usually carries no message; prefix one so ours is never null
+      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+    }
+    buffer.add(pageID);
+    // Notify threads that are waiting on there being stuff in the queue
+    notifyAll();
+  }
+  
+  /** Signal that the operation is complete, and that no more page IDs will be added. */
+  public synchronized void signalDone()
+  {
+    complete = true;
+    // Notify threads that are waiting for stuff to appear, because it won't
+    notifyAll();
+  }
+  
+  /** Pull a page ID off the buffer, waiting while it is empty and more may still come.
+  * The most recently added ID is returned first.
+  *@return the page ID, or null if the buffer is empty and signalDone() has been called.
+  *@throws ManifoldCFException of type INTERRUPTED if the thread is interrupted while waiting.
+  */
+  public synchronized String fetch()
+    throws ManifoldCFException
+  {
+    try
+    {
+      while (buffer.isEmpty() && !complete)
+        wait();
+    }
+    catch (InterruptedException e)
+    {
+      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+    }
+    if (buffer.isEmpty())
+      return null;
+    String rval = buffer.remove(buffer.size()-1);
+    // Wake any threads blocked in add() waiting for room
+    notifyAll();
+    return rval;
+  }
+}

Propchange: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1179486&r1=1179485&r2=1179486&view=diff
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Thu Oct  6 01:39:13 2011
@@ -779,6 +779,199 @@ public class WikiConnector extends org.a
     }
   }
 
+  /** Parse an "allpages" list response, adding each page ID found to the buffer.  Example response:
+  * <api xmlns="http://www.mediawiki.org/xml/api/">
+  *   <query>
+  *     <allpages>
+  *       <p pageid="19839654" ns="0" title="Kre&#039;fey" />
+  *       <p pageid="30955295" ns="0" title="Kre-O" />
+  *       <p pageid="14773725" ns="0" title="Kre8tiveworkz" />
+  *       <p pageid="19219017" ns="0" title="Kre M&#039;Baye" />
+  *       <p pageid="19319577" ns="0" title="Kre Mbaye" />
+  *     </allpages>
+  *   </query>
+  *   <query-continue>
+  *     <allpages apfrom="Krea" />
+  *   </query-continue>
+  * </api>
+  *@param is is the stream to read the XML response from.
+  *@param buffer receives the page IDs discovered during parsing.
+  *@return the last page title recorded by the parse, or null if there was none or the response was unparseable.
+  */
+  protected static String parseListPagesResponse(InputStream is, PageBuffer buffer)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // Parse the document.  This will cause various things to occur, within the instantiated XMLContext class.
+    XMLStream x = new XMLStream();
+    WikiListPagesAPIContext c = new WikiListPagesAPIContext(x,buffer);
+    x.setContext(c);
+    try
+    {
+      x.parse(is);
+      return c.getLastTitle();
+    }
+    catch (IOException e)
+    {
+      long time = System.currentTimeMillis();
+      throw new ServiceInterruption(e.getMessage(),e,time + 300000L,time + 12L * 60000L,-1,false);
+    }
+    catch (ManifoldCFException e)
+    {
+      // Ignore XML parsing errors; an exception with a null message is not one and is rethrown.
+      if (e.getMessage() != null && e.getMessage().indexOf("pars") >= 0)
+      {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("Wiki: listPages() response was unparseable ("+e.getMessage()+"), skipping");
+        return null;
+      }
+      throw e;
+    }
+    finally
+    {
+      x.cleanup();
+    }
+  }
+
+  /** Class representing the "api" level of a "list all pages" response; keeps the last title its child reported */
+  protected static class WikiListPagesAPIContext extends SingleLevelContext
+  {
+    protected PageBuffer buffer;
+    protected String lastTitle = null;
+
+    public WikiListPagesAPIContext(XMLStream theStream, PageBuffer buffer)
+    {
+      super(theStream,"api");
+      this.buffer = buffer;
+    }
+
+    public String getLastTitle()
+    {
+      return this.lastTitle;
+    }
+
+    protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+    {
+      return new WikiListPagesQueryContext(theStream,this.buffer);
+    }
+
+    protected void finishChild(BaseProcessingContext child)
+    {
+      WikiListPagesQueryContext queryContext = (WikiListPagesQueryContext)child;
+      this.lastTitle = queryContext.getLastTitle();
+    }
+
+    protected void process()
+      throws ManifoldCFException
+    {
+    }
+  }
+
+  /** Class representing the "api/query" level of a "list all pages" response; keeps the last title its child reported */
+  protected static class WikiListPagesQueryContext extends SingleLevelContext
+  {
+    protected PageBuffer buffer;
+    protected String lastTitle = null;
+
+    public WikiListPagesQueryContext(XMLStream theStream, PageBuffer buffer)
+    {
+      super(theStream,"query");
+      this.buffer = buffer;
+    }
+
+    public String getLastTitle()
+    {
+      return this.lastTitle;
+    }
+
+    protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+    {
+      return new WikiListPagesAllPagesContext(theStream,this.buffer);
+    }
+
+    protected void finishChild(BaseProcessingContext child)
+    {
+      WikiListPagesAllPagesContext allPagesContext = (WikiListPagesAllPagesContext)child;
+      this.lastTitle = allPagesContext.getLastTitle();
+    }
+
+    protected void process()
+      throws ManifoldCFException
+    {
+    }
+  }
+
+  /** Class recognizing the "api/query/allpages" level of a "list all pages" response */
+  protected static class WikiListPagesAllPagesContext extends SingleLevelContext
+  {
+    protected PageBuffer buffer;
+    protected String lastTitle = null;
+
+    public WikiListPagesAllPagesContext(XMLStream theStream, PageBuffer buffer)
+    {
+      super(theStream,"allpages");
+      this.buffer = buffer;
+    }
+
+    public String getLastTitle()
+    {
+      return this.lastTitle;
+    }
+
+    protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+    {
+      // Inside allpages, the <p> records are handled by their own context, which shares our buffer.
+      return new WikiListPagesPContext(theStream,this.buffer);
+    }
+
+    protected void finishChild(BaseProcessingContext child)
+    {
+      // Take over the last title seen by the <p> record context.
+      WikiListPagesPContext pContext = (WikiListPagesPContext)child;
+      this.lastTitle = pContext.getLastTitle();
+    }
+
+    protected void process()
+      throws ManifoldCFException
+    {
+    }
+  }
+  
+  /** Class representing the "api/query/allpages/p" context; buffers each "p" pageid and remembers its title */
+  protected static class WikiListPagesPContext extends BaseProcessingContext
+  {
+    protected String lastTitle = null;
+    protected PageBuffer buffer;
+    public WikiListPagesPContext(XMLStream theStream, PageBuffer buffer)
+    {
+      super(theStream);
+      this.buffer = buffer;
+    }
+
+    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      if (qName.equals("p"))
+      {
+        lastTitle = atts.getValue("title");
+        String pageID = atts.getValue("pageid");
+        // Buffer the page id, unless it is missing: PageBuffer.fetch() returns null to mean "complete"
+        if (pageID != null)
+          buffer.add(pageID);
+      }
+      return super.beginTag(namespaceURI,localName,qName,atts);
+    }
+
+    protected void process()
+      throws ManifoldCFException
+    {
+    }
+
+    public String getLastTitle()
+    {
+      return lastTitle;
+    }
+  }
+
   /** Create a URL to obtain a page's browse URL, given the page ID.
   */
   protected String getGetURLURL(String documentIdentifier)