You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/10/06 03:39:14 UTC
svn commit: r1179486 - in
/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki:
PageBuffer.java WikiConnector.java
Author: kwright
Date: Thu Oct 6 01:39:13 2011
New Revision: 1179486
URL: http://svn.apache.org/viewvc?rev=1179486&view=rev
Log:
Hook up ability to parse AllPages requests.
Added:
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java (with props)
Modified:
incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Added: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java?rev=1179486&view=auto
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java (added)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java Thu Oct 6 01:39:13 2011
@@ -0,0 +1,92 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.wiki;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** Thread-safe class that functions as a limited-size buffer of pageIDs.
+ * Every public method is synchronized on this object.  A producer calls add()
+ * for each page id and then signalDone(); a consumer calls fetch() until it
+ * returns null.
+ */
+public class PageBuffer
+{
+  /** Maximum number of page ids held at once; add() blocks while this many are buffered. */
+  protected static int MAX_SIZE = 1024;
+
+  /** The buffered page ids. */
+  protected List<String> buffer = new ArrayList<String>(MAX_SIZE);
+
+  /** Set to true by signalDone(); after that, fetch() no longer waits for more ids. */
+  protected boolean complete = false;
+
+  /** Constructor */
+  public PageBuffer()
+  {
+  }
+
+  /** Add a page id to the buffer, and block if the buffer is full.
+   *@param pageID is the page id to add.  It should not be null, because fetch()
+   * uses a null return value to signal completion.
+   *@throws ManifoldCFException with the INTERRUPTED code if the thread is
+   * interrupted while waiting for room.
+   */
+  public synchronized void add(String pageID)
+    throws ManifoldCFException
+  {
+    try
+    {
+      // Re-check after every wakeup: notifyAll() wakes producers and consumers alike.
+      // Use >= rather than == so the producer still blocks should the size ever
+      // exceed the limit (MAX_SIZE is not final).
+      while (buffer.size() >= MAX_SIZE)
+      {
+        wait();
+      }
+    }
+    catch (InterruptedException e)
+    {
+      throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+    }
+    buffer.add(pageID);
+    // Notify threads that are waiting on there being stuff in the queue
+    notifyAll();
+  }
+
+  /** Signal that the operation is complete, and that no more pageIDs
+   * will be added.
+   */
+  public synchronized void signalDone()
+  {
+    complete = true;
+    // Notify threads that are waiting for stuff to appear, because it won't
+    notifyAll();
+  }
+
+  /** Pull an id off the buffer, and wait if there's more to come.
+   * The most recently added id is returned first.
+   *@return a page id, or null if the buffer is empty and signalDone() has been called.
+   *@throws ManifoldCFException with the INTERRUPTED code if the thread is
+   * interrupted while waiting for an id.
+   */
+  public synchronized String fetch()
+    throws ManifoldCFException
+  {
+    try
+    {
+      while (buffer.isEmpty() && !complete)
+      {
+        wait();
+      }
+    }
+    catch (InterruptedException e)
+    {
+      throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+    }
+    // Still empty after waiting means the producer has signalled completion.
+    if (buffer.isEmpty())
+      return null;
+    String rval = buffer.remove(buffer.size()-1);
+    // Notify those threads waiting on buffer being not completely full to wake
+    notifyAll();
+    return rval;
+  }
+
+}
Propchange: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/PageBuffer.java
------------------------------------------------------------------------------
svn:keywords = Id
Modified: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1179486&r1=1179485&r2=1179486&view=diff
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Thu Oct 6 01:39:13 2011
@@ -779,6 +779,199 @@ public class WikiConnector extends org.a
}
}
+ /** Parse list output, e.g.:
+ * <api xmlns="http://www.mediawiki.org/xml/api/">
+ * <query>
+ * <allpages>
+ * <p pageid="19839654" ns="0" title="Kre'fey" />
+ * <p pageid="30955295" ns="0" title="Kre-O" />
+ * <p pageid="14773725" ns="0" title="Kre8tiveworkz" />
+ * <p pageid="19219017" ns="0" title="Kre M'Baye" />
+ * <p pageid="19319577" ns="0" title="Kre Mbaye" />
+ * </allpages>
+ * </query>
+ * <query-continue>
+ * <allpages apfrom="Krea" />
+ * </query-continue>
+ * </api>
+ *@param is is the stream containing the response to parse.
+ *@param buffer is the page buffer handed to the parsing contexts, which add
+ * discovered page ids to it.
+ *@return the last title reported by the parsing context, or null if the
+ * response was unparseable.
+ *@throws ServiceInterruption if reading the stream fails with an IOException.
+ *@throws ManifoldCFException for any error other than an XML parsing error.
+ */
+ protected static String parseListPagesResponse(InputStream is, PageBuffer buffer)
+   throws ManifoldCFException, ServiceInterruption
+ {
+   // Parse the document. This will cause various things to occur, within the instantiated XMLContext class.
+   XMLStream x = new XMLStream();
+   WikiListPagesAPIContext c = new WikiListPagesAPIContext(x,buffer);
+   x.setContext(c);
+   try
+   {
+     try
+     {
+       x.parse(is);
+       return c.getLastTitle();
+     }
+     catch (IOException e)
+     {
+       long time = System.currentTimeMillis();
+       throw new ServiceInterruption(e.getMessage(),e,time + 300000L,time + 12L * 60000L,-1,false);
+     }
+     catch (ManifoldCFException e)
+     {
+       // Ignore XML parsing errors.  The message may be null (for instance,
+       // PageBuffer.add() builds its exception from InterruptedException.getMessage()),
+       // so check before searching it; otherwise the real error would be
+       // replaced by a NullPointerException.
+       String message = e.getMessage();
+       if (message != null && message.indexOf("pars") >= 0)
+       {
+         if (Logging.connectors.isDebugEnabled())
+           Logging.connectors.debug("Wiki: listPages() response was unparseable ("+message+"), skipping");
+         return null;
+       }
+       throw e;
+     }
+   }
+   finally
+   {
+     // Always release the stream's resources, whether or not parsing succeeded.
+     x.cleanup();
+   }
+ }
+
+ /** Context for the "api" element of a "list all pages" response.
+ * createChild() supplies a WikiListPagesQueryContext, and finishChild()
+ * copies that child's last title into this context.
+ */
+ protected static class WikiListPagesAPIContext extends SingleLevelContext
+ {
+   /** Buffer passed along to every child context created here. */
+   protected PageBuffer buffer;
+   /** Last title reported by a finished child, or null if there has been none. */
+   protected String lastTitle = null;
+
+   public WikiListPagesAPIContext(XMLStream theStream, PageBuffer buffer)
+   {
+     super(theStream,"api");
+     this.buffer = buffer;
+   }
+
+   /** @return the last title recorded by finishChild(), or null. */
+   public String getLastTitle()
+   {
+     return this.lastTitle;
+   }
+
+   protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+   {
+     BaseProcessingContext queryContext = new WikiListPagesQueryContext(theStream,this.buffer);
+     return queryContext;
+   }
+
+   protected void finishChild(BaseProcessingContext child)
+   {
+     WikiListPagesQueryContext queryContext = (WikiListPagesQueryContext)child;
+     this.lastTitle = queryContext.getLastTitle();
+   }
+
+   protected void process()
+     throws ManifoldCFException
+   {
+     // Nothing to do at this level.
+   }
+
+ }
+
+ /** Context for the "api/query" element of a "list all pages" response.
+ * createChild() supplies a WikiListPagesAllPagesContext, and finishChild()
+ * copies that child's last title into this context.
+ */
+ protected static class WikiListPagesQueryContext extends SingleLevelContext
+ {
+   /** Buffer passed along to every child context created here. */
+   protected PageBuffer buffer;
+   /** Last title reported by a finished child, or null if there has been none. */
+   protected String lastTitle = null;
+
+   public WikiListPagesQueryContext(XMLStream theStream, PageBuffer buffer)
+   {
+     super(theStream,"query");
+     this.buffer = buffer;
+   }
+
+   /** @return the last title recorded by finishChild(), or null. */
+   public String getLastTitle()
+   {
+     return this.lastTitle;
+   }
+
+   protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+   {
+     BaseProcessingContext allPagesContext = new WikiListPagesAllPagesContext(theStream,this.buffer);
+     return allPagesContext;
+   }
+
+   protected void finishChild(BaseProcessingContext child)
+   {
+     WikiListPagesAllPagesContext allPagesContext = (WikiListPagesAllPagesContext)child;
+     this.lastTitle = allPagesContext.getLastTitle();
+   }
+
+   protected void process()
+     throws ManifoldCFException
+   {
+     // Nothing to do at this level.
+   }
+
+ }
+
+ /** Context for the "api/query/allpages" element of a "list all pages" response.
+ * createChild() supplies a WikiListPagesPContext to look for <p> records, and
+ * finishChild() copies that child's last title into this context.
+ */
+ protected static class WikiListPagesAllPagesContext extends SingleLevelContext
+ {
+   /** Buffer passed along to every child context created here. */
+   protected PageBuffer buffer;
+   /** Last title reported by a finished child, or null if there has been none. */
+   protected String lastTitle = null;
+
+   public WikiListPagesAllPagesContext(XMLStream theStream, PageBuffer buffer)
+   {
+     super(theStream,"allpages");
+     this.buffer = buffer;
+   }
+
+   /** @return the last title recorded by finishChild(), or null. */
+   public String getLastTitle()
+   {
+     return this.lastTitle;
+   }
+
+   protected BaseProcessingContext createChild(String namespaceURI, String localName, String qName, Attributes atts)
+   {
+     // Inside allpages, the <p> records are what we are after.
+     BaseProcessingContext recordContext = new WikiListPagesPContext(theStream,this.buffer);
+     return recordContext;
+   }
+
+   protected void finishChild(BaseProcessingContext child)
+   {
+     // Pick up the title of the last <p> record the child saw.
+     WikiListPagesPContext recordContext = (WikiListPagesPContext)child;
+     this.lastTitle = recordContext.getLastTitle();
+   }
+
+   protected void process()
+     throws ManifoldCFException
+   {
+     // Nothing to do at this level.
+   }
+
+ }
+
+ /** Class representing the "api/query/allpages/p" context of a "list all pages" response.
+ * For every "p" tag seen, the "title" attribute is remembered as the last title,
+ * and the "pageid" attribute (when present) is added to the page buffer.
+ */
+ protected static class WikiListPagesPContext extends BaseProcessingContext
+ {
+   /** Value of the "title" attribute on the most recent "p" tag, or null. */
+   protected String lastTitle = null;
+   /** Buffer that receives each discovered page id. */
+   protected PageBuffer buffer;
+
+   public WikiListPagesPContext(XMLStream theStream, PageBuffer buffer)
+   {
+     super(theStream);
+     this.buffer = buffer;
+   }
+
+   /** Record the title and queue the page id of each "p" tag, then defer to the superclass.
+    * Note that buffer.add() may block while the buffer is full.
+    */
+   protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+     throws ManifoldCFException, ServiceInterruption
+   {
+     if (qName.equals("p"))
+     {
+       lastTitle = atts.getValue("title");
+       String pageID = atts.getValue("pageid");
+       // Add the discovered page id to the page buffer.  A record with no pageid
+       // attribute is skipped: a null entry in the buffer would be indistinguishable
+       // from the null that PageBuffer.fetch() returns to signal completion.
+       if (pageID != null)
+         buffer.add(pageID);
+     }
+     return super.beginTag(namespaceURI,localName,qName,atts);
+   }
+
+   protected void process()
+     throws ManifoldCFException
+   {
+     // Nothing to do; all the work happens in beginTag().
+   }
+
+   /** @return the title of the most recent "p" tag seen, or null. */
+   public String getLastTitle()
+   {
+     return lastTitle;
+   }
+ }
+
/** Create a URL to obtain a page's browse URL, given the page ID.
*/
protected String getGetURLURL(String documentIdentifier)