You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/10 23:55:31 UTC
svn commit: r1444628 [1/2] - in /manifoldcf/trunk: ./ connectors/rss/
connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler...
Author: kwright
Date: Sun Feb 10 22:55:30 2013
New Revision: 1444628
URL: http://svn.apache.org/r1444628
Log:
Fix for CONNECTORS-633. Remove dependency on patched xerces parser, and instead use our own parser for fuzzy parsing of XML.
Added:
manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/
- copied from r1444626, manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/
Removed:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
Modified:
manifoldcf/trunk/ (props changed)
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/build.xml
manifoldcf/trunk/connectors/rss/ (props changed)
manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
manifoldcf/trunk/connectors/wiki/ (props changed)
manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
manifoldcf/trunk/mvn-bootstrap.bat
manifoldcf/trunk/mvn-bootstrap.sh
manifoldcf/trunk/pom.xml
Propchange: manifoldcf/trunk/
------------------------------------------------------------------------------
Merged /manifoldcf/branches/CONNECTORS-633:r1442813-1444626
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Sun Feb 10 22:55:30 2013
@@ -3,6 +3,11 @@ $Id$
======================= 1.2-dev =====================
+CONNECTORS-633: Remove dependency on custom version of xerces;
+extend the simple tag parser to be able to handle XML, and move it into
+core/fuzzyml for general use.
+(Karl Wright)
+
CONNECTORS-639: Maven execute of jetty-runner fails.
(Maciej Liżewski)
Modified: manifoldcf/trunk/build.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/build.xml?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/build.xml (original)
+++ manifoldcf/trunk/build.xml Sun Feb 10 22:55:30 2013
@@ -3153,29 +3153,12 @@ Use Apache Forrest version forrest-0.9-d
<target name="download-xerces">
<mkdir dir="lib"/>
- <!-- Download and build patched version of xerces 2.9.1 -->
- <mkdir dir="build/download"/>
- <delete dir="build/download/xerces2-j"/>
- <antcall target="checkout-source-via-svn">
- <param name="root-dir" value="build/download"/>
- <param name="svn-url" value="http://svn.apache.org/repos/asf/xerces/java/tags/Xerces-J_2_9_1"/>
- <param name="dir-name" value="xerces2-j"/>
- </antcall>
- <!-- Apply mcf-specific features and fixes patch -->
- <antcall target="patch-source">
- <param name="root-dir" value="build/download"/>
- <param name="diff-file" value="../../upstream-diffs/xerces2-j-2.9.1.mcf.patch"/>
- <param name="dir-name" value="xerces2-j"/>
- </antcall>
- <!-- Build it -->
- <exec dir="build/download/xerces2-j" executable="cmd" osfamily="windows" failifexecutionfails="true" failonerror="true">
- <arg line="/c build.bat jar"/>
- </exec>
- <exec dir="build/download/xerces2-j" executable="/bin/sh" osfamily="unix" failifexecutionfails="true" failonerror="true">
- <arg value="build.sh" />
- <arg value="jar" />
- </exec>
- <copy todir="lib" file="build/download/xerces2-j/build/xercesImpl.jar"/>
+ <property name="xerces-version" value="2.10.0"/>
+ <property name="xerces-package" value="xerces"/>
+ <antcall target="download-via-maven"><param name="project-path" value="${xerces-package}"/><param name="artifact-version" value="${xerces-version}"/><param name="target" value="lib"/>
+ <param name="artifact-name" value="xercesImpl"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
</target>
<target name="download-xalan">
Propchange: manifoldcf/trunk/connectors/rss/
------------------------------------------------------------------------------
Merged /manifoldcf/branches/CONNECTORS-633/connectors/rss:r1442813-1444626
Modified: manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Sun Feb 10 22:55:30 2013
@@ -24,13 +24,7 @@ import org.apache.manifoldcf.crawler.int
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
-import org.xml.sax.Attributes;
-
-import org.apache.manifoldcf.core.common.XMLDoc;
-import org.apache.manifoldcf.agents.common.XMLStream;
-import org.apache.manifoldcf.agents.common.XMLContext;
-import org.apache.manifoldcf.agents.common.XMLStringContext;
-import org.apache.manifoldcf.agents.common.XMLFileContext;
+import org.apache.manifoldcf.core.fuzzyml.*;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.client.RedirectException;
@@ -3473,31 +3467,15 @@ public class RSSConnector extends org.ap
}
try
{
- // Parse the document. This will cause various things to occur, within the instantiated XMLContext class.
- XMLStream x = new XMLStream();
+ Parser p = new Parser();
+ // Parse the document. This will cause various things to occur, within the instantiated XMLParsingContext class.
+ XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState();
OuterContextClass c = new OuterContextClass(x,documentIdentifier,activities,filter);
x.setContext(c);
try
{
- try
- {
- x.parse(is);
- }
- catch (ManifoldCFException e)
- {
- // Ignore XML parsing errors.
- if (e.getMessage().indexOf("pars") >= 0)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: XML document '"+documentIdentifier+"' was unparseable ("+e.getMessage()+"), skipping");
-
- c.setDefaultRescanTimeIfNeeded();
-
- return;
- }
- throw e;
-
- }
+ // Believe it or not, there are no parsing errors we can get back now.
+ p.parseWithCharsetDetection(null,is,x);
c.checkIfValidFeed();
c.setDefaultRescanTimeIfNeeded();
}
@@ -3531,7 +3509,7 @@ public class RSSConnector extends org.ap
}
/** This class handles the outermost XML context for the feed document. */
- protected class OuterContextClass extends XMLContext
+ protected class OuterContextClass extends XMLParsingContext
{
/** Keep track of the number of valid feed signals we saw */
protected int outerTagCount = 0;
@@ -3544,7 +3522,7 @@ public class RSSConnector extends org.ap
/** Flag indicating the the rescan time was set for this feed */
protected boolean rescanTimeSet = false;
- public OuterContextClass(XMLStream theStream, String documentIdentifier, IProcessActivity activities, Filter filter)
+ public OuterContextClass(XMLFuzzyHierarchicalParseState theStream, String documentIdentifier, IProcessActivity activities, Filter filter)
{
super(theStream);
this.documentIdentifier = documentIdentifier;
@@ -3585,8 +3563,9 @@ public class RSSConnector extends org.ap
}
/** Handle the tag beginning to set the correct second-level parsing context */
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
if (localName.equals("rss"))
{
@@ -3594,36 +3573,37 @@ public class RSSConnector extends org.ap
outerTagCount++;
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("RSS: Parsed bottom-level XML for RSS document '"+documentIdentifier+"'");
- return new RSSContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+ return new RSSContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
}
else if (localName.equals("RDF"))
{
// RDF/Atom feed detected
outerTagCount++;
- return new RDFContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+ return new RDFContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
}
else if (localName.equals("feed"))
{
// Basic feed detected
outerTagCount++;
- return new FeedContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+ return new FeedContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
}
else if (localName.equals("urlset") || localName.equals("sitemapindex"))
{
// Sitemap detected
outerTagCount++;
- return new UrlsetContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+ return new UrlsetContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
}
// The default action is to establish a new default context.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
/** Handle the tag ending */
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext context = theStream.getContext();
+ XMLParsingContext context = theStream.getContext();
String tagName = context.getLocalname();
if (tagName.equals("rss"))
{
@@ -3647,7 +3627,7 @@ public class RSSConnector extends org.ap
}
- protected class RSSContextClass extends XMLContext
+ protected class RSSContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentIdentifier;
@@ -3658,33 +3638,35 @@ public class RSSConnector extends org.ap
/** Rescan time set flag */
protected boolean rescanTimeSet = false;
- public RSSContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+ public RSSContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentIdentifier = documentIdentifier;
this.activities = activities;
this.filter = filter;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// Handle each channel
if (localName.equals("channel"))
{
// Channel detected
- return new RSSChannelContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+ return new RSSChannelContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
// If it's our channel tag, process global channel information
- XMLContext context = theStream.getContext();
+ XMLParsingContext context = theStream.getContext();
String tagName = context.getLocalname();
if (tagName.equals("channel"))
{
@@ -3703,7 +3685,7 @@ public class RSSConnector extends org.ap
}
- protected class RSSChannelContextClass extends XMLContext
+ protected class RSSChannelContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentIdentifier;
@@ -3715,40 +3697,42 @@ public class RSSConnector extends org.ap
/** TTL value is set on a per-channel basis */
protected String ttlValue = null;
- public RSSChannelContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+ public RSSChannelContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentIdentifier = documentIdentifier;
this.activities = activities;
this.filter = filter;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "ttl" and "item", nothing else.
if (localName.equals("ttl"))
{
// TTL value seen. Prepare to record it, as a string.
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("item"))
{
// Item seen. We don't need any of the attributes etc., but we need to start a new context.
- return new RSSItemContextClass(theStream,namespaceURI,localName,qName,atts,filter.getDechromedContentMode());
+ return new RSSItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("ttl"))
// If the current context must be the TTL one, record its data value.
- ttlValue = ((XMLStringContext)theContext).getValue();
+ ttlValue = ((XMLStringParsingContext)theContext).getValue();
else if (theTag.equals("item"))
{
// It's an item.
@@ -3813,7 +3797,7 @@ public class RSSConnector extends org.ap
}
}
- protected class RSSItemContextClass extends XMLContext
+ protected class RSSItemContextClass extends XMLParsingContext
{
protected int dechromedContentMode;
protected String guidField = null;
@@ -3824,40 +3808,41 @@ public class RSSConnector extends org.ap
protected ArrayList categoryField = new ArrayList();
protected File contentsFile = null;
- public RSSItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, int dechromedContentMode)
+ public RSSItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.dechromedContentMode = dechromedContentMode;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "ttl" and "item", nothing else.
if (localName.equals("link"))
{
// "link" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("guid"))
{
// "guid" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("pubDate"))
{
// "pubDate" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("title"))
{
// "title" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("category"))
{
// "category" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else
{
@@ -3869,7 +3854,7 @@ public class RSSConnector extends org.ap
case DECHROMED_NONE:
if (localName.equals("description"))
{
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
break;
case DECHROMED_DESCRIPTION:
@@ -3878,7 +3863,7 @@ public class RSSConnector extends org.ap
try
{
File tempFile = File.createTempFile("_rssdata_","tmp");
- return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+ return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
}
catch (java.net.SocketTimeoutException e)
{
@@ -3900,7 +3885,7 @@ public class RSSConnector extends org.ap
try
{
File tempFile = File.createTempFile("_rssdata_","tmp");
- return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+ return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
}
catch (java.net.SocketTimeoutException e)
{
@@ -3917,42 +3902,43 @@ public class RSSConnector extends org.ap
}
else if (localName.equals("description"))
{
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
break;
default:
break;
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
}
/** Convert the individual sub-fields of the item context into their final forms */
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("link"))
{
- linkField = ((XMLStringContext)theContext).getValue();
+ linkField = ((XMLStringParsingContext)theContext).getValue();
}
else if (theTag.equals("guid"))
{
- guidField = ((XMLStringContext)theContext).getValue();
+ guidField = ((XMLStringParsingContext)theContext).getValue();
}
else if (theTag.equals("pubDate"))
{
- pubDateField = ((XMLStringContext)theContext).getValue();
+ pubDateField = ((XMLStringParsingContext)theContext).getValue();
}
else if (theTag.equals("title"))
{
- titleField = ((XMLStringContext)theContext).getValue();
+ titleField = ((XMLStringParsingContext)theContext).getValue();
}
else if (theTag.equals("category"))
{
- categoryField.add(((XMLStringContext)theContext).getValue());
+ categoryField.add(((XMLStringParsingContext)theContext).getValue());
}
else
{
@@ -3964,7 +3950,7 @@ public class RSSConnector extends org.ap
case DECHROMED_NONE:
if (theTag.equals("description"))
{
- descriptionField = ((XMLStringContext)theContext).getValue();
+ descriptionField = ((XMLStringParsingContext)theContext).getValue();
}
break;
case DECHROMED_DESCRIPTION:
@@ -3972,7 +3958,7 @@ public class RSSConnector extends org.ap
{
// Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
tagCleanup();
- contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+ contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
return;
}
break;
@@ -3981,12 +3967,12 @@ public class RSSConnector extends org.ap
{
tagCleanup();
// Retrieve content file
- contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+ contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
return;
}
else if (theTag.equals("description"))
{
- descriptionField = ((XMLStringContext)theContext).getValue();
+ descriptionField = ((XMLStringParsingContext)theContext).getValue();
}
break;
default:
@@ -4139,7 +4125,7 @@ public class RSSConnector extends org.ap
}
}
- protected class RDFContextClass extends XMLContext
+ protected class RDFContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentIdentifier;
@@ -4151,40 +4137,42 @@ public class RSSConnector extends org.ap
/** ttl value */
protected String ttlValue = null;
- public RDFContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+ public RDFContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentIdentifier = documentIdentifier;
this.activities = activities;
this.filter = filter;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "ttl" and "item", nothing else.
if (localName.equals("ttl"))
{
// TTL value seen. Prepare to record it, as a string.
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("item"))
{
// Item seen. We don't need any of the attributes etc., but we need to start a new context.
- return new RDFItemContextClass(theStream,namespaceURI,localName,qName,atts,filter.getDechromedContentMode());
+ return new RDFItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("ttl"))
// If the current context must be the TTL one, record its data value.
- ttlValue = ((XMLStringContext)theContext).getValue();
+ ttlValue = ((XMLStringParsingContext)theContext).getValue();
else if (theTag.equals("item"))
{
// It's an item.
@@ -4249,7 +4237,7 @@ public class RSSConnector extends org.ap
}
}
- protected class RDFItemContextClass extends XMLContext
+ protected class RDFItemContextClass extends XMLParsingContext
{
protected int dechromedContentMode;
protected String linkField = null;
@@ -4258,30 +4246,31 @@ public class RSSConnector extends org.ap
protected String descriptionField = null;
protected File contentsFile = null;
- public RDFItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, int dechromedContentMode)
+ public RDFItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.dechromedContentMode = dechromedContentMode;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "ttl" and "item", nothing else.
if (localName.equals("link"))
{
// "link" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("date"))
{
// "dc:date" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("title"))
{
// "title" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else
{
@@ -4290,7 +4279,7 @@ public class RSSConnector extends org.ap
case DECHROMED_NONE:
if (localName.equals("description"))
{
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
break;
case DECHROMED_DESCRIPTION:
@@ -4299,7 +4288,7 @@ public class RSSConnector extends org.ap
try
{
File tempFile = File.createTempFile("_rssdata_","tmp");
- return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+ return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
}
catch (java.net.SocketTimeoutException e)
{
@@ -4321,7 +4310,7 @@ public class RSSConnector extends org.ap
try
{
File tempFile = File.createTempFile("_rssdata_","tmp");
- return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+ return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
}
catch (java.net.SocketTimeoutException e)
{
@@ -4338,34 +4327,35 @@ public class RSSConnector extends org.ap
}
else if (localName.equals("description"))
{
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
break;
default:
break;
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
}
/** Convert the individual sub-fields of the item context into their final forms */
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("link"))
{
- linkField = ((XMLStringContext)theContext).getValue();
+ linkField = ((XMLStringParsingContext)theContext).getValue();
}
else if (theTag.equals("date"))
{
- pubDateField = ((XMLStringContext)theContext).getValue();
+ pubDateField = ((XMLStringParsingContext)theContext).getValue();
}
else if (theTag.equals("title"))
{
- titleField = ((XMLStringContext)theContext).getValue();
+ titleField = ((XMLStringParsingContext)theContext).getValue();
}
else
{
@@ -4374,7 +4364,7 @@ public class RSSConnector extends org.ap
case DECHROMED_NONE:
if (theTag.equals("description"))
{
- descriptionField = ((XMLStringContext)theContext).getValue();
+ descriptionField = ((XMLStringParsingContext)theContext).getValue();
}
break;
case DECHROMED_DESCRIPTION:
@@ -4382,7 +4372,7 @@ public class RSSConnector extends org.ap
{
// Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
tagCleanup();
- contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+ contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
return;
}
break;
@@ -4391,12 +4381,12 @@ public class RSSConnector extends org.ap
{
// Retrieve content file
tagCleanup();
- contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+ contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
return;
}
else if (theTag.equals("description"))
{
- descriptionField = ((XMLStringContext)theContext).getValue();
+ descriptionField = ((XMLStringParsingContext)theContext).getValue();
}
break;
default:
@@ -4525,7 +4515,7 @@ public class RSSConnector extends org.ap
}
}
- protected class FeedContextClass extends XMLContext
+ protected class FeedContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentIdentifier;
@@ -4537,40 +4527,42 @@ public class RSSConnector extends org.ap
/** ttl value */
protected String ttlValue = null;
- public FeedContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+ public FeedContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentIdentifier = documentIdentifier;
this.activities = activities;
this.filter = filter;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "ttl" and "item", nothing else.
if (localName.equals("ttl"))
{
// TTL value seen. Prepare to record it, as a string.
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("entry"))
{
// Item seen. We don't need any of the attributes etc., but we need to start a new context.
- return new FeedItemContextClass(theStream,namespaceURI,localName,qName,atts,filter.getDechromedContentMode());
+ return new FeedItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("ttl"))
// The current context must be the TTL one; record its data value.
- ttlValue = ((XMLStringContext)theContext).getValue();
+ ttlValue = ((XMLStringParsingContext)theContext).getValue();
else if (theTag.equals("entry"))
{
// It's an item.
@@ -4635,7 +4627,7 @@ public class RSSConnector extends org.ap
}
}
- protected class FeedItemContextClass extends XMLContext
+ protected class FeedItemContextClass extends XMLParsingContext
{
protected int dechromedContentMode;
protected List<String> linkField = new ArrayList<String>();
@@ -4645,40 +4637,41 @@ public class RSSConnector extends org.ap
protected File contentsFile = null;
protected String descriptionField = null;
- public FeedItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, int dechromedContentMode)
+ public FeedItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.dechromedContentMode = dechromedContentMode;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "link", "published", "updated", "title", and "category", plus tags that depend on the dechromed content mode.
if (localName.equals("link"))
{
// "link" tag
- String ref = atts.getValue("href");
+ String ref = atts.get("href");
if (ref != null && ref.length() > 0)
linkField.add(ref);
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
else if (localName.equals("published") || localName.equals("updated"))
{
// "published" or "updated" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("title"))
{
// "title" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("category"))
{
- String category = atts.getValue("term");
+ String category = atts.get("term");
if (category != null && category.length() > 0)
categoryField.add(category);
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
else
{
@@ -4687,7 +4680,7 @@ public class RSSConnector extends org.ap
case DECHROMED_NONE:
if (localName.equals("subtitle"))
{
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
break;
case DECHROMED_DESCRIPTION:
@@ -4696,7 +4689,7 @@ public class RSSConnector extends org.ap
try
{
File tempFile = File.createTempFile("_rssdata_","tmp");
- return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+ return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
}
catch (java.net.SocketTimeoutException e)
{
@@ -4718,7 +4711,7 @@ public class RSSConnector extends org.ap
try
{
File tempFile = File.createTempFile("_rssdata_","tmp");
- return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+ return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
}
catch (java.net.SocketTimeoutException e)
{
@@ -4735,30 +4728,31 @@ public class RSSConnector extends org.ap
}
else if (localName.equals("subtitle"))
{
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
break;
default:
break;
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
}
/** Convert the individual sub-fields of the item context into their final forms */
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("published") || theTag.equals("updated"))
{
- pubDateField = ((XMLStringContext)theContext).getValue();
+ pubDateField = ((XMLStringParsingContext)theContext).getValue();
}
else if (theTag.equals("title"))
{
- titleField = ((XMLStringContext)theContext).getValue();
+ titleField = ((XMLStringParsingContext)theContext).getValue();
}
else
{
@@ -4767,7 +4761,7 @@ public class RSSConnector extends org.ap
case DECHROMED_NONE:
if (theTag.equals("subtitle"))
{
- titleField = ((XMLStringContext)theContext).getValue();
+ titleField = ((XMLStringParsingContext)theContext).getValue();
}
break;
case DECHROMED_DESCRIPTION:
@@ -4775,7 +4769,7 @@ public class RSSConnector extends org.ap
{
// Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
tagCleanup();
- contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+ contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
return;
}
break;
@@ -4784,12 +4778,12 @@ public class RSSConnector extends org.ap
{
// Retrieve content file
tagCleanup();
- contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+ contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
return;
}
else if (theTag.equals("subtitle"))
{
- titleField = ((XMLStringContext)theContext).getValue();
+ titleField = ((XMLStringParsingContext)theContext).getValue();
}
break;
default:
@@ -4936,7 +4930,7 @@ public class RSSConnector extends org.ap
}
}
- protected class UrlsetContextClass extends XMLContext
+ protected class UrlsetContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentIdentifier;
@@ -4948,31 +4942,33 @@ public class RSSConnector extends org.ap
/** ttl value */
protected String ttlValue = null;
- public UrlsetContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+ public UrlsetContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentIdentifier = documentIdentifier;
this.activities = activities;
this.filter = filter;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "url" and "sitemap", nothing else.
if (localName.equals("url") || localName.equals("sitemap"))
{
// Item seen. We don't need any of the attributes etc., but we need to start a new context.
- return new UrlsetItemContextClass(theStream,namespaceURI,localName,qName,atts);
+ return new UrlsetItemContextClass(theStream,namespace,localName,qName,atts);
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("url") || theTag.equals("sitemap"))
{
@@ -5038,50 +5034,52 @@ public class RSSConnector extends org.ap
}
}
- protected class UrlsetItemContextClass extends XMLContext
+ protected class UrlsetItemContextClass extends XMLParsingContext
{
protected String linkField = null;
protected String pubDateField = null;
- public UrlsetItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ public UrlsetItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "loc" and "lastmod", nothing else.
if (localName.equals("loc"))
{
// "loc" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("lastmod"))
{
// "lastmod" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else
{
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
}
/** Convert the individual sub-fields of the item context into their final forms */
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("loc"))
{
- linkField = ((XMLStringContext)theContext).getValue();
+ linkField = ((XMLStringParsingContext)theContext).getValue();
}
else if (theTag.equals("lastmod"))
{
- pubDateField = ((XMLStringContext)theContext).getValue();
+ pubDateField = ((XMLStringParsingContext)theContext).getValue();
}
else
{
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java Sun Feb 10 22:55:30 2013
@@ -19,6 +19,7 @@
package org.apache.manifoldcf.crawler.connectors.webcrawler;
import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
import java.util.*;
/** This class interprets the tag stream generated by the HTMLParseState class, and keeps track of the form tags. */
@@ -46,10 +47,11 @@ public class FormParseState extends Link
// Override methods having to do with notification of tag discovery
@Override
- protected void noteNonscriptTag(String tagName, Map attributes)
+ protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
throws ManifoldCFException
{
- super.noteNonscriptTag(tagName,attributes);
+ if (super.noteNonscriptTag(tagName,attributes))
+ return true;
switch (formParseState)
{
case FORMPARSESTATE_NORMAL:
@@ -125,13 +127,15 @@ public class FormParseState extends Link
default:
throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
}
+ return false;
}
@Override
- protected void noteNonscriptEndTag(String tagName)
+ protected boolean noteNonscriptEndTag(String tagName)
throws ManifoldCFException
{
- super.noteNonscriptEndTag(tagName);
+ if (super.noteNonscriptEndTag(tagName))
+ return true;
switch (formParseState)
{
case FORMPARSESTATE_NORMAL:
@@ -158,7 +162,7 @@ public class FormParseState extends Link
optionMap.put("type","select");
optionMap.put("name",selectName);
optionMap.put("multiple",selectMultiple);
- optionMap.put("value",htmlBodyDecode(optionValueText.toString()));
+ optionMap.put("value",optionValueText.toString());
optionMap.put("selected",optionSelected);
handler.noteFormInput(optionMap);
}
@@ -175,13 +179,15 @@ public class FormParseState extends Link
default:
throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
}
+ return false;
}
@Override
- protected void noteNormalCharacter(char thisChar)
+ protected boolean noteNormalCharacter(char thisChar)
throws ManifoldCFException
{
- super.noteNormalCharacter(thisChar);
+ if (super.noteNormalCharacter(thisChar))
+ return true;
if (formParseState == FORMPARSESTATE_IN_OPTION)
{
if (optionValueText != null)
@@ -189,6 +195,7 @@ public class FormParseState extends Link
}
else
handler.noteTextCharacter(thisChar);
+ return false;
}
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java Sun Feb 10 22:55:30 2013
@@ -19,6 +19,7 @@
package org.apache.manifoldcf.crawler.connectors.webcrawler;
import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
import java.util.*;
/** This class recognizes and interprets all links */
@@ -34,10 +35,11 @@ public class LinkParseState extends Meta
}
@Override
- protected void noteNonscriptTag(String tagName, Map attributes)
+ protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
throws ManifoldCFException
{
- super.noteNonscriptTag(tagName,attributes);
+ if (super.noteNonscriptTag(tagName,attributes))
+ return true;
if (tagName.equals("a"))
{
String hrefValue = (String)attributes.get("href");
@@ -62,6 +64,7 @@ public class LinkParseState extends Meta
if (srcValue != null && srcValue.length() > 0)
handler.noteFRAMESRC(srcValue);
}
+ return false;
}
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java Sun Feb 10 22:55:30 2013
@@ -19,6 +19,7 @@
package org.apache.manifoldcf.crawler.connectors.webcrawler;
import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
import java.util.*;
/** This class recognizes and interprets all meta tags */
@@ -33,14 +34,16 @@ public class MetaParseState extends Scri
}
@Override
- protected void noteNonscriptTag(String tagName, Map attributes)
+ protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
throws ManifoldCFException
{
- super.noteNonscriptTag(tagName,attributes);
+ if (super.noteNonscriptTag(tagName,attributes))
+ return true;
if (tagName.equals("meta"))
{
handler.noteMetaTag(attributes);
}
+ return false;
}
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java Sun Feb 10 22:55:30 2013
@@ -19,10 +19,11 @@
package org.apache.manifoldcf.crawler.connectors.webcrawler;
import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
import java.util.*;
-/** This class interprets the tag stream generated by the BasicParseState class, and causes script sections to be skipped */
-public class ScriptParseState extends BasicParseState
+/** This class interprets the tag stream generated by the HTMLParseState class, and causes script sections to be skipped */
+public class ScriptParseState extends HTMLParseState
{
// Script tag parsing states
protected static final int SCRIPTPARSESTATE_NORMAL = 0;
@@ -38,17 +39,19 @@ public class ScriptParseState extends Ba
// Override methods having to do with notification of tag discovery
@Override
- protected void noteTag(String tagName, Map attributes)
+ protected boolean noteTag(String tagName, Map<String,String> attributes)
throws ManifoldCFException
{
- super.noteTag(tagName,attributes);
+ if (super.noteTag(tagName,attributes))
+ return true;
switch (scriptParseState)
{
case SCRIPTPARSESTATE_NORMAL:
if (tagName.equals("script"))
scriptParseState = SCRIPTPARSESTATE_INSCRIPT;
else
- noteNonscriptTag(tagName,attributes);
+ if (noteNonscriptTag(tagName,attributes))
+ return true;
break;
case SCRIPTPARSESTATE_INSCRIPT:
// Skip all tags until we see the end script one.
@@ -56,17 +59,20 @@ public class ScriptParseState extends Ba
default:
throw new ManifoldCFException("Unknown script parse state: "+Integer.toString(scriptParseState));
}
+ return false;
}
@Override
- protected void noteEndTag(String tagName)
+ protected boolean noteTagEnd(String tagName)
throws ManifoldCFException
{
- super.noteEndTag(tagName);
+ if (super.noteTagEnd(tagName))
+ return true;
switch (scriptParseState)
{
case SCRIPTPARSESTATE_NORMAL:
- noteNonscriptEndTag(tagName);
+ if (noteNonscriptEndTag(tagName))
+ return true;
break;
case SCRIPTPARSESTATE_INSCRIPT:
// Skip all tags until we see the end script one.
@@ -76,16 +82,19 @@ public class ScriptParseState extends Ba
default:
break;
}
+ return false;
}
- protected void noteNonscriptTag(String tagName, Map attributes)
+ protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
throws ManifoldCFException
{
+ return false;
}
- protected void noteNonscriptEndTag(String tagName)
+ protected boolean noteNonscriptEndTag(String tagName)
throws ManifoldCFException
{
+ return false;
}
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Sun Feb 10 22:55:30 2013
@@ -24,13 +24,7 @@ import org.apache.manifoldcf.crawler.int
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
-import org.xml.sax.Attributes;
-
-import org.apache.manifoldcf.core.common.XMLDoc;
-import org.apache.manifoldcf.agents.common.XMLStream;
-import org.apache.manifoldcf.agents.common.XMLContext;
-import org.apache.manifoldcf.agents.common.XMLStringContext;
-import org.apache.manifoldcf.agents.common.XMLFileContext;
+import org.apache.manifoldcf.core.fuzzyml.*;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.client.RedirectException;
@@ -6023,8 +6017,6 @@ public class WebcrawlerConnector extends
// from it presuming it is an RSS feed.
String encoding = extractEncoding(contentType);
- if (encoding == null)
- encoding = "utf-8";
InputStream is = cache.getData(documentURI);
if (is == null)
@@ -6034,13 +6026,14 @@ public class WebcrawlerConnector extends
}
try
{
- // Parse the document. This will cause various things to occur, within the instantiated XMLContext class.
- XMLStream x = new XMLStream();
+ // Parse the document. This will cause various things to occur, within the instantiated XMLParsingContext class.
+ Parser p = new Parser();
+ XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState();
OuterContextClass c = new OuterContextClass(x,documentURI,handler);
x.setContext(c);
try
{
- x.parse(is);
+ p.parseWithCharsetDetection(encoding,is,x);
c.checkIfValidFeed();
}
finally
@@ -6048,17 +6041,6 @@ public class WebcrawlerConnector extends
x.cleanup();
}
}
- catch (ManifoldCFException e)
- {
- // Ignore XML parsing errors. These should probably have their own error code, but that requires a core change.
- if (e.getMessage().indexOf("pars") >= 0)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: XML document '"+documentURI+"' was unparseable ("+e.getMessage()+"), skipping");
- return;
- }
- throw e;
- }
finally
{
is.close();
@@ -6085,7 +6067,7 @@ public class WebcrawlerConnector extends
}
/** This class handles the outermost XML context for the feed document. */
- protected class OuterContextClass extends XMLContext
+ protected class OuterContextClass extends XMLParsingContext
{
/** Keep track of the number of valid feed signals we saw */
protected int outerTagCount = 0;
@@ -6094,7 +6076,7 @@ public class WebcrawlerConnector extends
/** The link handler */
protected IXMLHandler handler;
- public OuterContextClass(XMLStream theStream, String documentURI, IXMLHandler handler)
+ public OuterContextClass(XMLFuzzyHierarchicalParseState theStream, String documentURI, IXMLHandler handler)
{
super(theStream);
this.documentURI = documentURI;
@@ -6112,8 +6094,9 @@ public class WebcrawlerConnector extends
}
/** Handle the tag beginning to set the correct second-level parsing context */
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
if (localName.equals("rss"))
{
@@ -6121,36 +6104,37 @@ public class WebcrawlerConnector extends
outerTagCount++;
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("WEB: Parsed bottom-level XML for RSS document '"+documentURI+"'");
- return new RSSContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+ return new RSSContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
}
else if (localName.equals("RDF"))
{
// RDF/Atom feed detected
outerTagCount++;
- return new RDFContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+ return new RDFContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
}
else if (localName.equals("feed"))
{
// Basic feed detected
outerTagCount++;
- return new FeedContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+ return new FeedContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
}
else if (localName.equals("urlset") || localName.equals("sitemapindex"))
{
// Sitemap detected
outerTagCount++;
- return new UrlsetContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+ return new UrlsetContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
}
// The default action is to establish a new default context.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
/** Handle the tag ending */
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext context = theStream.getContext();
+ XMLParsingContext context = theStream.getContext();
String tagName = context.getLocalname();
if (tagName.equals("RDF"))
{
@@ -6170,39 +6154,41 @@ public class WebcrawlerConnector extends
}
- protected class RSSContextClass extends XMLContext
+ protected class RSSContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentURI;
/** Link notification interface */
protected IXMLHandler handler;
- public RSSContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+ public RSSContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentURI = documentURI;
this.handler = handler;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// Handle each channel
if (localName.equals("channel"))
{
// Channel detected
- return new RSSChannelContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+ return new RSSChannelContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
// If it's our channel tag, process global channel information
- XMLContext context = theStream.getContext();
+ XMLParsingContext context = theStream.getContext();
String tagName = context.getLocalname();
if (tagName.equals("channel"))
{
@@ -6213,7 +6199,7 @@ public class WebcrawlerConnector extends
}
}
- protected class RSSChannelContextClass extends XMLContext
+ protected class RSSChannelContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentURI;
@@ -6223,39 +6209,41 @@ public class WebcrawlerConnector extends
/** TTL value is set on a per-channel basis */
protected String ttlValue = null;
- public RSSChannelContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+ public RSSChannelContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentURI = documentURI;
this.handler = handler;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "ttl" and "item", nothing else.
if (localName.equals("ttl"))
{
// TTL value seen. Prepare to record it, as a string.
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("item"))
{
// Item seen. We don't need any of the attributes etc., but we need to start a new context.
- return new RSSItemContextClass(theStream,namespaceURI,localName,qName,atts);
+ return new RSSItemContextClass(theStream,namespace,localName,qName,atts);
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("ttl"))
// The current context must be the TTL one; record its data value.
- ttlValue = ((XMLStringContext)theContext).getValue();
+ ttlValue = ((XMLStringParsingContext)theContext).getValue();
else if (theTag.equals("item"))
{
// It's an item.
@@ -6287,50 +6275,52 @@ public class WebcrawlerConnector extends
}
}
- protected class RSSItemContextClass extends XMLContext
+ protected class RSSItemContextClass extends XMLParsingContext
{
protected String guidField = null;
protected String linkField = null;
- public RSSItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ public RSSItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "link" and "guid", nothing else.
if (localName.equals("link"))
{
// "link" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("guid"))
{
// "guid" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else
{
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
}
/** Convert the individual sub-fields of the item context into their final forms */
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("link"))
{
- linkField = ((XMLStringContext)theContext).getValue();
+ linkField = ((XMLStringParsingContext)theContext).getValue();
}
else if (theTag.equals("guid"))
{
- guidField = ((XMLStringContext)theContext).getValue();
+ guidField = ((XMLStringParsingContext)theContext).getValue();
}
else
{
@@ -6359,7 +6349,7 @@ public class WebcrawlerConnector extends
}
}
- protected class RDFContextClass extends XMLContext
+ protected class RDFContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentURI;
@@ -6369,39 +6359,41 @@ public class WebcrawlerConnector extends
/** ttl value */
protected String ttlValue = null;
- public RDFContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+ public RDFContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentURI = documentURI;
this.handler = handler;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "ttl" and "item", nothing else.
if (localName.equals("ttl"))
{
// TTL value seen. Prepare to record it, as a string.
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("item"))
{
// Item seen. We don't need any of the attributes etc., but we need to start a new context.
- return new RDFItemContextClass(theStream,namespaceURI,localName,qName,atts);
+ return new RDFItemContextClass(theStream,namespace,localName,qName,atts);
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("ttl"))
// The current context is the TTL one, so record its data value.
- ttlValue = ((XMLStringContext)theContext).getValue();
+ ttlValue = ((XMLStringParsingContext)theContext).getValue();
else if (theTag.equals("item"))
{
// It's an item.
@@ -6429,40 +6421,42 @@ public class WebcrawlerConnector extends
}
}
- protected class RDFItemContextClass extends XMLContext
+ protected class RDFItemContextClass extends XMLParsingContext
{
protected String linkField = null;
- public RDFItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ public RDFItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The only tag we care about is "link"; everything else is skipped.
if (localName.equals("link"))
{
// "link" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else
{
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
}
/** Convert the individual sub-fields of the item context into their final forms */
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("link"))
{
- linkField = ((XMLStringContext)theContext).getValue();
+ linkField = ((XMLStringParsingContext)theContext).getValue();
}
else
{
@@ -6488,7 +6482,7 @@ public class WebcrawlerConnector extends
}
}
- protected class FeedContextClass extends XMLContext
+ protected class FeedContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentURI;
@@ -6498,39 +6492,41 @@ public class WebcrawlerConnector extends
/** ttl value */
protected String ttlValue = null;
- public FeedContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+ public FeedContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentURI = documentURI;
this.handler = handler;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "ttl" and "entry", nothing else.
if (localName.equals("ttl"))
{
// TTL value seen. Prepare to record it, as a string.
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else if (localName.equals("entry"))
{
// Item seen. We don't need any of the attributes etc., but we need to start a new context.
- return new FeedItemContextClass(theStream,namespaceURI,localName,qName,atts);
+ return new FeedItemContextClass(theStream,namespace,localName,qName,atts);
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("ttl"))
// The current context is the TTL one, so record its data value.
- ttlValue = ((XMLStringContext)theContext).getValue();
+ ttlValue = ((XMLStringParsingContext)theContext).getValue();
else if (theTag.equals("entry"))
{
// It's an item.
@@ -6559,31 +6555,32 @@ public class WebcrawlerConnector extends
}
}
- protected class FeedItemContextClass extends XMLContext
+ protected class FeedItemContextClass extends XMLParsingContext
{
protected List<String> linkField = new ArrayList<String>();
- public FeedItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ public FeedItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The only tag we care about is "link" (for its "href" attribute); everything else is skipped.
if (localName.equals("link"))
{
// "link" tag
- String ref = atts.getValue("href");
+ String ref = atts.get("href");
if (ref != null && ref.length() > 0)
linkField.add(ref);
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
else
{
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
}
@@ -6608,7 +6605,7 @@ public class WebcrawlerConnector extends
}
}
- protected class UrlsetContextClass extends XMLContext
+ protected class UrlsetContextClass extends XMLParsingContext
{
/** The document identifier */
protected String documentURI;
@@ -6618,30 +6615,32 @@ public class WebcrawlerConnector extends
/** ttl value */
protected String ttlValue = null;
- public UrlsetContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+ public UrlsetContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
this.documentURI = documentURI;
this.handler = handler;
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "url" and "sitemap", nothing else.
if (localName.equals("url") || localName.equals("sitemap"))
{
// Item seen. We don't need any of the attributes etc., but we need to start a new context.
- return new UrlsetItemContextClass(theStream,namespaceURI,localName,qName,atts);
+ return new UrlsetItemContextClass(theStream,namespace,localName,qName,atts);
}
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("url") || theTag.equals("sitemap"))
{
@@ -6674,40 +6673,42 @@ public class WebcrawlerConnector extends
}
}
- protected class UrlsetItemContextClass extends XMLContext
+ protected class UrlsetItemContextClass extends XMLParsingContext
{
protected String linkField = null;
- public UrlsetItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ public UrlsetItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
{
- super(theStream,namespaceURI,localName,qName,atts);
+ super(theStream,namespace,localName,qName,atts);
}
- protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
- throws ManifoldCFException, ServiceInterruption
+ @Override
+ protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+ throws ManifoldCFException
{
// The tags we care about are "loc", nothing else.
if (localName.equals("loc"))
{
// "loc" tag
- return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
}
else
{
// Skip everything else.
- return super.beginTag(namespaceURI,localName,qName,atts);
+ return super.beginTag(namespace,localName,qName,atts);
}
}
/** Convert the individual sub-fields of the item context into their final forms */
+ @Override
protected void endTag()
- throws ManifoldCFException, ServiceInterruption
+ throws ManifoldCFException
{
- XMLContext theContext = theStream.getContext();
+ XMLParsingContext theContext = theStream.getContext();
String theTag = theContext.getLocalname();
if (theTag.equals("loc"))
{
- linkField = ((XMLStringContext)theContext).getValue();
+ linkField = ((XMLStringParsingContext)theContext).getValue();
}
else
{
@@ -6768,33 +6769,14 @@ public class WebcrawlerConnector extends
return;
}
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Document '"+documentURI+"' is text, with encoding '"+encoding+"'; link extraction starting");
-
try
{
- // Create a reader for the described encoding, if that's possible
- Reader r = new InputStreamReader(is,encoding);
- try
- {
- // We read characters at a time, understanding the basic form of html.
- // This code represents a basic bottom-up parser, which is the best thing since we really don't want to code up all the context we'd need
- // to do a top-down parse. So, there is a parse state, and the code walks through the document recognizing symbols and modifying the state.
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Document '"+documentURI+"' is text, with encoding '"+encoding+"'; link extraction starting");
- FormParseState currentParseState = new FormParseState(handler);
- while (true)
- {
- int x = r.read();
- if (x == -1)
- break;
- currentParseState.dealWithCharacter((char)x);
- }
- currentParseState.finishUp();
- }
- finally
- {
- r.close();
- }
+ // Instantiate the parser, and call the right method
+ Parser p = new Parser();
+ p.parseWithoutCharsetDetection(encoding,is,new FormParseState(handler));
}
catch (UnsupportedEncodingException e)
{
Propchange: manifoldcf/trunk/connectors/wiki/
------------------------------------------------------------------------------
Merged /manifoldcf/branches/CONNECTORS-633/connectors/wiki:r1442813-1444626