You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2021/11/19 22:16:11 UTC
svn commit: r1895187 - in /manifoldcf/trunk: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Author: kwright
Date: Fri Nov 19 22:16:11 2021
New Revision: 1895187
URL: http://svn.apache.org/viewvc?rev=1895187&view=rev
Log:
CONNECTORS-1680: Add support for BASE HREF
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Nov 19 22:16:11 2021
@@ -3,6 +3,9 @@ $Id$
======================= 2.21-dev =====================
+CONNECTORS-1680: Add support for BASE HREF in web connector.
+(Karl Wright, Marcus Schuch)
+
CONNECTORS-1675: Cut-and-paste error corrected so that mapping connection
deletion via API now works.
(Julien Massiera)
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Fri Nov 19 22:16:11 2021
@@ -130,6 +130,13 @@ public class FindContentHandler extends
{
}
+ /** Note discovered base href.
+ * This handler takes no action on the value; the method body is empty.
+ *@param rawURL is the raw base href value, which is ignored here.
+ */
+ @Override
+ public void noteBASEHREF(String rawURL)
+ throws ManifoldCFException
+ {
+ }
+
/** Note discovered href */
@Override
public void noteLINKHREF(String rawURL)
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java Fri Nov 19 22:16:11 2021
@@ -160,6 +160,13 @@ public class FindHTMLFormHandler extends
}
}
+ /** Note discovered base href.
+ * This handler takes no action on the value; the method body is empty.
+ *@param rawURL is the raw base href value, which is ignored here.
+ */
+ @Override
+ public void noteBASEHREF(String rawURL)
+ throws ManifoldCFException
+ {
+ }
+
/** Note discovered href */
@Override
public void noteAHREF(String rawURL)
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java Fri Nov 19 22:16:11 2021
@@ -82,6 +82,13 @@ public class FindHTMLHrefHandler extends
{
}
+ /** Override noteDiscoveredBase.
+ * Passes the raw base URL to the superclass implementation unchanged.
+ *@param rawURL is the new base HREF, in raw form.
+ */
+ @Override
+ public void noteDiscoveredBase(String rawURL)
+ throws ManifoldCFException
+ {
+ super.noteDiscoveredBase(rawURL);
+ }
+
/** Override noteDiscoveredLink */
@Override
public void noteDiscoveredLink(String rawURL)
@@ -113,6 +120,14 @@ public class FindHTMLHrefHandler extends
}
}
+ /** Note discovered base.
+ * Forwards the raw value to noteDiscoveredBase() without modification.
+ *@param rawURL is the raw base href value.
+ */
+ @Override
+ public void noteBASEHREF(String rawURL)
+ throws ManifoldCFException
+ {
+ noteDiscoveredBase(rawURL);
+ }
+
/** Note discovered href */
@Override
public void noteAHREF(String rawURL)
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java Fri Nov 19 22:16:11 2021
@@ -31,6 +31,13 @@ public class FindHandler implements IDis
this.parentURI = parentURI;
}
+ /** Inform the world of a new base HREF.
+ * Currently a no-op in this class: the body contains only a placeholder comment.
+ *@param rawURL is the new base HREF, in raw form. It is not used here.
+ */
+ @Override
+ public void noteDiscoveredBase(String rawURL)
+ throws ManifoldCFException
+ {
+ // MHL
+ // NOTE(review): "MHL" presumably marks unfinished work. As written, a discovered
+ // base has no effect in this handler (parentURI is left untouched) -- confirm this is intended.
+ }
+
/** Inform the world of a discovered link.
*@param rawURL is the raw discovered url. This may be relative, malformed, or otherwise unsuitable for use until final form is acheived.
*/
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java Fri Nov 19 22:16:11 2021
@@ -44,7 +44,14 @@ public class FindPreferredRedirectionHan
super.noteDiscoveredLink(lp.getOverrideTargetURL());
}
}
-
+
+ /** Override noteDiscoveredBase.
+ * Passes the raw base URL to the superclass implementation unchanged.
+ *@param rawURL is the new base HREF, in raw form.
+ */
+ @Override
+ public void noteDiscoveredBase(String rawURL)
+ throws ManifoldCFException
+ {
+ super.noteDiscoveredBase(rawURL);
+ }
+
/** Override noteDiscoveredLink */
@Override
public void noteDiscoveredLink(String rawURL)
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java Fri Nov 19 22:16:11 2021
@@ -24,6 +24,12 @@ import org.apache.manifoldcf.core.interf
*/
public interface IDiscoveredLinkHandler
{
+ /** Inform the world of a new base HREF.
+ *@param rawURL is the new base HREF, in raw form. This may be relative, malformed, etc.
+ */
+ public void noteDiscoveredBase(String rawURL)
+ throws ManifoldCFException;
+
/** Inform the world of a discovered link.
*@param rawURL is the raw discovered url. This may be relative, malformed, or otherwise unsuitable for use until final form is acheived.
*/
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java Fri Nov 19 22:16:11 2021
@@ -45,6 +45,10 @@ public interface IHTMLHandler extends ID
public void noteLINKHREF(String rawURL)
throws ManifoldCFException;
+ /** Note discovered BASE HREF.
+ *@param rawURL is the raw href attribute value of the base tag. This may be relative, malformed, etc.
+ */
+ public void noteBASEHREF(String rawURL)
+ throws ManifoldCFException;
+
/** Note discovered IMG SRC */
public void noteIMGSRC(String rawURL)
throws ManifoldCFException;
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java Fri Nov 19 22:16:11 2021
@@ -52,6 +52,12 @@ public class LinkParseState extends Meta
if (hrefValue != null && hrefValue.length() > 0)
handler.noteLINKHREF(hrefValue);
}
+ else if (tagName.equals("base"))
+ {
+ String hrefValue = (String)attributes.get("href");
+ if (hrefValue != null && hrefValue.length() > 0)
+ handler.noteBASEHREF(hrefValue);
+ }
else if (tagName.equals("img"))
{
String srcValue = (String)attributes.get("src");
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Nov 19 22:16:11 2021
@@ -3876,6 +3876,7 @@ public class WebcrawlerConnector extends
protected class ProcessActivityLinkHandler implements IDiscoveredLinkHandler
{
protected String documentIdentifier;
+ protected String baseDocumentIdentifier;
protected IProcessActivity activities;
protected DocumentURLFilter filter;
protected String contextDescription;
@@ -3885,12 +3886,22 @@ public class WebcrawlerConnector extends
public ProcessActivityLinkHandler(String documentIdentifier, IProcessActivity activities, DocumentURLFilter filter, String contextDescription, String linkType)
{
this.documentIdentifier = documentIdentifier;
+ // The base used for building link identifiers starts out as the document's own identifier;
+ // noteDiscoveredBase() may replace it later. There is no constructor parameter named
+ // baseDocumentIdentifier, so assigning the field from that name was a self-assignment
+ // that left it null.
+ this.baseDocumentIdentifier = documentIdentifier;
this.activities = activities;
this.filter = filter;
this.contextDescription = contextDescription;
this.linkType = linkType;
}
+ /** Inform the world of a new base HREF.
+ * The stored base identifier is replaced only when makeDocumentIdentifier() returns a non-null value.
+ *@param rawURL is the new base HREF, in raw form.
+ */
+ @Override
+ public void noteDiscoveredBase(String rawURL)
+ throws ManifoldCFException
+ {
+ final String candidateBase = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter);
+ if (candidateBase == null)
+ {
+ // Nothing usable came back, so the current base stays in effect
+ return;
+ }
+ baseDocumentIdentifier = candidateBase;
+ }
+
/** Inform the world of a discovered link.
*@param rawURL is the raw discovered url. This may be relative, malformed, or otherwise unsuitable for use until final form is acheived.
*/
@@ -3898,7 +3909,7 @@ public class WebcrawlerConnector extends
public void noteDiscoveredLink(String rawURL)
throws ManifoldCFException
{
- String newIdentifier = makeDocumentIdentifier(documentIdentifier,rawURL,filter);
+ String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter);
if (newIdentifier != null)
{
if (Logging.connectors.isDebugEnabled())
@@ -4023,6 +4034,15 @@ public class WebcrawlerConnector extends
{
}
+ /** Note discovered base.
+ * The value is forwarded to noteDiscoveredBase() only when allowFollow is set.
+ *@param rawURL is the raw base href value.
+ */
+ @Override
+ public void noteBASEHREF(String rawURL)
+ throws ManifoldCFException
+ {
+ if (!allowFollow)
+ {
+ // Following is not allowed, so the base is ignored
+ return;
+ }
+ noteDiscoveredBase(rawURL);
+ }
+
/** Note discovered href */
@Override
public void noteAHREF(String rawURL)