You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2021/11/19 22:16:11 UTC

svn commit: r1895187 - in /manifoldcf/trunk: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/

Author: kwright
Date: Fri Nov 19 22:16:11 2021
New Revision: 1895187

URL: http://svn.apache.org/viewvc?rev=1895187&view=rev
Log:
CONNECTORS-1680: Add support for BASE HREF

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Nov 19 22:16:11 2021
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 2.21-dev =====================
 
+CONNECTORS-1680: Add support for BASE HREF in web connector.
+(Karl Wright, Marcus Schuch)
+
 CONNECTORS-1675: Cut-and-paste error corrected so that mapping connection
 deletion via API now works.
 (Julien Massiera)

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Fri Nov 19 22:16:11 2021
@@ -130,6 +130,13 @@ public class FindContentHandler extends
   {
   }
 
+  /** Note discovered base href */
+  @Override
+  public void noteBASEHREF(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+  
   /** Note discovered href */
   @Override
   public void noteLINKHREF(String rawURL)

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java Fri Nov 19 22:16:11 2021
@@ -160,6 +160,13 @@ public class FindHTMLFormHandler extends
     }
   }
 
+  /** Note discovered base href */
+  @Override
+  public void noteBASEHREF(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+  
   /** Note discovered href */
   @Override
   public void noteAHREF(String rawURL)

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java Fri Nov 19 22:16:11 2021
@@ -82,6 +82,13 @@ public class FindHTMLHrefHandler extends
   {
   }
 
+  @Override
+  public void noteDiscoveredBase(String rawURL)
+    throws ManifoldCFException
+  {
+    super.noteDiscoveredBase(rawURL);
+  }
+  
   /** Override noteDiscoveredLink */
   @Override
   public void noteDiscoveredLink(String rawURL)
@@ -113,6 +120,14 @@ public class FindHTMLHrefHandler extends
     }
   }
 
+  /** Note discovered base */
+  @Override
+  public void noteBASEHREF(String rawURL)
+    throws ManifoldCFException
+  {
+    noteDiscoveredBase(rawURL);
+  }
+  
   /** Note discovered href */
   @Override
   public void noteAHREF(String rawURL)

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java Fri Nov 19 22:16:11 2021
@@ -31,6 +31,13 @@ public class FindHandler implements IDis
     this.parentURI = parentURI;
   }
 
+  @Override
+  public void noteDiscoveredBase(String rawURL)
+    throws ManifoldCFException
+  {
+    // MHL - placeholder: this implementation currently ignores the discovered base (rawURL is unused)
+  }
+  
   /** Inform the world of a discovered link.
   *@param rawURL is the raw discovered url.  This may be relative, malformed, or otherwise unsuitable for use until final form is acheived.
   */

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java Fri Nov 19 22:16:11 2021
@@ -44,7 +44,14 @@ public class FindPreferredRedirectionHan
         super.noteDiscoveredLink(lp.getOverrideTargetURL());
     }
   }
-    
+
+  @Override
+  public void noteDiscoveredBase(String rawURL)
+    throws ManifoldCFException
+  {
+    super.noteDiscoveredBase(rawURL);
+  }
+  
   /** Override noteDiscoveredLink */
   @Override
   public void noteDiscoveredLink(String rawURL)

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java Fri Nov 19 22:16:11 2021
@@ -24,6 +24,12 @@ import org.apache.manifoldcf.core.interf
 */
 public interface IDiscoveredLinkHandler
 {
+  /** Inform the world of a new base HREF.
+  *@param rawURL is the new base HREF, in raw form.  This may be relative, malformed, etc.
+  */
+  public void noteDiscoveredBase(String rawURL)
+    throws ManifoldCFException;
+
   /** Inform the world of a discovered link.
   *@param rawURL is the raw discovered url.  This may be relative, malformed, or otherwise unsuitable for use until final form is acheived.
   */

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java Fri Nov 19 22:16:11 2021
@@ -45,6 +45,10 @@ public interface IHTMLHandler extends ID
   public void noteLINKHREF(String rawURL)
     throws ManifoldCFException;
 
+  /** Note base href */
+  public void noteBASEHREF(String rawURL)
+    throws ManifoldCFException;
+    
   /** Note discovered IMG SRC */
   public void noteIMGSRC(String rawURL)
     throws ManifoldCFException;

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java Fri Nov 19 22:16:11 2021
@@ -52,6 +52,12 @@ public class LinkParseState extends Meta
       if (hrefValue != null && hrefValue.length() > 0)
         handler.noteLINKHREF(hrefValue);
     }
+    else if (tagName.equals("base"))
+    {
+      String hrefValue = (String)attributes.get("href");
+      if (hrefValue != null && hrefValue.length() > 0)
+        handler.noteBASEHREF(hrefValue);
+    }
     else if (tagName.equals("img"))
     {
       String srcValue = (String)attributes.get("src");

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1895187&r1=1895186&r2=1895187&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Nov 19 22:16:11 2021
@@ -3876,6 +3876,7 @@ public class WebcrawlerConnector extends
   protected class ProcessActivityLinkHandler implements IDiscoveredLinkHandler
   {
     protected String documentIdentifier;
+    protected String baseDocumentIdentifier;
     protected IProcessActivity activities;
     protected DocumentURLFilter filter;
     protected String contextDescription;
@@ -3885,12 +3886,22 @@ public class WebcrawlerConnector extends
     public ProcessActivityLinkHandler(String documentIdentifier, IProcessActivity activities, DocumentURLFilter filter, String contextDescription, String linkType)
     {
       this.documentIdentifier = documentIdentifier;
+      this.baseDocumentIdentifier = documentIdentifier; // base starts as the document's own identifier; noteDiscoveredBase may replace it
       this.activities = activities;
       this.filter = filter;
       this.contextDescription = contextDescription;
       this.linkType = linkType;
     }
 
+    @Override
+    public void noteDiscoveredBase(String rawURL)
+      throws ManifoldCFException
+    {
+      String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter);
+      if (newIdentifier != null)
+        baseDocumentIdentifier = newIdentifier;
+    }
+    
     /** Inform the world of a discovered link.
     *@param rawURL is the raw discovered url.  This may be relative, malformed, or otherwise unsuitable for use until final form is acheived.
     */
@@ -3898,7 +3909,7 @@ public class WebcrawlerConnector extends
     public void noteDiscoveredLink(String rawURL)
       throws ManifoldCFException
     {
-      String newIdentifier = makeDocumentIdentifier(documentIdentifier,rawURL,filter);
+      String newIdentifier = makeDocumentIdentifier(baseDocumentIdentifier,rawURL,filter);
       if (newIdentifier != null)
       {
         if (Logging.connectors.isDebugEnabled())
@@ -4023,6 +4034,15 @@ public class WebcrawlerConnector extends
     {
     }
 
+    /** Note discovered base */
+    @Override
+    public void noteBASEHREF(String rawURL)
+      throws ManifoldCFException
+    {
+      if (allowFollow)
+        noteDiscoveredBase(rawURL);
+    }
+    
     /** Note discovered href */
     @Override
     public void noteAHREF(String rawURL)