You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/04/30 02:40:56 UTC

svn commit: r1676882 - /manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/

Author: kwright
Date: Thu Apr 30 00:40:55 2015
New Revision: 1676882

URL: http://svn.apache.org/r1676882
Log:
Part of the fix for CONNECTORS-1192.  Introduce a finishUp() method on IHTMLHandler so that any text still left in the buffer when the end of the page is reached gets processed.

Modified:
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Thu Apr 30 00:40:55 2015
@@ -60,10 +60,7 @@ public class FindContentHandler extends
       contentBuffer.append(textCharacter);
     else
     {
-      String bufferContents = contentBuffer.toString();
-      contentBuffer.setLength(0);
-      if (contentPattern.matcher(bufferContents).find())
-        targetURI = "";
+      processBuffer();
     }
   }
 
@@ -123,5 +120,22 @@ public class FindContentHandler extends
   {
   }
 
+  /** Finish up all processing.  Called ONLY if we haven't already aborted.
+  */
+  @Override
+  public void finishUp()
+    throws ManifoldCFException
+  {
+    if (targetURI == null)
+      processBuffer();
+  }
+
+  protected void processBuffer()
+  {
+    String bufferContents = contentBuffer.toString();
+    contentBuffer.setLength(0);
+    if (contentPattern.matcher(bufferContents).find())
+      targetURI = "";
+  }
 
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java Thu Apr 30 00:40:55 2015
@@ -188,4 +188,10 @@ public class FindHTMLFormHandler extends
   {
   }
 
+  @Override
+  public void finishUp()
+    throws ManifoldCFException
+  {
+  }
+
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java Thu Apr 30 00:40:55 2015
@@ -144,4 +144,10 @@ public class FindHTMLHrefHandler extends
     noteDiscoveredLink(rawURL);
   }
 
+  @Override
+  public void finishUp()
+    throws ManifoldCFException
+  {
+  }
+
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java Thu Apr 30 00:40:55 2015
@@ -59,4 +59,9 @@ public interface IHTMLHandler extends ID
   public void noteTextCharacter(char textCharacter)
     throws ManifoldCFException;
 
+  /** Done with the document.
+  */
+  public void finishUp()
+    throws ManifoldCFException;
+
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java Thu Apr 30 00:40:55 2015
@@ -67,4 +67,12 @@ public class LinkParseState extends Meta
     return false;
   }
 
+  @Override
+  public void finishUp()
+    throws ManifoldCFException
+  {
+    handler.finishUp();
+    super.finishUp();
+  }
+  
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Thu Apr 30 00:40:55 2015
@@ -6157,6 +6157,12 @@ public class WebcrawlerConnector extends
         noteDiscoveredLink(rawURL);
     }
 
+    @Override
+    public void finishUp()
+      throws ManifoldCFException
+    {
+    }
+
   }
 
   /** Class that describes XML handling */