You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/04/30 09:21:40 UTC

svn commit: r1676910 - in /manifoldcf/trunk: CHANGES.txt connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java

Author: kwright
Date: Thu Apr 30 07:21:39 2015
New Revision: 1676910

URL: http://svn.apache.org/r1676910
Log:
Fix for CONNECTORS-1192.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1676910&r1=1676909&r2=1676910&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Thu Apr 30 07:21:39 2015
@@ -3,6 +3,11 @@ $Id$
 
 ======================= 2.2-dev =====================
 
+CONNECTORS-1192: Fix a problem with login-page detection based on
+content.  The last line was being skipped, and an unbounded amount of
+content could be buffered in memory.  Adopted a compromise that scans
+overlapping windows of the buffered content, with a 16K-character overlap.
+(Karl Wright)
 
 ======================= Release 2.1 =====================
 

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1676910&r1=1676909&r2=1676910&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Thu Apr 30 07:21:39 2015
@@ -29,6 +29,9 @@ public class FindContentHandler extends
   protected final Pattern contentPattern;
   protected final StringBuilder contentBuffer = new StringBuilder();
 
+  protected final static int MAX_LENGTH = 65536;
+  protected final static int OVERLAP_AMOUNT = 16384;
+  
   public FindContentHandler(String parentURI, Pattern contentPattern)
   {
     super(parentURI);
@@ -57,7 +60,23 @@ public class FindContentHandler extends
       return;
     // Build characters up into lines, and apply the regexp against them
     if (textCharacter == '\t' || textCharacter >= ' ')
+    {
       contentBuffer.append(textCharacter);
+      // If too big, do the search and clear out the buffer, retaining some of it for overlap purposes
+      if (contentBuffer.length() >= MAX_LENGTH)
+      {
+        // Process what we have, and keep around what we need for
+        // continuity
+        String bufferContents = contentBuffer.toString();
+        contentBuffer.setLength(0);
+        if (contentPattern.matcher(bufferContents).find())
+          targetURI = "";
+        else
+        {
+          contentBuffer.append(bufferContents.substring(bufferContents.length() - OVERLAP_AMOUNT));
+        }
+      }
+    }
     else
     {
       processBuffer();