You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/04/30 09:21:40 UTC
svn commit: r1676910 - in /manifoldcf/trunk: CHANGES.txt
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
Author: kwright
Date: Thu Apr 30 07:21:39 2015
New Revision: 1676910
URL: http://svn.apache.org/r1676910
Log:
Fix for CONNECTORS-1192.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1676910&r1=1676909&r2=1676910&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Thu Apr 30 07:21:39 2015
@@ -3,6 +3,11 @@ $Id$
======================= 2.2-dev =====================
+CONNECTORS-1192: Fix a problem with login-page detection based on
+content. The last line was being skipped, and an unbounded amount of
+content could be buffered in memory. Adopted a compromise that scans
+the content in windows of up to 64K characters, overlapping by 16K.
+(Karl Wright)
======================= Release 2.1 =====================
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1676910&r1=1676909&r2=1676910&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Thu Apr 30 07:21:39 2015
@@ -29,6 +29,9 @@ public class FindContentHandler extends
protected final Pattern contentPattern;
protected final StringBuilder contentBuffer = new StringBuilder();
+ protected final static int MAX_LENGTH = 65536;
+ protected final static int OVERLAP_AMOUNT = 16384;
+
public FindContentHandler(String parentURI, Pattern contentPattern)
{
super(parentURI);
@@ -57,7 +60,23 @@ public class FindContentHandler extends
return;
// Build characters up into lines, and apply the regexp against them
if (textCharacter == '\t' || textCharacter >= ' ')
+ {
contentBuffer.append(textCharacter);
+ // If too big, do the search and clear out the buffer, retaining some of it for overlap purposes
+ if (contentBuffer.length() >= MAX_LENGTH)
+ {
+ // Process what we have, and keep around what we need for
+ // continuity
+ String bufferContents = contentBuffer.toString();
+ contentBuffer.setLength(0);
+ if (contentPattern.matcher(bufferContents).find())
+ targetURI = "";
+ else
+ {
+ contentBuffer.append(bufferContents.substring(bufferContents.length() - OVERLAP_AMOUNT));
+ }
+ }
+ }
else
{
processBuffer();