You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2015/04/30 02:40:56 UTC
svn commit: r1676882 -
/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Author: kwright
Date: Thu Apr 30 00:40:55 2015
New Revision: 1676882
URL: http://svn.apache.org/r1676882
Log:
Part of the fix for CONNECTORS-1192. Introduce a finishUp() method on IHTMLHandler so that handlers can process any text still held in their buffer when the end of the page is reached.
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java Thu Apr 30 00:40:55 2015
@@ -60,10 +60,7 @@ public class FindContentHandler extends
contentBuffer.append(textCharacter);
else
{
- String bufferContents = contentBuffer.toString();
- contentBuffer.setLength(0);
- if (contentPattern.matcher(bufferContents).find())
- targetURI = "";
+ processBuffer();
}
}
@@ -123,5 +120,22 @@ public class FindContentHandler extends
{
}
+ /** Finish up all processing. Called ONLY if we haven't already aborted.
+ */
+ @Override
+ public void finishUp()
+ throws ManifoldCFException
+ {
+ if (targetURI == null)
+ processBuffer();
+ }
+
+ protected void processBuffer()
+ {
+ String bufferContents = contentBuffer.toString();
+ contentBuffer.setLength(0);
+ if (contentPattern.matcher(bufferContents).find())
+ targetURI = "";
+ }
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java Thu Apr 30 00:40:55 2015
@@ -188,4 +188,10 @@ public class FindHTMLFormHandler extends
{
}
+ @Override
+ public void finishUp()
+ throws ManifoldCFException
+ {
+ }
+
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java Thu Apr 30 00:40:55 2015
@@ -144,4 +144,10 @@ public class FindHTMLHrefHandler extends
noteDiscoveredLink(rawURL);
}
+ @Override
+ public void finishUp()
+ throws ManifoldCFException
+ {
+ }
+
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java Thu Apr 30 00:40:55 2015
@@ -59,4 +59,9 @@ public interface IHTMLHandler extends ID
public void noteTextCharacter(char textCharacter)
throws ManifoldCFException;
+ /** Done with the document.
+ */
+ public void finishUp()
+ throws ManifoldCFException;
+
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java Thu Apr 30 00:40:55 2015
@@ -67,4 +67,12 @@ public class LinkParseState extends Meta
return false;
}
+ @Override
+ public void finishUp()
+ throws ManifoldCFException
+ {
+ handler.finishUp();
+ super.finishUp();
+ }
+
}
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1676882&r1=1676881&r2=1676882&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Thu Apr 30 00:40:55 2015
@@ -6157,6 +6157,12 @@ public class WebcrawlerConnector extends
noteDiscoveredLink(rawURL);
}
+ @Override
+ public void finishUp()
+ throws ManifoldCFException
+ {
+ }
+
}
/** Class that describes XML handling */