You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2019/09/24 16:29:23 UTC
svn commit: r1867468 - in /manifoldcf/trunk: ./
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/
Author: kwright
Date: Tue Sep 24 16:29:23 2019
New Revision: 1867468
URL: http://svn.apache.org/viewvc?rev=1867468&view=rev
Log:
Fix for CONNECTORS-1623.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1867468&r1=1867467&r2=1867468&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Sep 24 16:29:23 2019
@@ -6,6 +6,9 @@ $Id$
======================= Release 2.14 =====================
+CONNECTORS-1623: Harden HTML parser against stuff in SCRIPT tags that looks like tags.
+(Julien Massiera, Karl Wright)
+
CONNECTORS-1566: Develop csws connector to replace deprecated LiveLink connector.
(Markus Schuch, Karl Wright)
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java?rev=1867468&r1=1867467&r2=1867468&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java Tue Sep 24 16:29:23 2019
@@ -63,6 +63,11 @@ public class ScriptParseState extends HT
}
@Override
+ protected boolean acceptNewTag() {
+ return scriptParseState != SCRIPTPARSESTATE_INSCRIPT;
+ }
+
+ @Override
protected boolean noteTagEnd(String tagName)
throws ManifoldCFException
{
Modified: manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java?rev=1867468&r1=1867467&r2=1867468&view=diff
==============================================================================
--- manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java (original)
+++ manifoldcf/trunk/framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java Tue Sep 24 16:29:23 2019
@@ -285,7 +285,7 @@ public class TagParseState extends Singl
}
else if (bTagDepth == 0)
{
- if (isWhitespace(thisChar))
+ if (isWhitespace(thisChar) || !acceptNewTag())
{
// Not a tag.
currentState = TAGPARSESTATE_NORMAL;
@@ -937,6 +937,12 @@ public class TagParseState extends Singl
return false;
}
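+ /** Report whether a new tag may be started at tag depth 0; when this returns false the parser treats the candidate as not a tag and returns to the normal state. This default always returns true.
+ */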
+ protected boolean acceptNewTag() {
+ return true;
+ }
+
/** Allocate the buffer.
*/
protected StringBuilder newBuffer()